## Sentiment analysis on the NoReC dataset, using support vector machine

In [1]:
import pandas as pd
import numpy as np
import csv
import re
import pickle
import time

# Run these if you cannot import the nltk libs.
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC

import matplotlib.pyplot as plt

  return f(*args, **kwds)
  return f(*args, **kwds)


## Load in data

In [2]:
articles = pd.read_csv('data/data.csv', encoding='utf-8')
# articles['content_split'] = articles.loc[:, 'content'].apply(lambda x: x.split())
articles['sentiment'] = [(1 if rating > 3 else 0) for rating in articles.loc[:, 'rating']]
articles['three_sentiment'] = [(1 if rating > 4 else 0 if rating < 3 else 2) for rating in articles.loc[:, 'rating']]


In [3]:
print(len(articles), 'records')
articles.head()

43614 records


Unnamed: 0,category,cons,language,pros,rating,source,source-category,source-tags,split,tags,content,sentiment,three_sentiment
0,screen,,nb,,6,p3,tv,[],train,['tv'],rom s topp inn tvdram akkurat andr sist sesong...,1,1
1,screen,,nb,,6,p3,tv,[],train,['tv'],twin peaks definitiv gold box edition gull twi...,1,1
2,screen,,nb,,6,p3,tv,[],train,['tv'],the wir sesong the wir gjør avheng god måt nes...,1,1
3,screen,,nb,,5,p3,tv,[],train,['tv'],mad sesong stil underhold sofistiker tvseri ma...,1,1
4,screen,,nb,,5,p3,film,[],train,['movie'],mad sesong tvunderholdning høyest kvalit først...,1,1


### Split into train & test
Using Support Vector Classifier + Count Vectorizer

In [4]:
articles.sentiment.value_counts() # 1 = positive, 0 = negative. As we can see, much larger amount of pos. reviews!

1    32910
0    10704
Name: sentiment, dtype: int64

In [10]:
cv = CountVectorizer()
review_content_tf = cv.fit_transform(articles.content)

In [11]:
review_content_tf.shape

(43614, 368212)

In [None]:
tf_idf_cv = TfidfVectorizer()
review_content_tf = tf_idf_cv.fit_transform(articles.content)

#### Gamma-function = "auto", kernel = "rbf"

In [12]:
trainX, testX, trainY, testY = train_test_split(review_content_tf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_rbf_auto = SVC(gamma = "auto")
svc_rbf_auto.fit(trainX, trainY)

t1 = time.time() 
y_pred = svc_rbf_auto.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 75.285%.
Training time: 1185.5434720516205 
Testing time: 358.3650348186493


#### Gamma-function = "scale", kernel = "rbf"

In [13]:
trainX, testX, trainY, testY = train_test_split(review_content_tf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_rbf_scale = SVC(gamma = "scale", kernel = "rbf")
svc_rbf_scale.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_rbf_scale.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 76.217%.
Training time: 1089.7594327926636 
Testing time: 371.46812915802


#### Gamma-function = "auto", kernel = "linear"

In [17]:
trainX, testX, trainY, testY = train_test_split(review_content_tf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_linear = SVC(gamma = "auto", kernel = "linear")
svc_linear.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_linear.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 81.735%.
Training time: 6771.545135259628 
Testing time: 240.38828945159912


### TF-IDF vectorizer with all kernels

In [5]:
tf_idf_cv = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)
review_content_tf_idf = tf_idf_cv.fit_transform(articles.content)

In [6]:
review_content_tf_idf.shape

(43614, 65793)

In [7]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_linear_tf_idf = SVC(gamma = "auto", kernel = "linear")
svc_linear_tf_idf.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_linear_tf_idf.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 84.723%.
Training time: 1721.595618724823 
Testing time: 285.8620102405548


In [13]:
confusion_matrix(testY, y_pred, normalize="true")

array([[0.54141475, 0.45858525],
       [0.04929938, 0.95070062]])

#### Gamma-function = "scale", kernel = "rbf"

In [21]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_rbf_scale = SVC(gamma = "scale", kernel = "rbf")
svc_rbf_scale.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_rbf_scale.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 75.231%.
Training time: 1217.4950003623962 
Testing time: 339.579998254776


#### Gamma-function = "auto", kernel = "poly"

In [22]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_poly_auto = SVC(gamma = "auto", kernel = "poly")
svc_poly_auto.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_poly_auto.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 75.438%.
Training time: 695.379992723465 
Testing time: 289.2979452610016


#### Gamma-function = "scale", kernel = "poly"

In [23]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_poly_scale = SVC(gamma = "scale", kernel = "poly")
svc_poly_scale.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_poly_scale.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 75.827%.
Training time: 699.3269975185394 
Testing time: 290.5860013961792


#### Gamma-function = "auto", kernel = "sigmoid"

In [24]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_sigmoid_auto = SVC(gamma = "auto", kernel = "sigmoid")
svc_sigmoid_auto.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_sigmoid_auto.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 75.529%.
Training time: 714.6609628200531 
Testing time: 296.1706907749176


#### Gamma-function = "scale", kernel = "sigmoid"

In [25]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.sentiment, test_size=0.3)

t0 = time.time()
svc_sigmoid_scale = SVC(gamma = "scale", kernel = "sigmoid")
svc_sigmoid_scale.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_sigmoid_scale.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 75.430%.
Training time: 1152.581962108612 
Testing time: 314.5415563583374


### Multiclass solution

In [26]:
tf_idf_cv = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)
review_content_tf_idf = tf_idf_cv.fit_transform(articles.content)

In [27]:
review_content_tf_idf.shape

(43614, 65792)

In [28]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.three_sentiment, test_size=0.3)

t0 = time.time()
svc_linear_multiclass = SVC(gamma = "auto", kernel = "linear", decision_function_shape='ovo')
svc_linear_multiclass.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_linear_multiclass.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 71.257%.
Training time: 2645.7904319763184 
Testing time: 472.92552042007446


In [29]:
trainX, testX, trainY, testY = train_test_split(review_content_tf_idf, articles.three_sentiment, test_size=0.3)

t0 = time.time()
svc_linear_multiclass = SVC(gamma = "auto", kernel = "linear", decision_function_shape='ovr')
svc_linear_multiclass.fit(trainX, trainY)

t1 = time.time()
y_pred = svc_linear_multiclass.predict(testX)
t2 = time.time()

print("Out of", len(testY), 'points, our accuracy were {:.03f}%.'.format(100*(1-(y_pred != testY).sum()/len(testY))))
print("Training time:", t1 - t0, "\nTesting time:", t2 - t1)

Out of 13085 points, our accuracy were 71.716%.
Training time: 2647.3154096603394 
Testing time: 477.9991874694824


### Download all the models

In [42]:
models = [svc_rbf_auto, svc_rbf_scale, svc_linear, svc_linear_tf_idf, svc_poly_auto, svc_poly_scale, svc_sigmoid_auto, svc_sigmoid_scale, svc_linear_multiclass]
model_names = ["svc_rbf_auto", "svc_rbf_scale", "svc_linear", "svc_linear_tf_idf", "svc_poly_auto_tf_idf", "svc_poly_scale_tf_idf", "svc_sigmoid_auto_tf_idf", "svc_sigmoid_scale_tf_idf", "svc_linear_multiclass_tf_idf"]

for idx, model in enumerate(models):
    pickle_file = open("data/models/" + model_names[idx], "wb")
    pickle.dump(model, pickle_file)
    pickle_file.close()

### Testing the model

In [35]:
word_stemmer = SnowballStemmer("norwegian", ignore_stopwords=True)
excludedStopWords = set(['ikkje', 'ikke', 'inkje'])
stopWords = set([word for word in set(stopwords.words('norwegian')) if word not in excludedStopWords])
FEATURES = tf_idf_cv.get_feature_names()

In [31]:
def predictReview(text, classifier):
    """
    Quickly classify test reviews using our model.
    Do some quick preprocessing, (copy pasted from preprocessing.ipynb....)
    """
    text = text.strip().lower()
    text = re.sub(r'[^a-zA-ZæøåÆØÅéäöÄÖ -!?]+', '', text) # Remove any symbols
    text = re.sub(r'\s\s+', ' ', text) # Remove consequent whitespace    
    text = [word_stemmer.stem(word) for word in word_tokenize(text) if word not in stopWords]
    
    word_count = {w:0 for w in FEATURES}
    for w in text:
        if w in word_count:
            word_count[w] += 1
    
    text_tf = np.array([[v for _,v in word_count.items()]])
    res = classifier.predict(text_tf)
    return res[0]

In [37]:
test_reviews = [
    'dårlig ikke elendig søppel!',
    'ikke gale!',
    'den trenger litt finpuss men ellers helt fin',
    'den falt ikke i min smak, håper på at sesong 3 blir bedre',
    'det kan ikke bli værre musikk en dette her',
]

In [38]:
[predictReview(review, svc) for review in test_reviews] 

[1, 1, 1, 1, 1]