In [1]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading https://files.pythonhosted.org/packages/86/9e/c53e1fc61aac5ee490a6ac5e21b1ac04e55a7c2aba647bb8411c9aadf24e/vaderSentiment-3.2.1-py2.py3-none-any.whl (125kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.2.1


### Import libraries and data

In [110]:
import math
import operator
import pickle
import re
import sys
import time
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import sklearn
# The notebook reaches these subpackages by attribute access (sklearn.svm.SVC, ...);
# importing them explicitly avoids relying on another library having loaded them first.
import sklearn.decomposition
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing
import sklearn.svm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Punctuation that is deleted outright: . ; : ! ' ? , " ( ) [ ]
# Raw strings keep the backslashes for the regex engine instead of relying on
# Python passing unknown string escapes through (which emits a warning).
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Spans replaced by a single space: a doubled HTML line break, a hyphen, or a slash.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    """Lower-case each review and normalise its punctuation.

    For every string in `reviews` the text is lower-cased, the characters matched by
    REPLACE_NO_SPACE are removed, and the spans matched by REPLACE_WITH_SPACE are
    replaced with one space. Trailing newlines are left untouched.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

def _read_lines(path, encoding=None):
    """Return all lines of the text file at `path` (newlines kept), closing the file afterwards."""
    with open(path, 'r', encoding=encoding) as handle:
        return list(handle)

# One review per line; each split is stored in separate positive / negative files.
train_pos = _read_lines('IMDb/train/imdb_train_pos.txt', encoding="UTF-8")
train_neg = _read_lines('IMDb/train/imdb_train_neg.txt', encoding="UTF-8")

test_pos = _read_lines('IMDb/test/imdb_test_pos.txt', encoding="UTF-8")
test_neg = _read_lines('IMDb/test/imdb_test_neg.txt', encoding="UTF-8")

dev_pos = _read_lines('IMDb/dev/imdb_dev_pos.txt', encoding="UTF-8")
dev_neg = _read_lines('IMDb/dev/imdb_dev_neg.txt', encoding="UTF-8")

# Sentiment word lists, one word per line. They are read with the platform default
# encoding, as before -- NOTE(review): confirm the files' encoding and pass it explicitly.
pos_words = [line.strip("\n") for line in _read_lines('positive.txt')]
neg_words = [line.strip("\n") for line in _read_lines('negative.txt')]
    

# Clean every split with the shared preprocessing routine.
train_pos, train_neg = preprocess_reviews(train_pos), preprocess_reviews(train_neg)
test_pos, test_neg = preprocess_reviews(test_pos), preprocess_reviews(test_neg)
dev_pos, dev_neg = preprocess_reviews(dev_pos), preprocess_reviews(dev_neg)

# Each set is a list of (review_text, label) pairs: all positives (label 1) first,
# followed by all negatives (label 0).
train_set = [(review, 1) for review in train_pos] + [(review, 0) for review in train_neg]
test_set = [(review, 1) for review in test_pos] + [(review, 0) for review in test_neg]
dev_set = [(review, 1) for review in dev_pos] + [(review, 0) for review in dev_neg]

### Global functions

In [40]:
def get_list_tokens(string):
    """Split `string` into lemmatised, lower-cased word tokens.

    The text is first split into sentences, each sentence into word tokens, and every
    token is passed through the module-level `lemmatizer` and then lower-cased.

    Returns:
        A flat list of tokens in reading order.
    """
    return [
        lemmatizer.lemmatize(token).lower()
        for sentence in nltk.tokenize.sent_tokenize(string)
        for token in nltk.tokenize.word_tokenize(sentence)
    ]

def get_lemmatizer():
    """Build and return a fresh NLTK WordNet lemmatizer instance."""
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    return wordnet_lemmatizer

def get_stopwords():
    """Return NLTK's English stopword list as a set.

    Further entries (for example punctuation marks such as "." or ",") can be added to
    the set here before returning it; none are added at present.
    """
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)
    
def get_vocabulary(training_set, num_features):
    """Return the `num_features` most frequent non-stopword tokens of the training set.

    Args:
        training_set: iterable of instances whose first element is the review text.
        num_features: maximum number of words to keep.

    Returns:
        List of words ordered by descending corpus frequency; words with equal
        frequency keep the order in which they were first seen.
    """
    frequencies = {}
    for instance in training_set:
        for token in get_list_tokens(instance[0]):
            if token not in stopwords:
                frequencies[token] = frequencies.get(token, 0) + 1
    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    ranked = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
    return [token for token, _ in ranked[:num_features]]

def get_vector_text(list_vocab, string):
    """Build a bag-of-words count vector for `string`.

    Args:
        list_vocab: ordered list of vocabulary words; position i of the result
            corresponds to list_vocab[i].
        string: text to vectorise (tokenised with get_list_tokens).

    Returns:
        1-D float array of length len(list_vocab) holding, for each vocabulary word,
        the number of times it occurs among the tokens of `string`.
    """
    # Count every token once instead of rescanning the token list per vocabulary word.
    token_counts = Counter(get_list_tokens(string))
    # Counter yields 0 for words that do not occur, matching the zero-initialised vector.
    return np.array([token_counts[word] for word in list_vocab], dtype=float)

def get_vector_text_pos_neg(list_vocab, string):
    """Build a count vector for `string` extended with sentiment-lexicon counts.

    Args:
        list_vocab: ordered list of vocabulary words.
        string: text to vectorise (tokenised with get_list_tokens).

    Returns:
        1-D float array of length len(list_vocab) + 2: the per-word token counts,
        followed by the result of pos_word_count and then neg_word_count for the tokens.
    """
    vocab_size = len(list_vocab)
    list_tokens_string = get_list_tokens(string)
    # Count every token once instead of rescanning the token list per vocabulary word.
    token_counts = Counter(list_tokens_string)

    vector_text = np.zeros(vocab_size + 2)
    for i, word in enumerate(list_vocab):
        vector_text[i] = token_counts[word]
    # Index from the vocabulary size rather than the loop variable, which is
    # undefined when the vocabulary is empty.
    vector_text[vocab_size] = pos_word_count(list_tokens_string)
    vector_text[vocab_size + 1] = neg_word_count(list_tokens_string)
    return vector_text

def get_vector_text_all(list_vocab, string):
    """Build a count vector for `string` extended with VADER polarity scores.

    Args:
        list_vocab: ordered list of vocabulary words.
        string: text to vectorise (tokenised with get_list_tokens).

    Returns:
        1-D float array of length len(list_vocab) + 4: the per-word token counts,
        followed by the 'neg', 'neu', 'pos' and 'compound' entries that the
        module-level `vader` analyser reports for the re-joined tokens.
    """
    vocab_size = len(list_vocab)
    list_tokens_string = get_list_tokens(string)
    # Count every token once instead of rescanning the token list per vocabulary word.
    token_counts = Counter(list_tokens_string)

    vector_text = np.zeros(vocab_size + 4)
    for i, word in enumerate(list_vocab):
        vector_text[i] = token_counts[word]

    p_scores = vader.polarity_scores(list_to_sentance(list_tokens_string))
    # Index from the vocabulary size rather than the loop variable, which is
    # undefined when the vocabulary is empty.
    for offset, key in enumerate(('neg', 'neu', 'pos', 'compound')):
        vector_text[vocab_size + offset] = p_scores[key]
    return vector_text

def list_to_sentance(list_string):
    """Concatenate the words of `list_string`, appending one space after every word.

    The result therefore ends with a trailing space when the list is non-empty, and is
    the empty string for an empty list.
    """
    return "".join(word + " " for word in list_string)


def pos_word_count(tokens, lexicon=None):
    """Count the tokens that appear in a positive-word lexicon.

    Args:
        tokens: iterable of (hashable) word tokens; repeated tokens are counted each time.
        lexicon: optional collection of positive words. Defaults to the module-level
            `pos_words` list.

    Returns:
        Number of tokens found in the lexicon.
    """
    if lexicon is None:
        lexicon = pos_words
    # Hash the lexicon once so each token lookup is O(1) instead of a list scan.
    known = lexicon if isinstance(lexicon, (set, frozenset)) else set(lexicon)
    return sum(1 for word in tokens if word in known)

def neg_word_count(tokens, lexicon=None):
    """Count the tokens that appear in a negative-word lexicon.

    Args:
        tokens: iterable of (hashable) word tokens; repeated tokens are counted each time.
        lexicon: optional collection of negative words. Defaults to the module-level
            `neg_words` list.

    Returns:
        Number of tokens found in the lexicon.
    """
    if lexicon is None:
        lexicon = neg_words
    # Hash the lexicon once so each token lookup is O(1) instead of a list scan.
    known = lexicon if isinstance(lexicon, (set, frozenset)) else set(lexicon)
    return sum(1 for word in tokens if word in known)

### Global variables

In [103]:
# Shared NLP resources read by the tokenising / vectorising helpers.
lemmatizer = get_lemmatizer()
stopwords = get_stopwords()
vader = SentimentIntensityAnalyzer()

# The 2000 most frequent non-stopword tokens of the training set.
vocabulary = get_vocabulary(train_set, 2000)

# Gold labels, in the same order as the (text, label) pairs of each set.
Y_train = [label for _, label in train_set]
Y_test = [label for _, label in test_set]
Y_dev = [label for _, label in dev_set]

### Vectorize train and test sets

In [104]:
# Vocabulary counts plus the four VADER scores for every review, paired with its label.
Xvec = [(get_vector_text_all(vocabulary, text), label) for text, label in train_set]
Xvec_test = [(get_vector_text_all(vocabulary, text), label) for text, label in test_set]
Xvec_dev = [(get_vector_text_all(vocabulary, text), label) for text, label in dev_set]

# TF-IDF vectorisation (at most 500 terms), fitted on the training reviews only.
# The rows follow the same positives-then-negatives order as the *_set lists.
tfidf_vec = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True, max_features=500)
tfX = tfidf_vec.fit_transform(train_pos + train_neg)
tfX_test = tfidf_vec.transform(test_pos + test_neg)
tfX_dev = tfidf_vec.transform(dev_pos + dev_neg)

# Dense copies of the sparse TF-IDF matrices so rows can be appended to the count vectors.
tfX_reshape = tfX.toarray()
tfX_test_reshape = tfX_test.toarray()
tfX_dev_reshape = tfX_dev.toarray()

# Shallow copies that are not read again in this cell; retained in case other cells
# rely on these names.
Xvec_all_std = Xvec.copy()
Xvec_all_test_std = Xvec_test.copy()
Xvec_all_dev_std = Xvec_dev.copy()

# Combine features row by row. The training entries stay (vector, label) tuples while
# the test / dev entries are bare vectors, as before.
Xvec_all = [(np.append(vec, tf_row), label) for (vec, label), tf_row in zip(Xvec, tfX_reshape)]
Xvec_all_test = [np.append(vec, tf_row) for (vec, _), tf_row in zip(Xvec_test, tfX_test_reshape)]
Xvec_all_dev = [np.append(vec, tf_row) for (vec, _), tf_row in zip(Xvec_dev, tfX_dev_reshape)]

# Standardise using statistics of the training data only.
scaler = sklearn.preprocessing.StandardScaler()
nx_all = [vec for vec, _ in Xvec_all]
std_x = scaler.fit_transform(nx_all)
std_x_test = scaler.transform(Xvec_all_test)
std_x_dev = scaler.transform(Xvec_all_dev)

# Project onto 20 principal components. PCA's default solver selection may use a
# randomized SVD, so the seed is fixed to make the projection reproducible across runs.
pca_transformer = sklearn.decomposition.PCA(n_components=20, random_state=0)
pca_x = pca_transformer.fit_transform(std_x)
pca_x_test = pca_transformer.transform(std_x_test)
pca_x_dev = pca_transformer.transform(std_x_dev)

### Train SVMs

In [105]:
# RBF-kernel SVM trained on the full standardised feature matrix; the wall-clock
# training time is recorded in t_std.
svm_clf = sklearn.svm.SVC(kernel='rbf', gamma='scale', C=0.8)
t_start_std = time.perf_counter()
svm_clf.fit(std_x, Y_train)
t_std = time.perf_counter() - t_start_std

# Same classifier settings trained on the PCA-reduced features; training time in t_pca.
svm_clf_pca = sklearn.svm.SVC(kernel='rbf', gamma='scale', C=0.8)
t_start_pca = time.perf_counter()
svm_clf_pca.fit(pca_x, Y_train)
t_pca = time.perf_counter() - t_start_pca

### Make predictions

In [106]:
# Predict the test set with the SVM trained on the standardised features and record
# the wall-clock prediction time in t_std_pred.
t = time.perf_counter()
preds = svm_clf.predict(std_x_test)
t_std_pred = time.perf_counter() - t

# Same for the SVM trained on the PCA-reduced features; prediction time in t_pca_pred.
t = time.perf_counter()
preds_pca = svm_clf_pca.predict(pca_x_test)
t_pca_pred = time.perf_counter() - t

### Show metrics

In [107]:
# Classification report for the model without PCA first, then the model with PCA,
# each followed by a blank line.
for predictions in (preds, preds_pca):
    print(sklearn.metrics.classification_report(Y_test, predictions))
    print()

# Training and prediction wall-clock times (seconds, from time.perf_counter).
print("Without PCA learn t=",t_std,"   predict t=",t_std_pred)
print("With PCA learn t=",t_pca,"   predict t=",t_pca_pred)

              precision    recall  f1-score   support

           0       0.90      0.85      0.87      2501
           1       0.86      0.91      0.88      2499

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000


              precision    recall  f1-score   support

           0       0.88      0.85      0.86      2501
           1       0.85      0.89      0.87      2499

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000


Without PCA learn t= 509.2410146000002    predict t= 99.98504760000105
With PCA learn t= 4.074947699999029    predict t= 0.7835511999983282


### Dev set parameter optimisation

- Number of PCA components (training/prediction time vs. accuracy)
- SVM regularisation parameter C
- Vocabulary size (number of bag-of-words features)

### PCA components

In [None]:
# Candidate numbers of principal components, with one result per candidate:
# dev-set accuracy, SVM training time and SVM prediction time.
comp_list = [5,10,20,50,100,500,1000]
acc_list_comp = []
t_learn_list_comp = []
t_pred_list_comp = []

# NOTE: the loop rebinds pca_transformer, pca_x, pca_x_dev, svm_clf_pca and preds_pca,
# so after this cell they hold the values of the last candidate, not of the earlier cells.
for n in comp_list:

    # PCA's default solver selection may use a randomized SVD; a fixed seed keeps the
    # sweep reproducible and the candidates comparable across runs.
    pca_transformer = sklearn.decomposition.PCA(n_components=n, random_state=0)
    pca_x = pca_transformer.fit_transform(std_x)
    pca_x_dev = pca_transformer.transform(std_x_dev)

    # Only the SVM fit is timed, not the PCA projection.
    svm_clf_pca = sklearn.svm.SVC(kernel='rbf', gamma='scale')
    t_start_pca = time.perf_counter()
    svm_clf_pca.fit(pca_x, Y_train)
    t_pca = time.perf_counter() - t_start_pca

    t = time.perf_counter()
    preds_pca = svm_clf_pca.predict(pca_x_dev)
    t_pca_pred = time.perf_counter() - t

    acc_list_comp.append(sklearn.metrics.accuracy_score(Y_dev, preds_pca))
    t_learn_list_comp.append(t_pca)
    t_pred_list_comp.append(t_pca_pred)

    # Progress marker: the candidate that has just finished.
    print(n)

### SVM C regularisation parameter 

In [None]:
# Candidate values of the SVM regularisation parameter C, with one dev-set accuracy and
# one training time recorded per candidate.
c_list = [2.0,1.5,1.0,0.8,0.6,0.4,0.2,0.1]
acc_list_c = []
t_learn_list_c = []

# The projection does not depend on C, so it is computed once. PCA's default solver
# selection may use a randomized SVD; the seed is fixed so the sweep is reproducible.
pca_transformer = sklearn.decomposition.PCA(n_components=20, random_state=0)
pca_x = pca_transformer.fit_transform(std_x)
pca_x_dev = pca_transformer.transform(std_x_dev)

for c in c_list:
    svm_clf_pca = sklearn.svm.SVC(kernel='rbf', gamma='scale', C=c)
    t_start_pca = time.perf_counter()
    svm_clf_pca.fit(pca_x, Y_train)
    t_pca = time.perf_counter() - t_start_pca

    # Prediction time is measured but not stored in any result list.
    t = time.perf_counter()
    preds_pca = svm_clf_pca.predict(pca_x_dev)
    t_pca_pred = time.perf_counter() - t

    acc_list_c.append(sklearn.metrics.accuracy_score(Y_dev, preds_pca))
    t_learn_list_c.append(t_pca)

    # Progress marker: the candidate that has just finished.
    print(c)

In [None]:
# Candidate vocabulary sizes, with one result per candidate: dev-set accuracy, feature
# construction time and SVM training time.
vocab_list = [100,200,500,1000,2000]
acc_list_voc = []
t_vectorize_list_voc = []
t_learn_list_voc = []

# NOTE: the loop rebinds vocabulary, Xvec, tfidf_vec, scaler, std_x, pca_x and related
# names, so after this cell they hold the values of the last candidate.
for vf in vocab_list:
    # t_vec covers everything up to and including the PCA projection:
    # vocabulary extraction, vectorisation, TF-IDF, scaling and PCA.
    t = time.perf_counter()
    vocabulary = get_vocabulary(train_set, vf)

    # Vocabulary counts plus the four VADER scores, paired with the label.
    Xvec = [(get_vector_text_all(vocabulary, text), label) for text, label in train_set]
    Xvec_dev = [(get_vector_text_all(vocabulary, text), label) for text, label in dev_set]

    # TF-IDF vectorisation (at most 500 terms), fitted on the training reviews only.
    tfidf_vec = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True, max_features=500)
    tfX = tfidf_vec.fit_transform(train_pos + train_neg)
    tfX_dev = tfidf_vec.transform(dev_pos + dev_neg)

    # Dense copies so rows can be appended to the count vectors.
    tfX_reshape = tfX.toarray()
    tfX_dev_reshape = tfX_dev.toarray()

    # Shallow copies that are not read again in this cell; retained in case other cells
    # rely on these names.
    Xvec_all_std = Xvec.copy()
    Xvec_all_dev_std = Xvec_dev.copy()

    # Training entries stay (vector, label) tuples; dev entries are bare vectors, as before.
    Xvec_all = [(np.append(vec, tf_row), label) for (vec, label), tf_row in zip(Xvec, tfX_reshape)]
    Xvec_all_dev = [np.append(vec, tf_row) for (vec, _), tf_row in zip(Xvec_dev, tfX_dev_reshape)]

    # Standardise using statistics of the training data only.
    scaler = sklearn.preprocessing.StandardScaler()
    nx_all = [vec for vec, _ in Xvec_all]
    std_x = scaler.fit_transform(nx_all)
    std_x_dev = scaler.transform(Xvec_all_dev)

    # PCA's default solver selection may use a randomized SVD; a fixed seed keeps the
    # sweep reproducible and the candidates comparable across runs.
    pca_transformer = sklearn.decomposition.PCA(n_components=20, random_state=0)
    pca_x = pca_transformer.fit_transform(std_x)
    pca_x_dev = pca_transformer.transform(std_x_dev)

    t_vec = time.perf_counter() - t

    svm_clf_pca = sklearn.svm.SVC(kernel='rbf', gamma='scale', C=0.8)
    t = time.perf_counter()
    svm_clf_pca.fit(pca_x, Y_train)
    t_pca = time.perf_counter() - t

    # Prediction time is measured but not stored in any result list.
    t = time.perf_counter()
    preds_pca = svm_clf_pca.predict(pca_x_dev)
    t_pca_pred = time.perf_counter() - t

    acc_list_voc.append(sklearn.metrics.accuracy_score(Y_dev, preds_pca))
    t_learn_list_voc.append(t_pca)
    t_vectorize_list_voc.append(t_vec)

    # Progress marker: the candidate that has just finished.
    print(vf)

### Save results

In [None]:
def write_file(l, str_file):
    """Write the items of `l` to the text file `str_file`, one per line.

    Each item is converted with str() and followed by a newline. An existing file is
    overwritten. The file is closed (and therefore flushed) before returning.

    Args:
        l: iterable of values to write.
        str_file: path of the output file; its directory must already exist.
    """
    with open(str_file, 'w') as f:
        f.writelines(str(i) + "\n" for i in l)
    
# Persist the swept parameter values, the dev-set accuracies and the timings, each as a
# plain text file with one value per line.
for values, path in (
    (comp_list, 'Dev_results/comp_list.txt'),
    (c_list, 'Dev_results/c_list.txt'),
    (vocab_list, 'Dev_results/vocab_list.txt'),
    (acc_list_comp, 'Dev_results/acc_list_comp.txt'),
    (acc_list_c, 'Dev_results/acc_list_c.txt'),
    (acc_list_voc, 'Dev_results/acc_list_voc.txt'),
    (t_learn_list_comp, 'Dev_results/t_learn_list_comp.txt'),
    (t_learn_list_c, 'Dev_results/t_learn_list_c.txt'),
    (t_learn_list_voc, 'Dev_results/t_learn_list_voc.txt'),
    (t_vectorize_list_voc, 'Dev_results/t_vectorize_list_voc.txt'),
):
    write_file(values, path)