In [0]:
# please skip this grid as the experiment was conducted on Google Colaboratory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# please skip this grid as the experiment was conducted on Google Colaboratory.
cd drive/My Drive/Colab Notebooks/AML-Coursework/IMDb

/content/drive/My Drive/Colab Notebooks/AML-Coursework/IMDb


#0. Import dependencies

In [0]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import operator
import random
import math
nltk.download('stopwords') # If needed
nltk.download('punkt') # If needed
nltk.download('wordnet') # If needed

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#1. Load data

In [0]:
pos_train = pd.read_csv('train/imdb_train_pos.txt', sep="\n", header=None)
neg_train = pd.read_csv('train/imdb_train_neg.txt', sep="\n", header=None)
pos_train = pos_train.iloc[:,0].as_matrix()
neg_train = neg_train.iloc[:,0].as_matrix()

pos_dev = pd.read_csv('dev/imdb_dev_pos.txt', sep="\n", header=None)
neg_dev = pd.read_csv('dev/imdb_dev_neg.txt', sep="\n", header=None)
pos_dev = pos_dev.iloc[:,0].as_matrix()
neg_dev = neg_dev.iloc[:,0].as_matrix()

pos_test = pd.read_csv('test/imdb_test_pos.txt', sep="\n", header=None)
neg_test = pd.read_csv('test/imdb_test_neg.txt', sep="\n", header=None)
pos_test = pos_test.iloc[:,0].as_matrix()
neg_test = neg_test.iloc[:,0].as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  if __name__ == '__main__':
  del sys.path[0]
  


In [0]:
print(pos_train)

['For fans of Chris Farley, this is probably his best film. David Spade plays the perfect cynical, sarcastic yin to Farley\'s "Baby Huey" yang. Farley achieves strokes of comic genius in his monologues, like the "Let\'s say you\'re driving along the road with your family..." bit, the "Jo-Jo the Idiot Circus Boy with a pretty new pet, (his possible sale)" speech, or the "Glue-sniffing Guarantee fairy" brake pad sale. The sappy moments in the film contrast sharply with Farley and Spade\'s shenanigans. Even after many viewings, it\'s still fun to see Farley pour everything he had into the role. "Richard, what\'s HAPPENING to me?!?!"'
 "Fantastic, Madonna at her finest, the film is funny and her acting is brilliant. It may have been made in the 80's but it has all the qualities of a modern Hollywood Block-buster. I love this film and i think its totally unique and will cheer up any droopy person within a matter of minutes. Fantastic."
 "From a perspective that it is possible to make movies

In [0]:
print(neg_train)

["A terrible deception: controversial film, winner of the Teddy in Berlin 2003, Mil nubes de paz turned out to be a fiasco. The actors are all reciting (well, they are not exactly actors); the film tried to be a high bet but ends up being a doubtful bet: it stays in the superficiality of two guys kissing and a guy whose lover is gone; it has no purpose: nothing to do with the homo-sexuality presented in other films (e.g. Before Night Falls (2000) by Julian Schnabel). Technically the only thing that works is the photography; otherwise, the camera is put in strange angles (to make it more `art-film') and the whole film runs in a black and white atmosphere. The film is so pretentious that bothers. I mean, it's good to be pretentious when you have talent to support it. Or maybe it is that it's so art-cinema that it's incomprehensible. The story flows slowly, slowly, slowly. To me, more form than essence. Superb edition? It was good. Superb direction? Don't think so: the film is weak. It wa

#2. Preprocessing and deriving the frequency-based feature (TF-IDF)

2.1 setup elements for tokenization

In [0]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(string):
  sentence_split=nltk.tokenize.sent_tokenize(string)
  list_tokens=[]
  for sentence in sentence_split:
    list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
  return list_tokens

2.2 setup stopwords and derive the vocabulary

In [0]:
# get the english stopwords list from nltk
stopwords=set(nltk.corpus.stopwords.words('english'))
# add more words to the stopword list
stopwords.add(".")
stopwords.add(",")
stopwords.add("--")
stopwords.add("``")
stopwords.add("/")
stopwords.add(">")
stopwords.add("<")
stopwords.add("br")
stopwords.add("'s")
stopwords.add(")")
stopwords.add("(")
stopwords.add("''")

In [0]:
# Now we create a frequency dictionary with all words in the dataset
# This can take a few minutes depending on your computer, since we are processing more than ten thousand sentences

dict_word_frequency = {}

for pos_review in pos_train:
  sentence_tokens = get_list_tokens(pos_review)
  for word in sentence_tokens:
    if word in stopwords: continue
    if word not in dict_word_frequency: dict_word_frequency[word]=1
    else: dict_word_frequency[word]+=1

for neg_review in neg_train:
  sentence_tokens = get_list_tokens(neg_review)
  for word in sentence_tokens:
    if word in stopwords: continue
    if word not in dict_word_frequency: dict_word_frequency[word]=1
    else: dict_word_frequency[word]+=1

In [0]:
# Now we create a sorted frequency list with the top 1000 words, using the function "sorted". Let's see the 15 most frequent words
sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:1000]
i=0
for word, frequency in sorted_list[:15]:
  i+=1
  print (str(i)+". "+word+" - "+str(frequency))

1. movie - 29647
2. wa - 29577
3. film - 26929
4. n't - 19639
5. one - 15987
6. ! - 14847
7. like - 11876
8. ha - 9893
9. ? - 9593
10. time - 8589
11. good - 8376
12. character - 8318
13. would - 7867
14. ... - 7722
15. even - 7321


In [0]:
# Finally, we create our vocabulary based on the sorted frequency list 
vocabulary=[]
for word,frequency in sorted_list:
  vocabulary.append(word)

2.3 transform raw text to vector with the derived vocabulary

In [0]:
def get_vector_text(list_vocab,string):
  vector_text=np.zeros(len(list_vocab))
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(word)
  return vector_text

In [0]:
X_train=[]
Y_train=[]
for pos_review in pos_train:
  vector_pos_review=get_vector_text(vocabulary,pos_review)
  X_train.append(vector_pos_review)
  Y_train.append(1)
for neg_review in neg_train:
  vector_neg_review=get_vector_text(vocabulary,neg_review)
  X_train.append(vector_neg_review)
  Y_train.append(0)

In [0]:
X_dev=[]
Y_dev=[]
for pos_review in pos_dev:
  vector_pos_review=get_vector_text(vocabulary,pos_review)
  X_dev.append(vector_pos_review)
  Y_dev.append(1)
for neg_review in neg_dev:
  vector_neg_review=get_vector_text(vocabulary,neg_review)
  X_dev.append(vector_neg_review)
  Y_dev.append(0)

In [0]:
X_test=[]
Y_test=[]
for pos_review in pos_test:
  vector_pos_review=get_vector_text(vocabulary,pos_review)
  X_test.append(vector_pos_review)
  Y_test.append(1)
for neg_review in neg_test:
  vector_neg_review=get_vector_text(vocabulary,neg_review)
  X_test.append(vector_neg_review)
  Y_test.append(0)

2.4 transform the raw frequence vectors to TF-IDF feature

In [0]:
TF_train = []
for f_vector in X_train:
  TF_train.append(f_vector/sum(f_vector))

X_train_temp = np.asarray(X_train)
IDF_vec = []
for i in range(X_train_temp.shape[1]):

  count_temp = 0
  for j in range(X_train_temp.shape[0]):
    if X_train_temp[j,i] != 0:
      count_temp += 1

  IDF_vec.append(math.log(X_train_temp.shape[0]/(count_temp+1)))

TF_IDF_train = []
for TF_vector in TF_train:
  TF_IDF_train.append(TF_vector * IDF_vec)

In [0]:
TF_dev = []
for f_vector in X_dev:
  TF_dev.append(f_vector/sum(f_vector))

TF_IDF_dev = []
for TF_vector in TF_dev:
  TF_IDF_dev.append(TF_vector * IDF_vec)

In [0]:
TF_test = []
for f_vector in X_test:
  TF_test.append(f_vector/sum(f_vector))

TF_IDF_test = []
for TF_vector in TF_test:
  TF_IDF_test.append(TF_vector * IDF_vec)

#3. Extraction of the 2nd feature - N-gram (N=2) 

3.1 learn 2-gram model from train set and extract 2-gram feature for the train set 

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

train_drop_stopwords = []
for pos_review in pos_train:
  sentence_tokens = get_list_tokens(pos_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  train_drop_stopwords.append(new_sentence) 

for neg_review in neg_train:
  sentence_tokens = get_list_tokens(neg_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  train_drop_stopwords.append(new_sentence)

twoGram = CountVectorizer(min_df=1, ngram_range=(2,2))
twoGram_train = twoGram.fit_transform(train_drop_stopwords)

3.2 extract 2-gram feature for the development set 

In [0]:
dev_drop_stopwords = []
for pos_review in pos_dev:
  sentence_tokens = get_list_tokens(pos_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  dev_drop_stopwords.append(new_sentence)

for neg_review in neg_dev:
  sentence_tokens = get_list_tokens(neg_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  dev_drop_stopwords.append(new_sentence)

twoGram_dev = twoGram.transform(dev_drop_stopwords)

3.3 extract 2-gram feature for the test set 

In [0]:
test_drop_stopwords = []
for pos_review in pos_test:
  sentence_tokens = get_list_tokens(pos_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  test_drop_stopwords.append(new_sentence)

for neg_review in neg_test:
  sentence_tokens = get_list_tokens(neg_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  test_drop_stopwords.append(new_sentence)

twoGram_test = twoGram.transform(test_drop_stopwords)

#4. Extraction of the 3nd feature - Word2vec 

4.1 learn Word2vec model from train set and extract Word2vec feature for the train set 

In [0]:
from gensim.models import Word2Vec

W2V_base = Word2Vec(train_drop_stopwords, min_count=5, size=500, workers=4)
W2V_train = []
for sentence in train_drop_stopwords:
  temp_vector = np.zeros(500)
  count = 0
  for word in sentence:
    try:
      temp_vector += W2V_base[word]
      count += 1
    except:
      pass
  W2V_train.append(temp_vector/count)

  # Remove the CWD from sys.path while we load stuff.


4.2 extract Word2vec feature for the development set

In [0]:
W2V_dev = []
for sentence in dev_drop_stopwords:
  temp_vector = np.zeros(500)
  count = 0
  for word in sentence:
    try:
      temp_vector += W2V_base[word]
      count += 1
    except:
      pass
  W2V_dev.append(temp_vector/count)

  import sys


4.3 extract Word2vec feature for the test set

In [0]:
W2V_test = []
for sentence in test_drop_stopwords:
  temp_vector = np.zeros(500)
  count = 0
  for word in sentence:
    try:
      temp_vector += W2V_base[word]
      count += 1
    except:
      pass
  W2V_test.append(temp_vector/count)

  import sys


#5. Feature selection

In [0]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

5.1 feature selection for TF-IDF feature

In [0]:
TF_IDF_train_fix = np.asarray(TF_IDF_train)
TF_IDF_dev_fix = np.asarray(TF_IDF_dev)
TF_IDF_test_fix = np.asarray(TF_IDF_test)

Y_train_fix = np.asarray(Y_train)
Y_dev_fix = np.asarray(Y_dev)
Y_test_fix = np.asarray(Y_test)

TF_IDF_select = SelectKBest(chi2, k=500).fit(TF_IDF_train_fix, Y_train_fix)
TF_IDF_train_selected = TF_IDF_select.transform(TF_IDF_train_fix)
TF_IDF_dev_selected = TF_IDF_select.transform(TF_IDF_dev_fix)
TF_IDF_test_selected = TF_IDF_select.transform(TF_IDF_test_fix)

print ("Size original training matrix: "+str(TF_IDF_train_fix.shape))
print ("Size new training matrix: "+str(TF_IDF_train_selected.shape))

Size original training matrix: (15000, 1000)
Size new training matrix: (15000, 500)


5.2 feature selection for 2-gram feature

In [0]:
twoGram_select = SelectKBest(chi2, k=1000).fit(twoGram_train, Y_train_fix)
twoGram_train_selected = twoGram_select.transform(twoGram_train)
twoGram_dev_selected = twoGram_select.transform(twoGram_dev)
twoGram_test_selected = twoGram_select.transform(twoGram_test)

print ("Size original training matrix: "+str(twoGram_train.shape))
print ("Size new training matrix: "+str(twoGram_train_selected.shape))

Size original training matrix: (15000, 325515)
Size new training matrix: (15000, 1000)


5.3 feature selection for Word2vec feature

In [0]:
W2V_train_fix = np.asarray(W2V_train)
W2V_dev_fix = np.asarray(W2V_dev)
W2V_test_fix = np.asarray(W2V_test)

W2V_select = SelectKBest(f_classif, k=300).fit(W2V_train_fix, Y_train_fix)
W2V_train_selected = W2V_select.transform(W2V_train_fix)
W2V_dev_selected = W2V_select.transform(W2V_dev_fix)
W2V_test_selected = W2V_select.transform(W2V_test_fix)

print ("Size original training matrix: "+str(W2V_train_fix.shape))
print ("Size new training matrix: "+str(W2V_train_selected.shape))

Size original training matrix: (15000, 500)
Size new training matrix: (15000, 300)


#6. Feature combination and further selection

In [0]:
twoGram_train_selected = np.asarray(twoGram_train_selected.todense())
twoGram_dev_selected = np.asarray(twoGram_dev_selected.todense())
twoGram_test_selected = np.asarray(twoGram_test_selected.todense())

In [0]:
COM_train = np.column_stack((TF_IDF_train_selected, twoGram_train_selected, W2V_train_selected))
COM_dev = np.column_stack((TF_IDF_dev_selected, twoGram_dev_selected, W2V_dev_selected))
COM_test = np.column_stack((TF_IDF_test_selected, twoGram_test_selected, W2V_test_selected))

COM_select = SelectKBest(f_classif, k=1000).fit(COM_train, Y_train_fix)
COM_train_selected = COM_select.transform(COM_train)
COM_dev_selected = COM_select.transform(COM_dev)
COM_test_selected = COM_select.transform(COM_test)

print ("Size original training matrix: "+str(COM_train.shape))
print ("Size new training matrix: "+str(COM_train_selected.shape))

Size original training matrix: (15000, 1800)
Size new training matrix: (15000, 1000)


#7. Train the classification model and evaluation on the development set

7.1 train a SVM model

In [0]:
svm_1st = sklearn.svm.SVC(kernel="linear",gamma='auto')
svm_1st.fit(COM_train_selected, Y_train_fix)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

7.2 predict on development set

In [0]:
Y_dev_pred = svm_1st.predict(COM_dev_selected)

7.3 evaluation with different indexes

In [0]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score

precision = precision_score(Y_dev_fix, Y_dev_pred, average='macro')
recall = recall_score(Y_dev_fix, Y_dev_pred, average='macro')
f1 = f1_score(Y_dev_fix, Y_dev_pred, average='macro')
accuracy = accuracy_score(Y_dev_fix, Y_dev_pred)

print(precision)
print(recall)
print(f1)
print(accuracy)

0.8379944543148083
0.837246122839008
0.8372778347071848
0.8374


#8. Modify the feature selection scheme and re-test on the development set

8.1 apply a new feature selection and combination scheme

In [0]:
COM_train_new = np.column_stack((TF_IDF_train, twoGram_train_selected, W2V_train))
COM_dev_new = np.column_stack((TF_IDF_dev, twoGram_dev_selected, W2V_dev))
COM_test_new = np.column_stack((TF_IDF_test, twoGram_test_selected, W2V_test))

COM_select_new = SelectKBest(f_classif, k=1000).fit(COM_train_new, Y_train_fix)
COM_train_selected_new = COM_select_new.transform(COM_train_new)
COM_dev_selected_new = COM_select_new.transform(COM_dev_new)
COM_test_selected_new = COM_select_new.transform(COM_test_new)

print ("Size original training matrix: "+str(COM_train_new.shape))
print ("Size new training matrix: "+str(COM_test_selected_new.shape))

Size original training matrix: (15000, 2500)
Size new training matrix: (5000, 1000)


8.2 train the 2nd model and evaluate it

In [0]:
svm_2nd = sklearn.svm.SVC(kernel="linear",gamma='auto')
svm_2nd.fit(COM_train_selected_new, Y_train_fix)

Y_dev_pred = svm_2nd.predict(COM_dev_selected_new)

precision = precision_score(Y_dev_fix, Y_dev_pred, average='macro')
recall = recall_score(Y_dev_fix, Y_dev_pred, average='macro')
f1 = f1_score(Y_dev_fix, Y_dev_pred, average='macro')
accuracy = accuracy_score(Y_dev_fix, Y_dev_pred)

print(precision)
print(recall)
print(f1)
print(accuracy)

0.8390555709534369
0.8382389743084282
0.8382695546920325
0.8384


#9. Evaluation on the test set and report the final results

9.1 evaluate the first model on test set

In [0]:
Y_test_pred = svm_1st.predict(COM_test_selected)

precision = precision_score(Y_test_fix, Y_test_pred, average='macro')
recall = recall_score(Y_test_fix, Y_test_pred, average='macro')
f1 = f1_score(Y_test_fix, Y_test_pred, average='macro')
accuracy = accuracy_score(Y_test_fix, Y_test_pred)

print(precision)
print(recall)
print(f1)
print(accuracy)

0.8367893692714989
0.8352137336341974
0.8350092707169487
0.8352


9.2 evaluate the second model on test set

In [0]:
Y_test_pred = svm_2nd.predict(COM_test_selected_new)

precision = precision_score(Y_test_fix, Y_test_pred, average='macro')
recall = recall_score(Y_test_fix, Y_test_pred, average='macro')
f1 = f1_score(Y_test_fix, Y_test_pred, average='macro')
accuracy = accuracy_score(Y_test_fix, Y_test_pred)

print(precision)
print(recall)
print(f1)
print(accuracy)

0.8359204737036885
0.8346125335380054
0.8344408977026923
0.8346
