# COMP 551: Applied Machine Learning
## Assignment 3: Sentiment Classification - Yelp & IMDB
### Author: Antonios Valkanas

In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
import string, random, os
import sklearn.naive_bayes
from sklearn import svm, metrics
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer

### Question 1
Most of the algorithms described in the class expects input as a vector. However, the reviews are natural language text of varying number of words. So the first step would be to convert this varying length movie review to a fixed length vector representation. We will consider two different ways of vectorizing the natural language text: binary bag-of-words representation and frequency bag-of-words representation (as explained in the end of the assignment). Convert both the datasets into both these representations and turn in the converted datasets. Instruction for dataset format is given in the end of the assignment (do not include the dataset in the printed report).

In [2]:
# Read data with pandas reading in from file with utf-8.
def pre_process (file_name):
    temp = []
    with open(file_name, encoding="UTF-8") as f:
        for line in f.readlines():
            full_line = line.split("\t")
            comment_score = (full_line[1])
            comment_text = str.lower(full_line[0]).translate(str.maketrans('', '', string.punctuation))
            temp.append([comment_text,comment_score])
    return np.asarray(temp)

yelp_train = pd.DataFrame(pre_process("Datasets/yelp-train.txt")).rename(columns={0: "review", 1: "rating"})
yelp_valid = pd.DataFrame(pre_process("Datasets/yelp-valid.txt")).rename(columns={0: "review", 1: "rating"})
yelp_test = pd.DataFrame(pre_process("Datasets/yelp-test.txt")).rename(columns={0: "review", 1: "rating"})

imdb_train = pd.DataFrame(pre_process("Datasets/imdb-train.txt")).rename(columns={0: "review", 1: "rating"})
imdb_valid = pd.DataFrame(pre_process("Datasets/imdb-valid.txt")).rename(columns={0: "review", 1: "rating"})
imdb_test = pd.DataFrame(pre_process("Datasets/imdb-test.txt")).rename(columns={0: "review", 1: "rating"})


count_vectorizer_yelp = CountVectorizer(max_features = 10000, binary = True) 
count_vectorizer_imdb = CountVectorizer(max_features = 10000, binary = True) 


# Vectorize datasets
vectorized_yelp_train = count_vectorizer_yelp.fit_transform(yelp_train['review'])
vectorized_yelp_test = count_vectorizer_yelp.transform(yelp_test['review'])
vectorized_yelp_valid = count_vectorizer_yelp.transform(yelp_valid['review'])
vectorized_imdb_train = count_vectorizer_imdb.fit_transform(imdb_train['review'])
vectorized_imdb_test = count_vectorizer_imdb.transform(imdb_test['review'])
vectorized_imdb_valid = count_vectorizer_imdb.transform(imdb_valid['review'])

In [3]:
yelp_vocabulary = count_vectorizer_yelp.get_feature_names()
# From https://stackoverflow.com/questions/27488446/how-do-i-get-word-frequency-in-a-corpus-using-scikit-learn-countvectorizer/27488756
# Retrieved how to use cv_fit.sum(axis = 0) as a quick way to count the word frequency.
yelp_frequency = np.asarray(vectorized_yelp_train.sum(axis=0))
yelp_frequency = np.ndarray.flatten(yelp_frequency)

imdb_vocabulary = count_vectorizer_imdb.get_feature_names()
imdb_frequency = np.asarray(vectorized_imdb_train.sum(axis=0))
imdb_frequency = np.ndarray.flatten(imdb_frequency)

with open("yelp-vocab.txt",'w+') as f:
    for i in range(10000):
        f.write("{}\t{}\t{}\n".format(yelp_vocabulary[i],i,yelp_frequency[i]))
        
with open("imdb-vocab.txt",'w+') as f:
    for i in range(10000):
        f.write("{}\t{}\t{}\n".format(imdb_vocabulary[i],i,imdb_frequency[i]))

yelp_vocab_dict = count_vectorizer_yelp.vocabulary_  
imdb_vocab_dict = count_vectorizer_imdb.vocabulary_ 

def write_file(filename, df, dictionary):
    with open(filename,'w+') as f:
        i = -1
        for text, rating in zip(df['review'], df['rating']):
            review = ""
            i += 1
            words = text.split()
            for word in words:
                this_word = dictionary.get(word,None)           
                if this_word is not None:
                    review += str(this_word)+ " "                
            f.write("{}\t{}\n".format(review, rating))
            
write_file("yelp-train.txt", yelp_train, yelp_vocab_dict)
write_file("yelp-valid.txt", yelp_valid, yelp_vocab_dict)
write_file("yelp-test.txt", yelp_test, yelp_vocab_dict)

write_file("imdb-train.txt", imdb_train, imdb_vocab_dict)
write_file("imdb-valid.txt", imdb_valid, imdb_vocab_dict)
write_file("imdb-test.txt", imdb_test, imdb_vocab_dict)

### Question 2
For this question, we will focus on the yelp dataset with binary bag-of-words (BBoW) representation. We will use the F1-measure as the evaluation metric for the entire assignment.
* As a baseline, report the performance of the random classifier (a classifier which classifies a review into an uniformly random class) and the majority-class classifier (a classifier which computes the majority class in the training set and classifies all test instances as that majority class).
* Now train Naive Bayes, Decision Trees, and Linear SVM for this task. Note: You should do a thorough hyper-parameter tuning by using the given validation set. Also, note that you should use the appropriate naive Bayes classifier for binary input features (also called Bernoulli naive Bayes).
* Report the list of hyper-parameters you considered for each classifier, the range of the individual hyper-parameters and the best value for these hyper-parameters chosen based on the validation set performance1.
* Report training, validation, and test F1-measure for all the classifiers (with best hyper-parameter configuration).
* Comment about the performance of different classifiers. Why did a particular classifier performe better than the others? What was the role of that hyperparameter that fetched you the best results.

In [4]:
# Classifiers

yelp_train['rating'] = yelp_train['rating'].astype(np.int32)
yelp_valid['rating'] = yelp_valid['rating'].astype(int)
yelp_test['rating'] = yelp_test['rating'].astype(int)

imdb_train['rating'] = imdb_train['rating'].astype(int)
imdb_valid['rating'] = imdb_valid['rating'].astype(int)
imdb_test['rating'] = imdb_test['rating'].astype(int)

# Dummy
random_dummy = DummyClassifier(strategy = 'uniform')
majority_dummy = DummyClassifier(strategy = 'most_frequent')

# Naive-Bayes, SVM, Tree
# Using Grid search as proposed in: https://stackoverflow.com/questions/33830959/multinomial-naive-bayes-parameter-alpha-setting-scikit-learn
nb_params = ParameterGrid({'alpha':[1e-5, 1e-4, 1e-3,0.01,0.1,0.3,0.6,1]})
tree_params = ParameterGrid({'criterion':['gini','entropy'],'max_depth':[None,5,10,100,500],'min_samples_split':[3,5,10,15,20]})
svm_params = ParameterGrid({'loss':['hinge','squared_hinge'],'C':[0.5,1.0,1.5,2.0,5.0,10.0,100.0]})

classifiers = [(BernoulliNB, nb_params), (DecisionTreeClassifier, tree_params), (svm.LinearSVC, svm_params)]

In [5]:
def model_validation(model, yelp_dataset = True):
    if yelp_dataset:
        model.fit(vectorized_yelp_train, yelp_train['rating'].astype(int))
        valid = model.predict(vectorized_yelp_valid)
        f1_valid = f1_score(yelp_valid['rating'].astype(int), valid, average='micro')
    else:
        model.fit(vectorized_imdb_train, imdb_train['rating'].astype(int))
        valid = model.predict(vectorized_imdb_valid)
        f1_valid = f1_score(imdb_valid['rating'].astype(int), valid, average='binary')
    return f1_valid

def model_train_valid_test(model, yelp_dataset = True):
    if yelp_dataset:
        model.fit(vectorized_yelp_train, yelp_train['rating'].astype(int))
        train = model.predict(vectorized_yelp_train)
        valid = model.predict(vectorized_yelp_valid)
        test = model.predict(vectorized_yelp_test)
        f1_train = f1_score(yelp_train['rating'].astype(int), train, average='micro')
        f1_valid = f1_score(yelp_valid['rating'].astype(int), valid, average='micro')
        f1_test = f1_score(yelp_test['rating'].astype(int), test, average='micro')
    else:
        model.fit(vectorized_imdb_train, imdb_train['rating'].astype(int))
        train = model.predict(vectorized_imdb_train)
        valid = model.predict(vectorized_imdb_valid)
        test = model.predict(vectorized_imdb_test)
        f1_train = f1_score(imdb_train['rating'].astype(int), train, average='binary')
        f1_valid = f1_score(imdb_valid['rating'].astype(int), valid, average='binary')
        f1_test = f1_score(imdb_test['rating'].astype(int), test, average='binary')
    return f1_train, f1_valid, f1_test

In [6]:
# find best params for a classifier 
def optimize_parameters(classifier, grid, yelp = True):
    best_f1 = 0.0
    optimal_params = None
    for params in grid:
        print("Test for:", params)
        f1 = model_validation(classifier(**params), yelp)
        print("F1 :{}\n".format(f1))
        if f1>best_f1:
            best_f1=f1
            optimal_params=params       
    print("Best params: {}".format(optimal_params))
    print("Best F1 Score: {}\n".format(best_f1))
    return classifier(**optimal_params)

print('Classifier Tests\n')
print('Random Dummy Classifier')
print(model_validation(random_dummy))
print('Majority Dummy Classifier')
print(model_validation(majority_dummy))
print('Naive Bayes Classifier')
print('Decision Tree Classifier')
print('SVM Classifier')

for param in classifiers: # cycle through the classifiers and parameters
    clf = param[0]
    param_grid = param[1]
    print(clf)    
    best_clf = optimize_parameters(clf,param_grid, True) 

Classifier Tests

Random Dummy Classifier
0.185
Majority Dummy Classifier
0.356
Naive Bayes Classifier
Decision Tree Classifier
SVM Classifier
<class 'sklearn.naive_bayes.BernoulliNB'>
Test for: {'alpha': 1e-05}
F1 Score Validation:0.404

Test for: {'alpha': 0.0001}
F1 Score Validation:0.412

Test for: {'alpha': 0.001}
F1 Score Validation:0.426

Test for: {'alpha': 0.01}
F1 Score Validation:0.428

Test for: {'alpha': 0.1}
F1 Score Validation:0.41

Test for: {'alpha': 0.3}
F1 Score Validation:0.398

Test for: {'alpha': 0.6}
F1 Score Validation:0.394

Test for: {'alpha': 1}
F1 Score Validation:0.388

Best params for Validation: {'alpha': 0.01}
Best F1 Score on Validation: 0.428

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Test for: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 3}
F1 Score Validation:0.31

Test for: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 5}
F1 Score Validation:0.309

Test for: {'criterion': 'gini', 'max_depth': None, 'min_samp

In [7]:
# Optimal Parameters derived from above:
# NB: {'aplha': 0.01}
# Tree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 10}
# Linear SVC: {'C': 0.5, 'loss': 'squared_hinge'}

optimized_bayes = BernoulliNB(alpha = 0.01)
optimized_tree = DecisionTreeClassifier(criterion = 'entropy', max_depth = 10, min_samples_split = 10, random_state = 42)
optimized_svm = svm.LinearSVC(C = 0.5, loss = 'squared_hinge', random_state = 42)

print('Optimal NB f1 for train, valid, test', model_train_valid_test(optimized_bayes))
print('Optimal Tree f1 for train, valid, test', model_train_valid_test(optimized_tree))
print('Optimal SVM f1 for train, valid, test', model_train_valid_test(optimized_svm))

Optimal NB f1 for train, valid, test (0.7478571428571429, 0.428, 0.4395)
Optimal Tree f1 for train, valid, test (0.5022857142857143, 0.413, 0.3895)
Optimal SVM f1 for train, valid, test (0.9931428571428571, 0.465, 0.4475)


### Question 3
Now we will repeat question 2 but with frequency bag-of-words (FBoW) representation.
* Train Naive Bayes, Decision Trees, and Linear SVM for this task.
* Report the list of hyper-parameters you considered for each classifier, the range of the individual hyper-parameters and the best value for these hyper-parameters chosen based on the validation set performance.
* Report training, validation, and test F1-measure for all the classifiers (with best hyper-parameter configuration).
* Comment about the performance of different classifiers. Why did a particular classifier perform better than the others? What was the role of that hyperparameter that fetched you the best results.
* Compare the performance with the binary bag-of-words based classifiers. Why the difference in performance? Give a brief explanation comparing BBoW Naive Bayes and FBoW Naive Bayes and similarly for Decision Trees and Linear SVM.
* Which representation is better? Why?

In [8]:
freq_count_vectorizer_yelp = CountVectorizer(max_features = 10000, binary = False) 
freq_count_vectorizer_imdb = CountVectorizer(max_features = 10000, binary = False) 

vectorized_yelp_train = freq_count_vectorizer_yelp.fit_transform(yelp_train['review'])
vectorized_yelp_test = freq_count_vectorizer_yelp.transform(yelp_test['review'])
vectorized_yelp_valid = freq_count_vectorizer_yelp.transform(yelp_valid['review'])

normalize = Normalizer(norm='l1')

vectorized_yelp_train = normalize.transform(vectorized_yelp_train)
vectorized_yelp_test = normalize.transform(vectorized_yelp_test)
vectorized_yelp_valid = normalize.transform(vectorized_yelp_valid)

In [9]:
print('Classifier Tests\n')
print('Random Dummy Classifier')
print(model_validation(random_dummy))
print('Majority Dummy Classifier')
print(model_validation(majority_dummy))
print('Naive Bayes Classifier')
print('Decision Tree Classifier')
print('SVM Classifier')

for param in classifiers: # cycle through the classifiers and parameters
    clf = param[0]
    param_grid = param[1]
    print(clf)    
    best_clf = optimize_parameters(clf,param_grid) 

Classifier Tests

Random Dummy Classifier
0.212
Majority Dummy Classifier
0.356
Naive Bayes Classifier
Decision Tree Classifier
SVM Classifier
<class 'sklearn.naive_bayes.BernoulliNB'>
Test for: {'alpha': 1e-05}
F1 Score Validation:0.3970000000000001

Test for: {'alpha': 0.0001}
F1 Score Validation:0.404

Test for: {'alpha': 0.001}
F1 Score Validation:0.421

Test for: {'alpha': 0.01}
F1 Score Validation:0.425

Test for: {'alpha': 0.1}
F1 Score Validation:0.402

Test for: {'alpha': 0.3}
F1 Score Validation:0.393

Test for: {'alpha': 0.6}
F1 Score Validation:0.391

Test for: {'alpha': 1}
F1 Score Validation:0.383

Best params for Validation: {'alpha': 0.01}
Best F1 Score on Validation: 0.425

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Test for: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 3}
F1 Score Validation:0.329

Test for: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 5}
F1 Score Validation:0.342

Test for: {'criterion': 'gini', 'max_depth': 

In [10]:
# Optimal Parameters derived from above:
# NB: {'aplha': 0.01}
# Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}
# Linear SVC: {'C': 0.5, 'loss': 'squared_hinge'}

optimized_bayes = BernoulliNB(alpha = 0.01)
optimized_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 10, random_state = 42)
optimized_svm = svm.LinearSVC(C = 0.5, loss = 'squared_hinge', random_state = 42)

print('Optimal NB f1 for train, valid, test', model_train_valid_test(optimized_bayes))
print('Optimal Tree f1 for train, valid, test', model_train_valid_test(optimized_tree))
print('Optimal SVM f1 for train, valid, test', model_train_valid_test(optimized_svm))

Optimal NB f1 for train, valid, test (0.7482857142857143, 0.425, 0.437)
Optimal Tree f1 for train, valid, test (0.5434285714285715, 0.39, 0.3785)
Optimal SVM f1 for train, valid, test (0.48328571428571426, 0.43099999999999994, 0.4545)


### Question 4
Repeat Question 2 for IMDB.

In [11]:
print('Classifier Tests\n')
print('Random Dummy Classifier')
print(model_validation(random_dummy), False)
print('Majority Dummy Classifier')
print(model_validation(majority_dummy), False)
print('Naive Bayes Classifier')
print('Decision Tree Classifier')
print('SVM Classifier')

for param in classifiers: # cycle through the classifiers and parameters
    clf = param[0]
    param_grid = param[1]
    print(clf)    
    best_clf = optimize_parameters(clf,param_grid, False) 

Classifier Tests

Random Dummy Classifier
0.223 False
Majority Dummy Classifier
0.356 False
Naive Bayes Classifier
Decision Tree Classifier
SVM Classifier
<class 'sklearn.naive_bayes.BernoulliNB'>
Test for: {'alpha': 1e-05}
F1 Score Validation:0.842041189002739

Test for: {'alpha': 0.0001}
F1 Score Validation:0.8422440904940651

Test for: {'alpha': 0.001}
F1 Score Validation:0.8421266233766233

Test for: {'alpha': 0.01}
F1 Score Validation:0.8423615337796714

Test for: {'alpha': 0.1}
F1 Score Validation:0.8422440904940651

Test for: {'alpha': 0.3}
F1 Score Validation:0.8418916176172113

Test for: {'alpha': 0.6}
F1 Score Validation:0.8411328799106691

Test for: {'alpha': 1}
F1 Score Validation:0.840682788051209

Best params for Validation: {'alpha': 0.01}
Best F1 Score on Validation: 0.8423615337796714

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Test for: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 3}
F1 Score Validation:0.6990947975728637

Test for: {'criterio

In [12]:
# Optimal Parameters derived from above:
# NB: {'aplha': 0.01}
# Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}
# Linear SVC: {'C': 0.5, 'loss': 'squared_hinge'}

optimized_bayes = BernoulliNB(alpha = 0.01)
optimized_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 10, random_state = 42)
optimized_svm = svm.LinearSVC(C = 0.5, loss = 'squared_hinge', random_state = 42)

print('Optimal NB f1 for train, valid, test', model_train_valid_test(optimized_bayes, False))
print('Optimal Tree f1 for train, valid, test', model_train_valid_test(optimized_tree, False))
print('Optimal SVM f1 for train, valid, test', model_train_valid_test(optimized_svm, False))

Optimal NB f1 for train, valid, test (0.8718817787418656, 0.8423615337796714, 0.8318656900666611)
Optimal Tree f1 for train, valid, test (0.7708098635623983, 0.7296233839235526, 0.7262984336356142)
Optimal SVM f1 for train, valid, test (1.0, 0.8468016843793864, 0.836321341194401)


### Question 5
Repeat Question 3 for IMDB.

In [13]:
freq_count_vectorizer_imdb = CountVectorizer(max_features = 10000, binary = False) 

vectorized_imdb_train = freq_count_vectorizer_imdb.fit_transform(imdb_train['review'])
vectorized_imdb_test = freq_count_vectorizer_imdb.transform(imdb_test['review'])
vectorized_imdb_valid = freq_count_vectorizer_imdb.transform(imdb_valid['review'])

normalize = Normalizer(norm='l1')

vectorized_imdb_train = normalize.transform(vectorized_imdb_train)
vectorized_imdb_test = normalize.transform(vectorized_imdb_test)
vectorized_imdb_valid = normalize.transform(vectorized_imdb_valid)

In [14]:
print('Classifier Tests\n')
print('Random Dummy Classifier')
print(model_validation(random_dummy), False)
print('Majority Dummy Classifier')
print(model_validation(majority_dummy), False)
print('Naive Bayes Classifier')
print('Decision Tree Classifier')
print('SVM Classifier')

for param in classifiers: # cycle through the classifiers and parameters
    clf = param[0]
    param_grid = param[1]
    print(clf)    
    best_clf = optimize_parameters(clf,param_grid, False)

Classifier Tests

Random Dummy Classifier
0.213 False
Majority Dummy Classifier
0.356 False
Naive Bayes Classifier
Decision Tree Classifier
SVM Classifier
<class 'sklearn.naive_bayes.BernoulliNB'>
Test for: {'alpha': 1e-05}
F1 Score Validation:0.839167935058346

Test for: {'alpha': 0.0001}
F1 Score Validation:0.8393382726073279

Test for: {'alpha': 0.001}
F1 Score Validation:0.8397442403328935

Test for: {'alpha': 0.01}
F1 Score Validation:0.8398497004163704

Test for: {'alpha': 0.1}
F1 Score Validation:0.8402234636871508

Test for: {'alpha': 0.3}
F1 Score Validation:0.8403088175538399

Test for: {'alpha': 0.6}
F1 Score Validation:0.8404352689921692

Test for: {'alpha': 1}
F1 Score Validation:0.8398982188295165

Best params for Validation: {'alpha': 0.6}
Best F1 Score on Validation: 0.8404352689921692

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Test for: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 3}
F1 Score Validation:0.6914176476506692

Test for: {'criterio

In [16]:
# Optimal Parameters derived from above:
# NB: {'aplha': 0.6}
# Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}
# Linear SVC: {'C': 100.0, 'loss': 'squared_hinge', 'random_state': 42}

optimized_bayes = BernoulliNB(alpha = 0.6)
optimized_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 10, random_state = 42)
optimized_svm = svm.LinearSVC(C = 100.0, loss = 'squared_hinge', random_state = 42)

print('Optimal NB f1 for train, valid, test', model_train_valid_test(optimized_bayes, False))
print('Optimal Tree f1 for train, valid, test', model_train_valid_test(optimized_tree, False))
print('Optimal SVM f1 for train, valid, test', model_train_valid_test(optimized_svm, False))

Optimal NB f1 for train, valid, test (0.86973648465091, 0.8404352689921692, 0.8298329750289399)
Optimal Tree f1 for train, valid, test (0.7947765525246664, 0.7427874139283535, 0.7464284472870103)
Optimal SVM f1 for train, valid, test (0.9490737666821593, 0.8784158415841584, 0.874476820664089)
