In [53]:
import pandas as pd
import numpy as np
import re
import collections
import random
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn import tree, svm
# Load the four tab-separated review files. Each file has no header row, so the
# two columns are named explicitly: the review text and its label.
# Loading approach taken from the Reddit discussion:
# https://www.reddit.com/r/McGill_comp551s1/comments/7x2sdy/assignment3/
yelp_train, yelp_test, imdb_train, imdb_test = (
    pd.read_csv(dataset_path, sep='\t', names=["Comment", "Evaluation"])
    for dataset_path in (
        'Datasets/yelp-train.txt',
        'Datasets/yelp-test.txt',
        'Datasets/IMDB-train.txt',
        'Datasets/IMDB-test.txt',
    )
)

In [4]:
#data clean up
def pre_process(review):
    #we remove the breaks from the dataset
    review = re.sub('<br /><br />', '', review)
    return re.sub(r'[^a-zA-Z ]', '', review).lower()

# Remove all punctuation from the training comments and lower-case them.
# The whole 'Comment' column is rewritten in one step with Series.apply;
# the previous row-by-row DataFrame.set_value calls no longer exist in
# pandas >= 1.0 (deprecated in 0.21).
yelp_train['Comment'] = yelp_train['Comment'].apply(pre_process)
imdb_train['Comment'] = imdb_train['Comment'].apply(pre_process)

In [5]:
def get_vocabulary(training_set, vocab_size=10000):
    """Return the most frequent words of a training set with their counts.

    Args:
        training_set: any mapping-like object whose 'Comment' entry iterates
            over review strings (e.g. a DataFrame with a 'Comment' column).
        vocab_size: maximum number of words to keep (default 10000).

    Returns:
        A list of (word, count) tuples ordered from most to least frequent,
        at most ``vocab_size`` long. Words with equal counts keep the order
        in which they were first seen.
    """
    word_counts = collections.Counter()
    for review in training_set['Comment']:
        # split(' ') yields '' for consecutive spaces; filter(None, ...) drops them
        word_counts.update(filter(None, review.split(' ')))

    return word_counts.most_common(vocab_size)

In [6]:
def generate_bbow_vector(words, sentence, vocab_size=10000):
    """Encode a sentence as a binary bag-of-words vector.

    Args:
        words: sequence of vocabulary words; a word's position in the
            sequence is its index in the output vector.
        sentence: raw text; it is cleaned with ``pre_process`` first.
        vocab_size: length of the returned vector (default 10000).

    Returns:
        A numpy array of length ``vocab_size`` holding 1 at the index of
        every vocabulary word that occurs in the sentence and 0 elsewhere.
    """
    sentence = pre_process(sentence)

    # One dict lookup per token instead of scanning the whole word list twice
    # (`in` + `.index`). setdefault keeps the first position of a repeated
    # word, which is what list.index would have returned.
    index_of = {}
    for position, vocab_word in enumerate(words):
        index_of.setdefault(vocab_word, position)

    vector = np.zeros(vocab_size)
    for word in filter(None, sentence.split(' ')):
        position = index_of.get(word)
        if position is not None:
            vector[position] = 1
    return vector

def generate_freq_vector(words, sentence, vocab_size=10000):
    """Encode a sentence as a relative-frequency bag-of-words vector.

    Args:
        words: sequence of vocabulary words; a word's position in the
            sequence is its index in the output vector.
        sentence: raw text; it is cleaned with ``pre_process`` first.
        vocab_size: length of the returned vector (default 10000).

    Returns:
        A numpy array of length ``vocab_size`` where each entry is the number
        of occurrences of that vocabulary word divided by the number of
        in-vocabulary tokens in the sentence. If the sentence contains no
        vocabulary word the all-zero vector is returned.
    """
    sentence = pre_process(sentence)

    # One dict lookup per token instead of scanning the whole word list twice
    # (`in` + `.index`). setdefault keeps the first position of a repeated
    # word, which is what list.index would have returned.
    index_of = {}
    for position, vocab_word in enumerate(words):
        index_of.setdefault(vocab_word, position)

    vector = np.zeros(vocab_size)
    for word in filter(None, sentence.split(' ')):
        position = index_of.get(word)
        if position is not None:
            vector[position] += 1.

    # The counts sum to the number of in-vocabulary tokens.
    total_words_in_vocab = vector.sum()
    # Guard the normalisation: dividing by zero would fill the vector with NaN.
    if total_words_in_vocab > 0:
        vector /= total_words_in_vocab
    return vector

In [7]:
# Ranked (word, count) vocabularies computed from the training reviews.
yelp_vocabulary = get_vocabulary(yelp_train)
imdb_vocabulary = get_vocabulary(imdb_train)

# Only the words, in the same order as the vocabulary entries; the position of
# a word in these lists is what the vector builders use as its feature index.
yelp_words = [word for word, _count in yelp_vocabulary]
imdb_words = [word for word, _count in imdb_vocabulary]

# print(generate_bbow_vector(yelp_words, "willow and of the"))
# print(generate_freq_vector(yelp_words, "willow asdlkfjalsdfk and and of the"))
# print(generate_bbow_vector(imdb_words, "willow and of the"))
# print(generate_freq_vector(imdb_words, "willow asdlkfjalsdfk and and of the"))

[ 1.  1.  0. ...,  0.  0.  1.]
[ 0.2  0.4  0.  ...,  0.   0.   0.2]
[ 1.  0.  1. ...,  0.  0.  0.]
[ 0.25  0.    0.5  ...,  0.    0.    0.  ]


In [8]:
# Write the vocabulary files for the IMDB and Yelp datasets, as described in the report handout
def create_vocab_file(vocabulary, filename):
    """Write a vocabulary to ``filename``, one tab-separated entry per line.

    Args:
        vocabulary: iterable of 2-item entries; as used in this notebook these
            are the (word, count) tuples returned by ``get_vocabulary``.
        filename: path of the file to create or overwrite.

    Each line is "<line number>\\t<first item>\\t<second item>", with line
    numbers starting at 1.
    """
    with open(filename, 'w') as the_file:
        for rank, (word, count) in enumerate(vocabulary, start=1):
            fields = (str(rank), str(word), str(count))
            the_file.write("\t".join(fields) + "\n")

# Persist both vocabularies next to the datasets they were built from.
for vocabulary, filename in (
    (yelp_vocabulary, "Datasets/yelp-vocab.txt"),
    (imdb_vocabulary, "Datasets/IMDB-vocab.txt"),
):
    create_vocab_file(vocabulary, filename)

In [27]:
# From here on, we actually train and test the classifiers.

def random_classifier(train_data, test_data, min_class, max_class, seed=None):
    """Baseline that predicts a uniformly random class for every sample.

    This function only produces predictions; scoring them (e.g. with F1) is
    left to the caller.

    Args:
        train_data: training samples; only ``len(train_data)`` is used.
        test_data: test samples; only ``len(test_data)`` is used.
        min_class: smallest class label (inclusive).
        max_class: largest class label (inclusive).
        seed: optional seed. When given, predictions come from a private
            ``random.Random(seed)`` and are reproducible; when None
            (default) the module-level ``random`` generator is used.

    Returns:
        (train_prediction, test_prediction): two lists of integer labels, one
        entry per training / test sample.
    """
    # A private generator makes seeded runs reproducible without touching the
    # global random state.
    rng = random if seed is None else random.Random(seed)

    # Training predictions are drawn before test predictions.
    train_prediction = [rng.randint(min_class, max_class) for _ in range(len(train_data))]
    test_prediction = [rng.randint(min_class, max_class) for _ in range(len(test_data))]
    return train_prediction, test_prediction


In [28]:
# Yelp random baseline: every review gets a random class between 1 and 5.
yelp_random_train_predict, yelp_random_test_predict = random_classifier(
    yelp_train, yelp_test, 1, 5
)

# Micro-averaged F1 of the random predictions on each split.
yelp_rand_train_f1 = f1_score(
    yelp_train['Evaluation'], yelp_random_train_predict, average='micro'
)
yelp_rand_test_f1 = f1_score(
    yelp_test['Evaluation'], yelp_random_test_predict, average='micro'
)

print("".join(["Train f1 is ", str(yelp_rand_train_f1), ", Test F1 is ", str(yelp_rand_test_f1)]))

Train f1 is 0.200285714286, Test F1 is 0.1965


In [30]:
# IMDB random baseline: every review gets a random class, 0 or 1.
imdb_random_train_predict, imdb_random_test_predict = random_classifier(
    imdb_train, imdb_test, 0, 1
)

# F1 with f1_score's default averaging on each split.
imdb_rand_train_f1 = f1_score(imdb_train['Evaluation'], imdb_random_train_predict)
imdb_rand_test_f1 = f1_score(imdb_test['Evaluation'], imdb_random_test_predict)

print("".join(["Train f1 is ", str(imdb_rand_train_f1), ", Test F1 is ", str(imdb_rand_test_f1)]))

Train f1 is 0.499833477653, Test F1 is 0.497674418605


In [31]:
def majority_classifier(train_data, test_data):
    """Baseline that always predicts the most frequent training label.

    Args:
        train_data: DataFrame with an 'Evaluation' label column; the majority
            class is learned from this column only.
        test_data: test samples; only ``len(test_data)`` is used, its labels
            are never read.

    Returns:
        (train_prediction, test_prediction): two lists, one entry per training
        / test sample, every entry being the training majority class.
    """
    # The "model" must be fitted on training labels only: taking the mode of
    # the test labels would leak the test answers into the prediction.
    # Series.mode() returns the modes sorted, so a tie resolves to the
    # smallest label.
    majority_class = train_data['Evaluation'].mode()[0]

    train_prediction = [majority_class] * len(train_data)
    test_prediction = [majority_class] * len(test_data)
    return train_prediction, test_prediction

In [33]:
# Yelp majority-class baseline.
yelp_maj_train_predict, yelp_maj_test_predict = majority_classifier(yelp_train, yelp_test)

# NOTE(review): the scores are stored under the `yelp_rand_*` names, so they
# overwrite the values computed for the random baseline; kept as-is here.
yelp_rand_train_f1 = f1_score(
    yelp_train['Evaluation'], yelp_maj_train_predict, average='micro'
)
yelp_rand_test_f1 = f1_score(
    yelp_test['Evaluation'], yelp_maj_test_predict, average='micro'
)

print("".join(["Train f1 is ", str(yelp_rand_train_f1), ", Test F1 is ", str(yelp_rand_test_f1)]))

Train f1 is 0.352571428571, Test F1 is 0.351


In [41]:
# IMDB majority-class baseline, scored with accuracy rather than F1.
imdb_maj_train_predict, imdb_maj_test_predict = majority_classifier(imdb_train, imdb_test)

# NOTE(review): these `imdb_rand_*_f1` names hold accuracy values for the
# majority baseline and overwrite the random-baseline scores; kept as-is here.
imdb_rand_train_f1 = accuracy_score(imdb_train['Evaluation'], imdb_maj_train_predict)
imdb_rand_test_f1 = accuracy_score(imdb_test['Evaluation'], imdb_maj_test_predict)

print("".join([
    "Train accuracy is ", str(imdb_rand_train_f1),
    ", Test accuracy is ", str(imdb_rand_test_f1),
]))

Train accuracy is 0.5, Test accuracy is 0.5


In [46]:
# Prepare the Yelp inputs for the classifiers below (starting with naive Bayes):
# binary bag-of-words feature vectors plus the matching label lists.
yelp_vocabulary = get_vocabulary(yelp_train)
yelp_words = [entry[0] for entry in yelp_vocabulary]

# Training split: one feature vector per comment, labels in the same row order.
training_features_matrix = [
    generate_bbow_vector(yelp_words, comment) for comment in yelp_train['Comment']
]
training_target = list(yelp_train['Evaluation'])

# Test split, encoded with the vocabulary built from the training data.
testing_features_matrix = [
    generate_bbow_vector(yelp_words, comment) for comment in yelp_test['Comment']
]
testing_target = list(yelp_test['Evaluation'])

In [48]:
# Gaussian naive Bayes on the Yelp bag-of-words features.
gnb = GaussianNB()
gnb.fit(training_features_matrix, training_target)

# Score the training split, then the test split (micro-averaged F1).
train_target_pred = gnb.predict(training_features_matrix)
train_gnb_f1 = f1_score(yelp_train['Evaluation'], train_target_pred, average='micro')

test_target_pred = gnb.predict(testing_features_matrix)
test_gnb_f1 = f1_score(yelp_test['Evaluation'], test_target_pred, average='micro')

gnb_report = "GaussianNB: Train F1 Accuracy = " + str(train_gnb_f1)
gnb_report += ", Test F1 Accuarcy = " + str(test_gnb_f1)
print(gnb_report)

Train F1 Accuracy = 0.434285714286, Test F1 Accuarcy = 0.2265


In [51]:
# Decision tree (default settings) on the Yelp bag-of-words features.
dtc = tree.DecisionTreeClassifier()
dtc.fit(training_features_matrix, training_target)

# Score the training split, then the test split (micro-averaged F1).
# NOTE(review): the scores reuse the `*_gnb_f1` names from the GaussianNB cell,
# so they overwrite those values; kept as-is here.
train_target_pred = dtc.predict(training_features_matrix)
train_gnb_f1 = f1_score(yelp_train['Evaluation'], train_target_pred, average='micro')

test_target_pred = dtc.predict(testing_features_matrix)
test_gnb_f1 = f1_score(yelp_test['Evaluation'], test_target_pred, average='micro')

dtc_report = "Decision Trees: Train F1 Accuracy = " + str(train_gnb_f1)
dtc_report += ", Test F1 Accuarcy = " + str(test_gnb_f1)
print(dtc_report)

Decision Trees: Train F1 Accuracy = 0.999857142857, Test F1 Accuarcy = 0.3585


In [None]:
# Support vector classifier (default settings) on the Yelp bag-of-words features.
svmc = svm.SVC()
svmc.fit(training_features_matrix, training_target)
train_target_pred = svmc.predict(training_features_matrix)
test_target_pred = svmc.predict(testing_features_matrix)

# Micro-averaged F1 on each split.
# NOTE(review): the scores reuse the `*_gnb_f1` names from the GaussianNB cell,
# so they overwrite those values; kept as-is here.
train_gnb_f1 = f1_score(yelp_train['Evaluation'], train_target_pred, average='micro')
test_gnb_f1 = f1_score(yelp_test['Evaluation'], test_target_pred, average='micro')

# The label used to read "Decision Trees:" (copied from the previous cell),
# which would attribute the SVM scores to the wrong model.
print("SVM: Train F1 Accuracy = " + str(train_gnb_f1) + ", Test F1 Accuarcy = " + str(test_gnb_f1))