In [18]:
import pandas as pd
import numpy as np
import re
import collections
import random
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import tree, svm
#data import
#taken from Reddit discussion: https://www.reddit.com/r/McGill_comp551s1/comments/7x2sdy/assignment3/
def _load_reviews(path):
    """Read one tab-separated review file into a two-column DataFrame.

    The files carry no header row, so the column names "Comment" and
    "Evaluation" are supplied explicitly.
    """
    return pd.read_csv(path, sep='\t', names=["Comment", "Evaluation"])

# Yelp splits
yelp_train = _load_reviews('Datasets/yelp-train.txt')
yelp_test = _load_reviews('Datasets/yelp-test.txt')
yelp_valid = _load_reviews('Datasets/yelp-valid.txt')
# IMDB splits
imdb_train = _load_reviews('Datasets/IMDB-train.txt')
imdb_test = _load_reviews('Datasets/IMDB-test.txt')
imdb_valid = _load_reviews('Datasets/IMDB-valid.txt')

In [19]:
#data clean up
def pre_process(review):
    """Normalise a raw review string for tokenisation.

    HTML line breaks are turned into a space, every character that is not an
    ASCII letter or a space is dropped, and the result is lower-cased.

    Args:
        review: the raw review text (str).

    Returns:
        The cleaned, lower-case string. It may contain runs of spaces; callers
        that split on ' ' are expected to discard the empty tokens.
    """
    # Replace each break with a space rather than nothing: removing it outright
    # would glue the last word of one paragraph to the first word of the next.
    # Matching a single <br>, <br/> or <br /> also stops a lone break from
    # leaving a stray "br" token once the angle brackets are stripped below.
    review = re.sub(r'<br\s*/?>', ' ', review, flags=re.IGNORECASE)
    return re.sub(r'[^a-zA-Z ]', '', review).lower()

#remove all punctuation and put to lower case
# Apply pre_process to the whole 'Comment' column in one step. The previous
# row-by-row DataFrame.set_value calls no longer work on current pandas
# (set_value was deprecated in 0.21 and removed in 1.0).
yelp_train['Comment'] = yelp_train['Comment'].apply(pre_process)
imdb_train['Comment'] = imdb_train['Comment'].apply(pre_process)

In [20]:
def get_vocabulary(training_set):
    """Count word occurrences across all comments and keep the most frequent.

    Args:
        training_set: mapping/DataFrame whose 'Comment' entry iterates over
            already-cleaned review strings.

    Returns:
        A list of (word, count) tuples for at most the 10000 most common
        words, ordered from most to least frequent.
    """
    tally = collections.Counter()
    for comment in training_set['Comment']:
        # Splitting on a single space yields '' for consecutive spaces;
        # those empty tokens are skipped.
        tally.update(token for token in comment.split(' ') if token)

    # keep only the top 10000 words
    return tally.most_common(10000)

In [21]:
def _vocabulary_positions(words):
    """Map each vocabulary word to the index of its first occurrence in `words`."""
    positions = {}
    for position, vocab_word in enumerate(words):
        # setdefault keeps the first index, matching list.index semantics
        positions.setdefault(vocab_word, position)
    return positions

def generate_bbow_vector(words, sentence):
    """Build a binary bag-of-words vector for one sentence.

    Args:
        words: ordered vocabulary; a word's position is its feature index.
        sentence: raw review text; it is passed through pre_process first.

    Returns:
        A numpy array of length 10000 holding 1 at the index of every
        vocabulary word that occurs in the sentence and 0 elsewhere.
        Words outside the vocabulary are ignored.
    """
    sentence = pre_process(sentence)
    # One dict lookup per token instead of scanning the vocabulary list twice
    # (`in` followed by `.index`) for every token.
    positions = _vocabulary_positions(words)
    vector = np.zeros(10000)
    for word in filter(None, sentence.split(' ')):
        position = positions.get(word)
        if position is not None:
            vector[position] = 1
    return vector

def generate_freq_vector(words, sentence):
    """Build a relative-frequency bag-of-words vector for one sentence.

    Args:
        words: ordered vocabulary; a word's position is its feature index.
        sentence: raw review text; it is passed through pre_process first.

    Returns:
        A numpy array of length 10000. Each entry is the number of times the
        corresponding vocabulary word occurs in the sentence divided by the
        number of in-vocabulary tokens in the sentence. If no token is in the
        vocabulary the vector is all zeros.
    """
    sentence = pre_process(sentence)
    positions = _vocabulary_positions(words)
    vector = np.zeros(10000)
    total_words_in_vocab = 0
    for word in filter(None, sentence.split(' ')):
        position = positions.get(word)
        if position is not None:
            total_words_in_vocab += 1
            vector[position] += 1.
    # Normalise in a single vectorised step; skipped when nothing matched to
    # avoid dividing by zero.
    if total_words_in_vocab != 0:
        vector /= total_words_in_vocab
    return vector

In [22]:
# Vocabularies are lists of (word, count) pairs as returned by get_vocabulary.
yelp_vocabulary = get_vocabulary(yelp_train)
imdb_vocabulary = get_vocabulary(imdb_train)

# Keep only the words, preserving the vocabulary order, so that a word's
# list position can serve as its feature index.
yelp_words = [word for word, _count in yelp_vocabulary]
imdb_words = [word for word, _count in imdb_vocabulary]

# print(generate_bbow_vector(yelp_words, "willow and of the"))
# print(generate_freq_vector(yelp_words, "willow asdlkfjalsdfk and and of the"))
# print(generate_bbow_vector(imdb_words, "willow and of the"))
# print(generate_freq_vector(imdb_words, "willow asdlkfjalsdfk and and of the"))

In [23]:
# Now we will write out the Yelp and IMDB vocabulary files, as described in the report handout
def create_vocab_file(vocabulary, filename):
    """Write a vocabulary to a text file, one entry per line.

    Args:
        vocabulary: iterable of two-item entries; with the output of
            get_vocabulary these are (word, count) pairs.
        filename: path of the file to create or overwrite.

    Each line holds three tab-separated fields: a running number starting
    at 1, the first item of the entry, and the second item of the entry.
    """
    with open(filename, 'w') as vocab_file:
        for rank, (first, second) in enumerate(vocabulary, 1):
            fields = (str(rank), str(first), str(second))
            vocab_file.write("\t".join(fields) + "\n")

# Write each vocabulary to its own file under Datasets/ (any existing file at
# that path is overwritten because create_vocab_file opens it in 'w' mode).
create_vocab_file(yelp_vocabulary, "Datasets/yelp-vocab.txt")
create_vocab_file(imdb_vocabulary, "Datasets/IMDB-vocab.txt")

In [24]:
#From here on in, we will be actually training and testing

# Baseline that ignores the content of the data and guesses a class at random.
def random_classifier(train_data, test_data, min_class, max_class):
    """Draw a uniformly random class label for every train and test example.

    Args:
        train_data: sized collection; only its length is used.
        test_data: sized collection; only its length is used.
        min_class: smallest class label (inclusive).
        max_class: largest class label (inclusive).

    Returns:
        (train_prediction, test_prediction): two lists of ints, one label per
        example. Scoring is left to the caller.
    """
    def _guess(count):
        # random.randint includes both endpoints
        return [random.randint(min_class, max_class) for _ in range(count)]

    # Training labels are drawn first, then test labels.
    train_prediction = _guess(len(train_data))
    test_prediction = _guess(len(test_data))
    return train_prediction, test_prediction


In [25]:
#Yelp Random
# Random guesses over the classes 1..5, scored with micro-averaged F1.
yelp_random_train_predict, yelp_random_test_predict = random_classifier(yelp_train, yelp_test, 1, 5)
yelp_rand_train_f1, yelp_rand_test_f1 = [
    f1_score(frame['Evaluation'], predicted, average='micro')
    for frame, predicted in (
        (yelp_train, yelp_random_train_predict),
        (yelp_test, yelp_random_test_predict),
    )
]

print("".join(["Train f1 is ", str(yelp_rand_train_f1), ", Test F1 is ", str(yelp_rand_test_f1)]))

Train f1 is 0.197714285714, Test F1 is 0.2055


In [26]:
#IMDB Random
# Random guesses over the classes 0 and 1, scored with f1_score's default
# averaging.
imdb_random_train_predict, imdb_random_test_predict = random_classifier(imdb_train, imdb_test, 0,1)
imdb_rand_train_f1, imdb_rand_test_f1 = [
    f1_score(frame['Evaluation'], predicted)
    for frame, predicted in (
        (imdb_train, imdb_random_train_predict),
        (imdb_test, imdb_random_test_predict),
    )
]

print("".join(["Train f1 is ", str(imdb_rand_train_f1), ", Test F1 is ", str(imdb_rand_test_f1)]))

Train f1 is 0.498966597773, Test F1 is 0.501858587474


In [27]:
def majority_classifier(train_data, test_data):
    """Predict the most frequent training label for every example.

    Args:
        train_data: DataFrame with an 'Evaluation' column of class labels.
        test_data: sized collection of test examples; only its length is used.

    Returns:
        (train_prediction, test_prediction): two lists in which every entry
        is the majority class of the training labels. When several labels tie
        for most frequent, the first value returned by Series.mode() is used.
    """
    # The majority class must be learned from the training labels only.
    # Taking the mode of the test labels would leak the answers being
    # evaluated into the prediction.
    majority_label = train_data['Evaluation'].mode()[0]

    train_prediction = [majority_label] * len(train_data)
    test_prediction = [majority_label] * len(test_data)
    return train_prediction, test_prediction

In [28]:
#Yelp Majority Classifier
yelp_maj_train_predict, yelp_maj_test_predict = majority_classifier(yelp_train, yelp_test)
# NOTE(review): the scores are stored in the yelp_rand_* variables, so they
# overwrite the random-baseline values assigned to those names earlier.
yelp_rand_train_f1, yelp_rand_test_f1 = [
    f1_score(frame['Evaluation'], predicted, average='micro')
    for frame, predicted in (
        (yelp_train, yelp_maj_train_predict),
        (yelp_test, yelp_maj_test_predict),
    )
]

print("".join(["Train f1 is ", str(yelp_rand_train_f1), ", Test F1 is ", str(yelp_rand_test_f1)]))

Train f1 is 0.352571428571, Test F1 is 0.351


In [29]:
#IMDB Majority Classifier
imdb_maj_train_predict, imdb_maj_test_predict = majority_classifier(imdb_train, imdb_test)
# NOTE(review): despite the *_f1 names, these two variables hold accuracy
# values here, and they overwrite the random-baseline scores stored earlier.
imdb_rand_train_f1, imdb_rand_test_f1 = [
    accuracy_score(frame['Evaluation'], predicted)
    for frame, predicted in (
        (imdb_train, imdb_maj_train_predict),
        (imdb_test, imdb_maj_test_predict),
    )
]

print("".join(["Train accuracy is ", str(imdb_rand_train_f1), ", Test accuracy is ", str(imdb_rand_test_f1)]))

Train accuracy is 0.5, Test accuracy is 0.5


In [30]:
#now we will do naive bayes.
yelp_vocabulary = get_vocabulary(yelp_train)
yelp_words = [i[0] for i in yelp_vocabulary]

def _build_feature_sets(words, comments):
    """Vectorise every comment in both representations.

    Returns a two-element list: index 0 holds the binary bag-of-words
    vectors, index 1 holds the frequency vectors, both in comment order.
    """
    bbow_vectors = []
    freq_vectors = []
    for sentence in comments:
        bbow_vectors.append(generate_bbow_vector(words, sentence))
        freq_vectors.append(generate_freq_vector(words, sentence))
    return [bbow_vectors, freq_vectors]

# Build the feature matrices for each split with the same helper so the three
# splits cannot drift apart: previously the validation split stored
# binary vectors in the frequency slot (index 1).
training_inputs = _build_feature_sets(yelp_words, yelp_train['Comment'])
testing_inputs = _build_feature_sets(yelp_words, yelp_test['Comment'])
validation_inputs = _build_feature_sets(yelp_words, yelp_valid['Comment'])


In [31]:
def run_classifier(classifier, train_input, train_target, valid_input, valid_target, test_input, test_target):
    """Fit a classifier on the training split and score it on all three splits.

    Args:
        classifier: object exposing fit(X, y) and predict(X).
        train_input, train_target: training features and labels.
        valid_input, valid_target: validation features and labels.
        test_input, test_target: test features and labels.

    Returns:
        (train_f1, valid_f1, test_f1), each a micro-averaged F1 score.
    """
    classifier.fit(train_input, train_target)

    splits = (
        (train_input, train_target),
        (valid_input, valid_target),
        (test_input, test_target),
    )
    # Predict every split first (train, valid, test), then score them.
    predictions = [classifier.predict(features) for features, _ in splits]
    scores = [
        f1_score(targets, predicted, average='micro')
        for (_, targets), predicted in zip(splits, predictions)
    ]
    return tuple(scores)

In [33]:
def run_classifiers(classifiers, data_index):
    """Train and evaluate each classifier on the Yelp data and print its scores.

    Args:
        classifiers: sequence of entries where item 0 is a display label and
            item 1 is the classifier object.
        data_index: which feature representation to read from the module-level
            training_inputs / validation_inputs / testing_inputs lists.

    Prints one line per classifier: the label followed by the
    (train, valid, test) scores returned by run_classifier.
    """
    for entry in classifiers:
        label = entry[0]
        model = entry[1]
        scores = run_classifier(
            model,
            training_inputs[data_index], yelp_train['Evaluation'],
            validation_inputs[data_index], yelp_valid['Evaluation'],
            testing_inputs[data_index], yelp_test['Evaluation'],
        )
        print(str(label) + str(scores))

In [34]:
#Create the classifiers we want to use to train and test our data
classifiers = [
    #We use bernoulli here since, with the binary bag of words, the features are binary.
    ("Bernoulli Naive Bayes: ", BernoulliNB()),
    ("Decision Trees: ", tree.DecisionTreeClassifier()),
    ("SVM: ", svm.LinearSVC()),
]

#run the classifiers for BBOW (data_index = 0)
run_classifiers(classifiers, 0)

Bernoulli Naive Bayes: (0.59771428571428575, 0.38100000000000001, 0.40949999999999998)
Decision Trees: (1.0, 0.34999999999999998, 0.34999999999999998)
SVM: (0.998, 0.441, 0.44900000000000001)


In [None]:
#We will now alter our classifiers list for the frequency representation
# Only slot 0 is swapped (Bernoulli -> Gaussian Naive Bayes); the list is
# mutated in place, so the decision-tree and SVM objects already present in
# the list are reused and fitted again by run_classifiers.
classifiers[0] = ("Gaussian Naive Bayes: ", GaussianNB())
# Now we rerun with data_index = 1 (Frequency data)
run_classifiers(classifiers, 1)

In [None]:
#And then run it on each classifier and print the results (Frequency)