# Movie Review Classifier
Author: Cody W. Eilar & Venktatesh Jatla<br/>
Date: 10/9/15

In [None]:
import pandas
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

## Read the positive and negative training reviews
Do we want a dictionary or do we want to use pandas? I left the pandas code in there just in case it is useful in the end. If we end up not using it, let's delete it. 

In [None]:
# Relative paths to the per-class training files. Each one is later read by
# get_array() as a tab-separated table with the columns "word" and "counts".
pos_train_file = "../../results/train_pos.txt"
neg_train_file = "../../results/train_neg.txt"


In [None]:
def get_array(file_name, sent_type):
    """
    Load a tab-separated word/count file and label every row with a sentiment.

    file_name = path to the data file; each line is read as "word<TAB>counts".
    sent_type = sentiment type (+1 for positive, -1 for negative review).

    Returns a pandas DataFrame with the columns "word", "counts" and "type",
    where "type" holds sent_type as a float in every row.
    """
    frame = pandas.read_csv(
        file_name,
        sep='\t',
        quoting=csv.QUOTE_NONE,  # quote characters are ordinary text, not delimiters
        names=["word", "counts"],
    )
    # One float label per row of the file.
    frame['type'] = np.full(len(frame), sent_type, dtype=np.float64)
    return frame
    
    

In [None]:
# Load both training sets, labelling negative reviews -1 and positive reviews +1.
neg_data = get_array(neg_train_file, -1)
pos_data = get_array(pos_train_file, 1)


## Now we need to normalize the counts
We are going to do this by dividing each word count by the total number of words in the vocabulary. 

In [None]:
def strip(text):
    """
    Remove leading and trailing whitespace from text.

    Values that have no strip() method (anything that is not string-like)
    are returned unchanged.
    """
    try:
        cleaned = text.strip()
    except AttributeError:
        # Not a string-like value: hand it back untouched.
        cleaned = text
    return cleaned

# Load the stemmed vocabulary, one entry per line, into a single "vocab" column.
# The converter strips surrounding whitespace from every entry.
# NOTE(review): newer pandas releases appear to reject '\n' as a separator --
# confirm against the installed pandas version.
vocab_file = "../../files/stemmed_vocab.txt"
vocab_data = pandas.read_csv(vocab_file, sep='\n', quoting=csv.QUOTE_NONE, names=["vocab"], converters={'vocab' : strip})
# The file contains duplicate entries; keep only one row per distinct entry.
vocab_data = vocab_data.drop_duplicates()


def normalize_freq(data_frame, vocab_data):
    """
    Append a 'freq' column with the frequency of occurrence of each word.

    data_frame = training data with a numeric "counts" column; modified in place.
    vocab_data = the data frame representing the vocabulary; only its number
                 of rows is used.

    The frequency is the word count divided by the vocabulary size
    (len(vocab_data)). Returns the same data_frame object that was passed in.
    """
    vocab_size = len(vocab_data)
    # Divide the column as a Series so 'freq' is assigned a plain column of
    # values rather than a one-column DataFrame.
    data_frame['freq'] = data_frame["counts"] / vocab_size
    return data_frame

# normalize_freq() adds the 'freq' column to the frame it is given and returns
# that same frame, so neg_data_norm / pos_data_norm refer to neg_data / pos_data.
neg_data_norm = normalize_freq(neg_data, vocab_data)
pos_data_norm = normalize_freq(pos_data, vocab_data)

## Limit the number of features
We currently have a lot of features, so we limit them by sorting each class's words by count in descending order and keeping only the first 5000.

In [None]:
# Keep the 5000 rows with the highest counts from each class.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
neg_subset = neg_data_norm.sort_values(by='counts', ascending=False).head(5000)
pos_subset = pos_data_norm.sort_values(by='counts', ascending=False).head(5000)

# Show the top of each subset as a quick sanity check.
print("*"*50)
print("The first 10 entries of the negative data")
print("*"*50)
print(neg_subset.head(10))
print("")
print("*"*50)
print("The first 10 entries of the positive data")
print("*"*50)
print(pos_subset.head(10))

## Combine the positive and negative training data sets

In [None]:
# Stack the negative rows on top of the positive rows.
# DataFrame.append() was removed in pandas 2.0; pandas.concat() produces the
# same frame and likewise keeps each input's original index labels.
train_data = pandas.concat([neg_subset, pos_subset])


In [None]:
def get_vocab_idx(vocab_data, word):
    """
    Return the index label of the first row of vocab_data whose "vocab"
    entry equals word.

    Raises IndexError if word does not occur in the vocabulary.
    """
    is_match = vocab_data["vocab"] == word
    matching_labels = vocab_data[is_match].index.tolist()
    return matching_labels[0]

def create_vocab(vocab_data, data_frame):
    """
    Convert the words of the training data to their integer representation
    in the vocabulary.

    vocab_data = The data frame representing the vocabulary
    data_frame = The data frame representing the training data

    Adds an 'integer' column to data_frame (in place) holding, for every row,
    the vocabulary index of that row's "word", and returns data_frame.
    """
    data_frame['integer'] = [
        get_vocab_idx(vocab_data, token) for token in data_frame["word"]
    ]
    return data_frame
    
# Peek at the vocabulary before mapping words onto it.
print (vocab_data.head(10))

# create_vocab() adds the 'integer' column to train_data itself and returns it,
# so new_train_data and train_data are the same frame.
new_train_data = create_vocab(vocab_data, train_data)

## Display a few comparisons between words and their frequencies
We can see that, for some entries, the frequencies are pretty close!

In [None]:
# Show the ten rows with the smallest vocabulary indices.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
print(new_train_data.sort_values(by='integer', ascending=True).head(10))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit a TF-IDF transform on the training matrix and apply it in one step.
# NOTE(review): each input row is a (vocabulary index, count) pair taken from
# the 'integer' and 'counts' columns, not a per-document term-count vector --
# confirm this is the intended input for TfidfTransformer.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(new_train_data[['integer', 'counts']].values, y=new_train_data[['type']].values)
# Dense form of the transformed matrix (the last expression of the cell).
tfidf.toarray()

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent on the TF-IDF rows.
# The n_iter argument no longer exists in SGDClassifier; max_iter=5 together
# with tol=None keeps the original behaviour of exactly five passes over the data.
clf = SGDClassifier(penalty='l2', alpha=1e-3, max_iter=5, tol=None, random_state=42)
# fit() expects a 1-D label vector, so flatten the single 'type' column.
Y = new_train_data[['type']].values
clf.fit(tfidf, Y.ravel())
tfidf.toarray()

## Check accuracy
To see if this is a good method for classification, we now need to create a confusion matrix of the predicted labels vs. the actual labels. To do this with the method we have chosen, we simply sum the predicted labels. If the sum is less than 0, then we classify it as a negative review, and if it is positive, then we classify it as a positive review. I've taken all the steps above and condensed them into vectorizer.py to make the transformation of text to matrix more readable.

In [None]:
from vectorizer import vectorizer
import glob
# Glob patterns for the positive and negative review files, followed by the
# locations of the mapper, reducer and vocabulary that are passed to vectorizer().
# NOTE(review): these are absolute paths on one developer's machine; they must
# be adjusted before this cell can run anywhere else.
files_dir = ["/Users/cody/Downloads/aclImdb/train/pos/*.txt", "/Users/cody/Downloads/aclImdb/train/neg/*.txt"]            
path_to_mapper = "/Users/cody/Repos/SentimentAnalysis/build/bin/mapper"      
path_to_reducer = "/Users/cody/Repos/SentimentAnalysis/build/bin/reducer.py" 
path_to_vocab = "/Users/cody/Repos/SentimentAnalysis/files/stemmed_vocab.txt"

# One predicted and one actual label (+1 or -1) is collected per review file.
predicted_y = []
actual_y = []
for types in files_dir:
    # Only the first 20 files matched by each pattern are evaluated.
    for iterations, file_name in enumerate(glob.iglob(types)):
        if iterations >= 20:
            break

        test = vectorizer(file_name, path_to_mapper, path_to_reducer, path_to_vocab)
        a = clf.predict(test[0])

        # A review is called positive when its predictions sum to more than 0.
        predicted_y.append(1 if a.sum() > 0 else -1)
        # The actual label is derived the same way from the second item of test.
        actual_y.append(1 if test[1].sum() > 0 else -1)

## Get confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# Confusion matrix of the actual labels (first argument) against the predicted
# labels (second argument).
cm = confusion_matrix(actual_y, predicted_y)
%matplotlib inline
print(cm)
# Draw the matrix as an image with a colour scale; the axis labels below put
# the true label on the y axis and the predicted label on the x axis.
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

## Results
We can see from the above plot that we are not doing a very good job of classifying. Something is not working very well. 