# Movie Sentiment Classification
Cody W. Eilar & Venkatesh Jatla <br/>
Date: 10/12/15

In [None]:
import csv
import glob
import os

import numpy as np
import pandas as pd

## Get the data
In this step we use pandas to load the raw review text files from the given directories and label each review as positive or negative based on the name of the directory it came from.

In [None]:
def create_data_frame(directories, samples):
    """Load review text files into a DataFrame labelled by sentiment.

    Parameters
    ----------
    directories : iterable of str
        Glob patterns (for example ``".../train/pos/*.txt"``), one per
        group of files to load.
    samples : int
        Maximum number of files to read for each pattern.

    Returns
    -------
    pandas.DataFrame
        One row per file, with a ``text`` column holding the file contents
        (trailing newlines removed) and a ``sentiment`` column that is 1
        when one of the path components is exactly ``"pos"`` and -1
        otherwise. When no file matches, an empty frame with the same two
        columns is returned.
    """
    texts = []
    sentiments = []
    for pattern in directories:
        for count, file_name in enumerate(glob.iglob(pattern)):
            if count >= samples:
                break

            # Read each file as exactly one document. Parsing free text with
            # read_csv splits multi-line files into several rows (so texts
            # and labels no longer line up) and applies CSV quote handling
            # to the review text.
            with open(file_name, encoding='utf-8') as handle:
                texts.append(handle.read().rstrip('\n'))

            # Normalise separators first: on Windows glob joins matches with
            # a backslash, so splitting on '/' alone never isolates "pos".
            parts = os.path.normpath(file_name).split(os.sep)
            sentiments.append(1 if "pos" in parts else -1)

    return pd.DataFrame({'text': texts, 'sentiment': sentiments},
                        columns=['text', 'sentiment'])

## Display some of the first reviews acquired

In [None]:
# Glob patterns for the positive and negative training review files.
directories = ["/Users/cody/Downloads/aclImdb/train/pos/*.txt", "/Users/cody/Downloads/aclImdb/train/neg/*.txt"]
# Upper bound on the number of files read for each pattern above.
samples = 10000
# Build the labelled training frame ('text' and 'sentiment' columns).
train_reviews = create_data_frame(directories, samples)

# Show the first ten rows as a quick sanity check of the loaded data.
print(train_reviews.head(10))

## Display some statistics about the data we are attempting to classify

In [None]:
# Summary statistics of the training frame, computed separately for each
# sentiment label.
train_reviews.groupby('sentiment').describe()

## Build a classification pipeline using an SVM 
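This step removes English stop words and converts the words of each review into a matrix of token counts. The counts are then converted into a "tf-idf" (term frequency times inverse document frequency) representation. Finally, the tf-idf matrix is passed to a linear classifier trained with stochastic gradient descent, first to train it and later to classify new reviews. Note that the classifier below uses the `modified_huber` loss, a smoothed variant of the hinge loss; a strict linear support vector machine would use `loss='hinge'`.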

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Text classification pipeline:
#   vect  - token counts with English stop words removed
#   tfidf - re-weights the counts as tf-idf
#   clf   - linear model trained with SGD (modified Huber loss, L2 penalty)
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    # `n_iter` was removed from SGDClassifier in scikit-learn 0.21.
    # `max_iter=10` with `tol=None` disables early stopping, so training
    # still runs exactly 10 passes over the data.
    ('clf', SGDClassifier(loss='modified_huber', penalty='l2', alpha=1e-3,
                          max_iter=10, tol=None, random_state=42)),
])

## Train the classifier

In [None]:
# Fit the whole pipeline on the training texts and their sentiment labels.
# The IPython `%time` magic reports how long the fit takes.
%time text_clf = text_clf.fit(train_reviews['text'].values, train_reviews['sentiment'].values)

In [None]:
# Glob patterns for the positive and negative test review files.
directories = ["/Users/cody/Downloads/aclImdb/test/pos/*.txt", "/Users/cody/Downloads/aclImdb/test/neg/*.txt"]
# Upper bound on the number of files read for each pattern above.
samples = 12500
# Build the labelled test frame ('text' and 'sentiment' columns).
test_reviews = create_data_frame(directories, samples)

# Show the first ten rows as a quick sanity check of the loaded data.
print(test_reviews.head(10))

## Check how well we performed

In [None]:
# Predict a sentiment label for every test review with the fitted pipeline.
predicted = text_clf.predict(test_reviews['text'].values)
# Accuracy: the fraction of predictions equal to the true labels.
print("Score is: {}".format(np.mean(predicted == test_reviews['sentiment'].values)))
# Dump the raw predicted labels for inspection.
print(predicted)

## Display confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
cm = confusion_matrix(test_reviews['sentiment'].values, predicted)
%matplotlib inline
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm, interpolation='nearest')
fig.colorbar(cax)
values = ['pos', 'neg']
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
ax.set_xticklabels(['']+values)
ax.set_yticklabels(['']+values)
plt.show()