## Text Classification with NLTK 

- The objective of this example is to take a corpus of 'movie_reviews' from the nltk.corpus example datasets and predict whether the content in an out-of-sample review would be classified as "Negative" or "Positive".  

In [124]:
### Example
import nltk
#nltk.download('wordnet')

from nltk import word_tokenize,sent_tokenize
from nltk import PorterStemmer
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk import FreqDist

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn import metrics
from sklearn.cross_validation import KFold, cross_val_score

import pandas as pd
import matplotlib as plt
import time
import numpy as np
%matplotlib inline
#Jupyter Notebook Options
from IPython.display import display

In [125]:
import sys
print("System Version: {}\n".format(sys.version))
print(nltk.__file__) #print location of nltk packages

System Version: 3.5.2 |Anaconda 4.2.0 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]

/Users/tracesmith/anaconda/lib/python3.5/site-packages/nltk/__init__.py


#### Read in Reviews and Categories to Tuple and convert to Pandas DataFrame

In [126]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [127]:
df = pd.DataFrame(list(documents), columns=["Review","Label"])
df['Review'].str.lower()
display(df.head())

Unnamed: 0,Review,Label
0,"[plot, :, two, teen, couples, go, to, a, churc...",neg
1,"[the, happy, bastard, ', s, quick, movie, revi...",neg
2,"[it, is, movies, like, these, that, make, a, j...",neg
3,"["", quest, for, camelot, "", is, warner, bros, ...",neg
4,"[synopsis, :, a, mentally, unstable, man, unde...",neg


#### Check Value Count and Add Dummy Variable to Label

In [128]:
df.head()

Unnamed: 0,Review,Label
0,"[plot, :, two, teen, couples, go, to, a, churc...",neg
1,"[the, happy, bastard, ', s, quick, movie, revi...",neg
2,"[it, is, movies, like, these, that, make, a, j...",neg
3,"["", quest, for, camelot, "", is, warner, bros, ...",neg
4,"[synopsis, :, a, mentally, unstable, man, unde...",neg


In [129]:
#series method to count values
df['Label'].value_counts()

pos    1000
neg    1000
Name: Label, dtype: int64

In [130]:
#Replace Labels with dummy variables
df['Label'].replace(['pos','neg'],[1,0],inplace=True)

In [131]:
df['Review'] = df['Review'].astype(str)

#### Split into Testing and Training

In [132]:
def traintest_split(df):
    training,testing = train_test_split(df,test_size=0.15,random_state=1)
    X_train, y_train = training.iloc[:,0], training.iloc[:,1]
    X_test, y_test = testing.iloc[:,0], testing.iloc[:,1]
    return(X_train,y_train,X_test,y_test)

Note: In order to input into countvectorizer(), must use 1D series (both x and y). X should be 1D becasue it's going to be transformed to 2D by countvectorizer()

In [133]:
X_train,y_train,X_test,y_test = traintest_split(df)
print("Number of obserbations: {}".format(len(df['Review'])))
print("X_train size: {} -- y_train size: {}".format(X_train.shape[0],y_train.shape[0]))
print("X_test size: {} -- y_test size: {}".format(X_test.shape[0],y_test.shape[0]))

Number of obserbations: 2000
X_train size: 1700 -- y_train size: 1700
X_test size: 300 -- y_test size: 300


### Create Bag of Words:

- Create Document Term Matrix and train model on (i.e. this gives us "X")

In [137]:
def get_document_term_matrix_train_test(X_train,X_test):
    vect = CountVectorizer(stop_words='english',lowercase=True)
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    df_dtm = pd.DataFrame(X_train_dtm.toarray(),columns=vect.get_feature_names())
    #X_test is transformed to document term matrix 
    X_test_dtm = vect.transform(X_test)
    return(X_train_dtm,X_test_dtm,df_dtm)

df_dtm = get_document_term_matrix_train_test(X_train,X_test)[2]
print("Number of Vocabulary Words in Document Term Matrix: {}".format(df_dtm.shape[1]))

Number of Vocabulary Words in Document Term Matrix: 36971


Note: cannot reconstruct original "reviews" from the transformation. Only do a transform on the testing --> not fit on testing set. If word is not seen in model training then it is dropped in order to mantain the dimensions of the document term matrix

In [138]:
df_dtm.head() #print first 5 rows of DTM

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zuko,zukovsky,zulu,zundel,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Building and Evaluating Predictive Models

In [139]:
classifiers = [MultinomialNB(),LogisticRegression(),SVC(),LinearSVC()] 

In [140]:
def test_model(classifiers,df):
    
        start = time.time()
        
        #split data up into training and testing sets:
        X_train,y_train,X_test,y_test = traintest_split(df)

        #Generate Document Term Matrix & X_test is transformed to document term matrix 
        X_train_dtm, X_test_dtm, df_dtm = get_document_term_matrix_train_test(X_train,X_test)
        print("*****Training Set*****:")
        print("Vocabulary Words: {}".format(X_train_dtm.shape[1]))
        print("Total Documents: {}".format(X_train_dtm.shape[0]))
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

        print("*****Testing Set*****:")
        print("Vocabulary Words: {}".format(X_test_dtm.shape[1]))
        print("Total Documents: {}".format(X_test_dtm.shape[0]))

        clf.fit(X_train_dtm,y_train)

        #Make prediction on the testing set:
        y_pred_class = clf.predict(X_test_dtm)

        #Accuracy Score
        acc = metrics.accuracy_score(y_test,y_pred_class)*100
        print("Accuracy: {}".format(np.round(acc,3)))
            
        #confusion matrix
        #conf_matrix = metrics.confusion_matrix(y_test,y_pred_class)
        
        end = time.time()
        total_time = end-start
        m,s = divmod(total_time,60)
        h,m = divmod(m,60)
        time_format = "%d:%02d:%02d" % (h,m,s)
        print("Total Time: {}".format(time_format))
        return(clf_name,acc,time_format)

In [141]:
df_results = pd.DataFrame(columns=['Model','Accuracy','Run Time'])

for clf in classifiers:
    clf_name = clf.__class__.__name__
    print("==================================")
    print(clf_name)
    print("==================================")
    for i in range(0,1):
        print("\nIteration:{}".format(i+1))
        clf_name,acc,time_format = test_model(clf,df)
        df_results = df_results.append({'Model':clf_name,
                                        'Accuracy':acc,
                                        'Run Time':time_format},ignore_index=True)
    print("\n")
print("\n**Results**\n",df_results)
print()

print("-----------------------------")
print("Mean Results")
print("-----------------------------")

avg_acc = df_results['Accuracy'].groupby(df_results['Model']).mean()
print("Average Accuracy", avg_acc)


MultinomialNB

Iteration:1
*****Training Set*****:
Vocabulary Words: 36971
Total Documents: 1700
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*****Testing Set*****:
Vocabulary Words: 36971
Total Documents: 300
Accuracy: 77.333
Total Time: 0:00:02


LogisticRegression

Iteration:1
*****Training Set*****:
Vocabulary Words: 36971
Total Documents: 1700
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*****Testing Set*****:
Vocabulary Words: 36971
Total Documents: 300
Accuracy: 81.0
Total Time: 0:00:02


SVC

Iteration:1
*****Training Set*****:
Vocabulary Words: 36971
Total Documents: 1700
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*****Testing Set*****:
Vocabulary Words: 36971
Total Documents: 300
Accuracy: 52.333
Total Time: 0:00:10


LinearSVC

Iteration:1
*****Training Set*****:
Vocabulary Words: 36971
Total Documents: 1700
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*****Testing Set*****:
Vocabulary Words: 36971
Total Documents: 300
Accuracy: 80.333
Total Time: 0:00:02



**Results**
                 Model

### Naive Bayes

In [142]:
classifier = MultinomialNB()

In [181]:
def test_model_cross_val(classifiers,X_train,y_train,X_test,y_test):
    
        start = time.time()

        #Generate Document Term Matrix & X_test is transformed to document term matrix 
        X_train_dtm, X_test_dtm, df_dtm = get_document_term_matrix_train_test(X_train,X_test)
        
        clf.fit(X_train_dtm,y_train)

        #Make prediction on the testing set:
        y_pred_class = clf.predict(X_test_dtm)

        #Accuracy Score
        acc = metrics.accuracy_score(y_test,y_pred_class)*100
        print("Accuracy: {}".format(np.round(acc,3)))
                
        end = time.time()
        total_time = end-start
        m,s = divmod(total_time,60)
        h,m = divmod(m,60)
        time_format = "%d:%02d:%02d" % (h,m,s)
        print("Total Time: {}".format(time_format))
        
        #confusion matrix
        conf_matrix = metrics.confusion_matrix(y_test,y_pred_class)
        print("\nConfusion Matrix:",conf_matrix)
        print("\n")
        
        return(acc,time_format)

**K-Fold CrossValidaiton**

In [182]:
print("**************************************************")
print("Model: {} -- K-Fold Cross-Validation".format(classifier.__class__.__name__))
print("**************************************************\n")

x = df.iloc[:,0]
y = df.iloc[:,1]
fold = 0
accuracy = []

KF = KFold(len(df),3, random_state=1, shuffle=True)
for index, (train_index, test_index) in list(enumerate(KF,start=1)):
    print("Fold: {}".format(index))
    print("Train_Index:{}....".format(train_index[1:10]))
    print("Test_Index: {}....".format(test_index[1:10]))
    X_train = x.loc[train_index]; X_test= x.loc[test_index]
    y_train = y.loc[train_index];y_test = y.loc[test_index]
    acc,time_format = test_model_cross_val(classifiers,X_train,y_train,X_test,y_test)
    accuracy.append(acc)
    print("==========================================\n")
print("Mean Accuracy = {}%".format(round(np.mean(accuracy))))

**************************************************
Model: MultinomialNB -- K-Fold Cross-Validation
**************************************************

Fold: 1
Train_Index:[ 4  5  6  7  9 10 12 13 14]....
Test_Index: [ 1  3  8 11 16 17 19 22 30]....
Accuracy: 81.859
Total Time: 0:00:02

Confusion Matrix: [[270  59]
 [ 62 276]]



Fold: 2
Train_Index:[ 1  3  4  7  8 11 15 16 17]....
Test_Index: [ 5  6  9 10 12 13 14 23 26]....
Accuracy: 83.658
Total Time: 0:00:02

Confusion Matrix: [[291  48]
 [ 61 267]]



Fold: 3
Train_Index:[ 1  2  3  5  6  8  9 10 11]....
Test_Index: [ 7 15 18 20 21 24 25 29 36]....
Accuracy: 84.835
Total Time: 0:00:02

Confusion Matrix: [[287  45]
 [ 56 278]]



Mean Accuracy = 83.0%
