In [1]:
import pandas as pd
import numpy as np
import numpy.random
from numpy.linalg import norm
import math
import time

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from itertools import chain
import string

import re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import gensim.downloader as api
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.initializers import Constant
from matplotlib import pyplot as pl

from IPython.core.debugger import set_trace

# Fetch the NLTK data packages this notebook relies on. They are presumably the
# resources behind word_tokenize ('punkt'), stopwords.words ('stopwords') and
# WordNetLemmatizer ('wordnet'), all of which are used by the cleaning cell below.
# Packages that are already installed are reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/averyvine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/averyvine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/averyvine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Characters deleted from every review before tokenizing: ASCII punctuation plus
# curly quotes, dashes, the ellipsis character, the digits 0-9 and newlines.
punctuation = string.punctuation + "“”‘’—–…0123456789\n"
# English stop words, stored as a set because membership is tested once per token.
stop_words = set(stopwords.words('english'))
# Stemmer instance; only the (currently disabled) stemming alternative would use it.
porter = PorterStemmer()
# Lemmatizer applied to every surviving token when cleaning for the non-RNN models.
lemmatizer = WordNetLemmatizer()

def __clean_review(review, rnn):
    """Normalise one raw review into a space-separated string of tokens.

    Steps, in order: delete every character listed in the module-level
    ``punctuation`` string, tokenize with ``word_tokenize``, lower-case, keep only
    purely alphabetic tokens that are not in ``stop_words`` and, unless ``rnn`` is
    true, lemmatize each remaining token.

    Parameters
    ----------
    review : str
        Raw review text.
    rnn : bool
        True when the text feeds the recurrent model; lemmatization is skipped.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Characters are deleted (not replaced by a space), so text on either side of a
    # removed character is joined together.
    stripped = review.translate(str.maketrans('', '', punctuation))

    kept = []
    for token in word_tokenize(stripped):
        token = token.lower()
        # Drop stop words and anything that is not purely alphabetic.
        if token.isalpha() and token not in stop_words:
            kept.append(token)

    if not rnn:
        # Lemmatization is the active normaliser; to stem instead, map
        # ``porter.stem`` over the tokens here and drop the lemmatizer call.
        kept = [lemmatizer.lemmatize(token) for token in kept]

    return " ".join(kept)



def clean_review_rnn(review):
    """Clean ``review`` for the recurrent model (delegates with ``rnn=True``)."""
    cleaned = __clean_review(review, True)
    return cleaned

def clean_review(review):
    """Clean ``review`` for the bag-of-words models (delegates with ``rnn=False``)."""
    cleaned = __clean_review(review, False)
    return cleaned

In [3]:
numberOfRows = 1000

def getInitTrainDataFrame(filename, rnn=False, maxRows=None):
    """Load the labelled training CSV and return cleaned reviews with 0/1 labels.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pd.read_csv``; the data must provide ``review`` and
        ``sentiment`` columns.
    rnn : bool, default False
        Selects the cleaning function: ``clean_review_rnn`` when true, otherwise
        ``clean_review``.
    maxRows : int or None, default None
        Read at most this many rows (e.g. ``maxRows=numberOfRows`` for a quick
        run). ``None`` reads the whole file.

    Returns
    -------
    tuple
        ``(reviews, labels)`` as two arrays: the cleaned review strings and the
        labels, where ``"negative"`` maps to 0 and every other value to 1.
    """
    df = pd.read_csv(filename, nrows=maxRows)

    # Pick the cleaner that matches the model the text is being prepared for.
    cleaner = clean_review_rnn if rnn else clean_review

    print("Cleaning training review data...")
    df["review"] = df["review"].apply(cleaner)
    df["sentiment"] = df["sentiment"].apply(lambda sentiment: 0 if sentiment == "negative" else 1)
    print(df)
    return (df["review"].values, df["sentiment"].values)

In [4]:
numberOfRows = 1000

def getInitTestDataFrame(filename, rnn=False, maxRows=None):
    """Load the unlabelled test CSV and return its cleaned reviews.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pd.read_csv``; the data must provide a ``review``
        column.
    rnn : bool, default False
        Selects the cleaning function: ``clean_review_rnn`` when true, otherwise
        ``clean_review``.
    maxRows : int or None, default None
        Read at most this many rows (e.g. ``maxRows=numberOfRows`` for a quick
        run). ``None`` reads the whole file.

    Returns
    -------
    numpy.ndarray
        The cleaned review strings, in file order.
    """
    df = pd.read_csv(filename, nrows=maxRows)

    # Pick the cleaner that matches the model the text is being prepared for.
    cleaner = clean_review_rnn if rnn else clean_review

    print("Cleaning test review data...")
    df["review"] = df["review"].apply(cleaner)
    print(df)
    return df["review"].values

In [5]:
def getTrainMatrix(reviewArr):
    """Fit a binary bag-of-words vectorizer on the training reviews.

    Parameters
    ----------
    reviewArr : iterable of str
        Cleaned training reviews.

    Returns
    -------
    tuple
        ``(X, featureArr)``: the matrix returned by ``fit_transform`` (callers
        densify it with ``.toarray()``) and the list of vocabulary terms, one per
        column of ``X``. At most 5000 unigram features are kept and each cell
        records presence (1) or absence (0).
    """
    vectorizer = CountVectorizer(binary=True, max_features=5000, ngram_range=(1,1))
    X = vectorizer.fit_transform(reviewArr)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer its replacement and fall back for older installations; either way a
    # plain list of strings is returned.
    if hasattr(vectorizer, "get_feature_names_out"):
        featureArr = vectorizer.get_feature_names_out().tolist()
    else:
        featureArr = vectorizer.get_feature_names()
    return X, featureArr

In [6]:
def getTestMatrix(reviewArr, featureArr):
    """Encode reviews as binary bag-of-words rows over a fixed vocabulary.

    Parameters
    ----------
    reviewArr : iterable of str
        Cleaned reviews; each one is split on whitespace.
    featureArr : sequence of str
        Vocabulary terms. Repeated terms share a single column, placed where the
        term first appears.

    Returns
    -------
    list of list of int
        One row per review with a 1 in the column of every vocabulary term the
        review contains and 0 elsewhere.
    """
    # dict.fromkeys removes repeats while keeping first-seen order, which fixes the
    # column layout once for all reviews.
    column_of = {term: col for col, term in enumerate(dict.fromkeys(featureArr))}
    width = len(column_of)

    def encode(review):
        row = [0] * width
        for token in review.split():
            col = column_of.get(token)
            if col is not None:
                row[col] = 1
        return row

    return [encode(review) for review in reviewArr]

In [7]:
class NaiveBayes:
    """Bernoulli naive Bayes classifier for 0/1 features and 0/1 labels.

    ``fit`` estimates, with add-one (Laplace) smoothing, the probability that each
    feature is present given the class, plus the log prior odds of class 1.
    ``predict`` labels a row 1 when its log-odds score is non-negative.
    """

    # parameters needed for prediction, set in fit()
    _theta_j_0 = None   # P(feature j = 1 | y = 0), one entry per feature
    _theta_j_1 = None   # P(feature j = 1 | y = 1), one entry per feature
    _log_theta = None   # log(P(y = 1) / P(y = 0))


    def fit(self, X, y):
        """Estimate the model parameters from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense 0/1 feature matrix.
        y : array-like of length n_samples
            Labels; rows with label 1 form the positive class and rows with label
            0 the negative class.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, has a different number of rows than
            ``y``, or the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2 or X.shape[0] != y.shape[0]:
            raise ValueError("X must be 2-D with one row per label in y")
        if y.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        positive = (y == 1)
        negative = (y == 0)
        n_positive = int(np.count_nonzero(positive))
        n_negative = int(np.count_nonzero(negative))

        theta_1 = n_positive / float(len(y))
        theta_0 = 1 - theta_1
        # If one class is absent the prior odds become +/-inf, so predict() always
        # returns the class that was seen instead of failing inside the logarithm.
        with np.errstate(divide="ignore"):
            self._log_theta = float(np.log(theta_1) - np.log(theta_0))

        # Laplace smoothing: (rows of the class containing the feature + 1) divided
        # by (rows of the class + 2). The denominator must be the class *count*;
        # summing np.where(...) would add up row indices instead.
        self._theta_j_0 = (X[negative].sum(axis=0) + 1) / float(n_negative + 2)
        self._theta_j_1 = (X[positive].sum(axis=0) + 1) / float(n_positive + 2)


    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense 0/1 feature matrix with the same columns used in ``fit``.

        Returns
        -------
        numpy.ndarray
            Float array of length n_samples holding 1.0 where the log-odds score
            is >= 0 and 0.0 otherwise.
        """
        X = np.asarray(X)

        # Per-feature log-likelihood ratios for "feature present" and "feature absent".
        log_theta_j = np.log(self._theta_j_1 / self._theta_j_0)
        log_negative_theta_j = np.log((1 - self._theta_j_1) / (1 - self._theta_j_0))
        scores = self._log_theta + (np.matmul(X, log_theta_j) + np.matmul((1 - X), log_negative_theta_j))

        return np.where(scores >= 0, 1.0, 0.0)

In [8]:
# Input: predicted labels, true labels
# Output: accuracy score
def accuEval(y_predict, y_true):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to arrays and flattened first, so plain Python
    lists and column-shaped ``(n, 1)`` predictions are compared element by
    element instead of as a single object or through broadcasting.

    Parameters
    ----------
    y_predict : array-like
        Predicted labels.
    y_true : array-like
        True labels, with the same number of elements as ``y_predict``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels; ``nan``
        when both inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of labels.
    """
    predicted = np.asarray(y_predict).ravel()
    actual = np.asarray(y_true).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            "y_predict and y_true must hold the same number of labels "
            "(got %d and %d)" % (predicted.size, actual.size)
        )

    totalRows = predicted.size
    if totalRows == 0:
        return float("nan")

    totalCorrect = np.sum(predicted == actual)
    return totalCorrect / totalRows

In [9]:
# Download GloVe twitter set (separate cell for convenience)
# Loads the "glove-twitter-100" vectors through gensim's downloader API and binds
# them to the module-level name `model`, which main_lstm() reads to build its
# embedding matrix. Presumably 100-dimensional, matching EMBEDDING_DIM there —
# TODO confirm.
model = api.load("glove-twitter-100")

In [12]:
def main_lstm():
    """Train an LSTM sentiment classifier on GloVe embeddings and write predictions.

    Reads ``train.csv`` and ``test.csv`` through the notebook's loaders with
    ``rnn=True``, pads the tokenized reviews to the longest training review,
    trains a frozen-embedding LSTM with a sigmoid output, and writes the predicted
    ``"negative"``/``"positive"`` labels for the test set to ``submission.csv``
    (index column named ``id``). Relies on the module-level ``model`` holding the
    pre-trained word vectors.
    """
    trainFile = "train.csv"
    testFile = "test.csv"

    print("Getting initial training and rating arrays...")
    reviewsTrain, ratingsTrain = getInitTrainDataFrame(trainFile, rnn=True)

    print("Getting initial test array...")
    reviewsTest = getInitTestDataFrame(testFile, rnn=True)

    # Every sequence is padded/truncated to the token count of the longest
    # training review.
    max_length = max(len(s.split()) for s in reviewsTrain)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(reviewsTrain)
    sequences = tokenizer.texts_to_sequences(reviewsTrain)
    word_index = tokenizer.word_index
    reviews_pad = pad_sequences(sequences, maxlen=max_length)

    # Shuffle reviews and labels with one shared permutation so they stay aligned.
    indices = np.arange(reviews_pad.shape[0])
    np.random.shuffle(indices)
    reviews_pad = reviews_pad[indices]
    ratingsTrain = ratingsTrain[indices]

    EMBEDDING_DIM = 100
    BATCH_SIZE = 32
    NUM_EPOCHS = 50

    # Row i of the matrix holds the pre-trained vector of the word whose tokenizer
    # index is i; words missing from the pre-trained vocabulary keep an all-zero row.
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        # `word in model` works on gensim 3 and 4 alike; the `.vocab` attribute
        # was removed from KeyedVectors in gensim 4.
        if word in model:
            embedding_matrix[i] = model[word]

    keras_model = Sequential()
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)
    keras_model.add(embedding_layer)
    keras_model.add(LSTM(EMBEDDING_DIM, dropout=0.2, recurrent_dropout=0.2))
    keras_model.add(Dense(1, activation='sigmoid'))
    keras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    keras_model.fit(reviews_pad, ratingsTrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

    # Test reviews reuse the tokenizer fitted on the training reviews.
    sequences_test = tokenizer.texts_to_sequences(reviewsTest)
    reviews_test_pad = pad_sequences(sequences_test, maxlen=max_length)

    predictions = keras_model.predict(reviews_test_pad, batch_size=BATCH_SIZE)
    # Threshold the sigmoid outputs at 0.5 and map straight to the label strings
    # (anything that is not below 0.5 counts as positive).
    labels = np.where(np.asarray(predictions).ravel() < 0.5, "negative", "positive")
    predict_df = pd.DataFrame({"sentiment": labels})
    predict_df.index.names = ['id']
    predict_df.to_csv("submission.csv")
    print("Done!")

In [252]:
def main():
    """Run the bag-of-words experiment: PCA summary plus 5-fold cross validation.

    Loads and cleans ``train.csv`` / ``test.csv``, builds binary bag-of-words
    matrices, prints the explained variance of a 50-component PCA fitted on the
    full training matrix, then reports accuracy and runtime of a
    PCA(50) -> LogisticRegression pipeline on each of five shuffled folds.
    Nothing is returned and no file is written.
    """
    train_path = "train.csv"
    test_path = "test.csv"

    print("Getting initial training and rating arrays...")
    train_reviews, train_ratings = getInitTrainDataFrame(train_path)

    print("Getting training matrix and feature array...")
    train_counts, vocabulary = getTrainMatrix(train_reviews)

    print("Getting initial test array...")
    test_reviews = getInitTestDataFrame(test_path)

    print("Getting test matrix...")
    test_rows = getTestMatrix(test_reviews, vocabulary)

    X_train = np.asarray(train_counts.toarray())
    y_train = np.asarray(train_ratings, dtype=int)
    # X_test is prepared for the test-set prediction step, which is currently
    # disabled (see the note at the end of this function).
    X_test = np.asarray(test_rows)

    print("Running PCA on training data...")
    full_pca = PCA(n_components=50)
    full_pca.fit(X_train)
    X_transformed = full_pca.transform(X_train)  # projection is not used further
    print("PCA explained variance values:")
    print(full_pca.explained_variance_.round(2))

    print("Running k-fold cross validation...\n")
    folds = KFold(n_splits=5, shuffle=True)
    for fold_number, (fit_rows, holdout_rows) in enumerate(folds.split(X_train), start=1):
        print("\n")
        print("K-FOLD RUN %s" % fold_number)
        print("\n")
        X_fit, X_holdout = X_train[fit_rows], X_train[holdout_rows]
        y_fit, y_holdout = y_train[fit_rows], y_train[holdout_rows]

        # Alternatives previously timed on the same split, each fitted on the
        # training part and scored with accuEval on the held-out part:
        #   NaiveBayes()                      (this notebook's implementation)
        #   BernoulliNB()
        #   DecisionTreeClassifier(criterion='entropy', max_depth=15,
        #                          min_samples_split=20, min_samples_leaf=10)
        #   LinearDiscriminantAnalysis()
        #   QuadraticDiscriminantAnalysis()
        #   LogisticRegression()              (without PCA)

        fold_started = time.time()
        pipe = Pipeline([('pca', PCA(n_components=50)), ('logistic', LogisticRegression())])
        pipe.fit(X_fit, y_fit)
        holdout_predictions = pipe.predict(X_holdout)
        print("Accuracy for sklearn LR with PCA: ", accuEval(holdout_predictions, y_holdout))
        print("Runtime: %s seconds." % (time.time() - fold_started))
        print("\n\n\n")

    # Disabled final step: fit the chosen model (NaiveBayes, or LogisticRegression
    # optionally behind PCA(n_components=500)) on all of X_train, predict X_test,
    # map 0 -> "negative" / otherwise "positive", and write the result to
    # "submission.csv" with the index named 'id', then print "Done!".

Getting initial training and rating arrays...
Cleaning training review data...
                                                  review  sentiment
0      never watched movie style filming may consider...          0
1      robert lansing play scientist experimenting pa...          0
2      looking forward movie trustworthy actor intere...          0
3      start guy indian pot he cleaning suddenly skel...          0
4      happened basically solid plausible premise dec...          0
5      society heiress susan fletcher hopkins wealthy...          1
6      yesterday went alone cinema mexico time movie ...          1
7      first ever fully synchronized sound cartoon wa...          1
8      watched movie second time enjoyed much first t...          1
9      definitely outstanding musical great young sta...          1
10     one film consider film rendition improvement o...          1
11     transylvania insignificant occasionally funny ...          0
12     much like orson welles thirty 

Running PCA on training data...
PCA explained variance values:
[0.6  0.39 0.29 0.27 0.26 0.25 0.24 0.23 0.23 0.22 0.21 0.21 0.2  0.2
 0.2  0.19 0.19 0.18 0.18 0.18 0.18 0.17 0.17 0.17 0.17 0.17 0.17 0.16
 0.16 0.16 0.15 0.15 0.15 0.15 0.14 0.14 0.14 0.14 0.14 0.14 0.13 0.13
 0.13 0.13 0.13 0.13 0.13 0.13 0.12 0.12]
Running k-fold cross validation...



K-FOLD RUN 1






Accuracy for sklearn LR with PCA:  0.769
Runtime: 9.579733848571777 seconds.






K-FOLD RUN 2






Accuracy for sklearn LR with PCA:  0.7765
Runtime: 12.188390970230103 seconds.






K-FOLD RUN 3






Accuracy for sklearn LR with PCA:  0.7673333333333333
Runtime: 14.86371397972107 seconds.






K-FOLD RUN 4






Accuracy for sklearn LR with PCA:  0.7535
Runtime: 13.019276142120361 seconds.






K-FOLD RUN 5






Accuracy for sklearn LR with PCA:  0.7696666666666667
Runtime: 10.266211032867432 seconds.






In [253]:
# Entry point. main() runs the bag-of-words cross-validation experiment;
# main_lstm() trains the LSTM and writes submission.csv.
# Uncomment one or the other (or both, if you feel like waiting!)

main()
# main_lstm()

In [254]:
def hyperparamTuning(param_name="min_samples_leaf", param_values=None):
    """Plot train vs. held-out AUC of a decision tree while sweeping one hyper-parameter.

    The training CSV is loaded, vectorized and split 75/25. For every candidate
    value a ``DecisionTreeClassifier`` is fitted with only that hyper-parameter
    set, and the ROC AUC on both parts is plotted against the value.

    Parameters
    ----------
    param_name : str, default "min_samples_leaf"
        Name of the ``DecisionTreeClassifier`` keyword argument to sweep.
    param_values : iterable or None, default None
        Candidate values. ``None`` selects a built-in grid, available for
        ``"max_depth"`` (1..32), ``"min_samples_split"`` and
        ``"min_samples_leaf"`` (20 integers from 2 to 100).

    Raises
    ------
    ValueError
        If ``param_values`` is None and no built-in grid exists for ``param_name``.
    """
    from matplotlib.legend_handler import HandlerLine2D
    from sklearn.model_selection import train_test_split

    default_grids = {
        "max_depth": np.arange(1, 33),
        "min_samples_split": np.linspace(2, 100, 20, endpoint=True, dtype=int),
        "min_samples_leaf": np.linspace(2, 100, 20, endpoint=True, dtype=int),
    }
    if param_values is None:
        if param_name not in default_grids:
            raise ValueError(
                "no default grid for %r; pass param_values explicitly" % (param_name,)
            )
        param_values = default_grids[param_name]
    # Materialise once: the values are iterated for fitting and again for plotting.
    param_values = list(param_values)

    trainFile = "train.csv"

    print("Getting initial training and rating arrays...")
    reviewsTrain, ratingsTrain = getInitTrainDataFrame(trainFile)

    print("Getting training matrix and feature array...")
    X_train, _ = getTrainMatrix(reviewsTrain)

    X_train = np.asarray(X_train.toarray())
    y_train = np.asarray(ratingsTrain, dtype=int)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25)

    def auc_score(classifier, features, labels):
        # ROC AUC needs a continuous score. Hard 0/1 predictions collapse the curve
        # to a single operating point, so rank by the class-1 probability instead.
        scores = classifier.predict_proba(features)[:, 1]
        false_positive_rate, true_positive_rate, _ = roc_curve(labels, scores)
        return auc(false_positive_rate, true_positive_rate)

    train_results = []
    test_results = []
    for value in param_values:
        dt = DecisionTreeClassifier(**{param_name: value})
        dt.fit(X_train, y_train)
        train_results.append(auc_score(dt, X_train, y_train))
        test_results.append(auc_score(dt, X_test, y_test))

    line1, = plt.plot(param_values, train_results, 'b', label='Train AUC')
    plt.plot(param_values, test_results, 'r', label='Test AUC')

    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    plt.xlabel(param_name.replace("_", " "))
    plt.show()

In [255]:
# hyperparamTuning()