## Import Dependencies

In [123]:
! pip install pandas
! pip install sklearn

import pandas as pd
from sklearn.model_selection import train_test_split



In [124]:
import pandas as pd
import numpy as np
import re
from stop_words import stop_words
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

## Get Vectorized Data

In [125]:
vectorized = pd.read_csv('vectorized.csv')

## Split the data

In [126]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    vectorized, vectorized['_Freshness'], test_size=0.3, random_state=42)

print('x_test')
print(x_test)


x_test
     volum  portray  sabotag  shoddi  clichridden  chang  bloat  compass  \
714      0        0        0       0            0      0      0        0   
605      0        0        0       0            0      0      0        0   
120      0        0        0       0            0      0      0        0   
208      0        0        0       0            0      0      0        0   
380      0        0        0       0            0      0      0        0   
..     ...      ...      ...     ...          ...    ...    ...      ...   
485      0        0        0       0            0      0      0        0   
405      0        0        0       0            0      0      0        0   
239      0        0        0       0            0      0      0        0   
135      0        0        0       0            0      0      0        0   
164      0        0        0       0            0      0      0        0   

     throwback  desol  ...  uncommon  turkish  ralston  clumsi  tight  \
714    

## Multinomial Naive Bayes Classifier

In [127]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB().fit(x_train, y_train)
predicted= clf.predict(x_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

print('Confusion Matrix', metrics.confusion_matrix(y_test, predicted))


MultinomialNB Accuracy: 0.8875968992248062
Confusion Matrix [[ 90  29]
 [  0 139]]


## Export Model

In [165]:
from sklearn.externals import joblib

joblib.dump(clf, 'clf_model.pkl')




['clf_model.pkl']

## Test model with individual input

### Cleaner functions for the input

In [128]:
def lower_case(line): return line.lower().strip()

In [129]:
def stem_words(line):
    ps = PorterStemmer()

    words = line.split()
    
    return_list = [ps.stem(word.strip()) for word in words]

    return ' '.join(return_list)

In [130]:
def remove_stop_words(line):

    words = line.split()
    
    kept_words = [word for word in words if word not in stop_words]

    return ' '.join(kept_words)

In [131]:
def remove_special_characters_and_numbers(line):
    return re.sub(r'([^a-zA-Z\s]+?)', '', line)

In [132]:
def get_irrelevant_words():
    
    # irrelevant words list was generated in a separate script that evaluated whether words were either
    # very low occurrence, or were similiarly represented in both positive and negative reviews.
    irrelevant_words_file = open('irrelevant_words.txt')    
    lines = irrelevant_words_file.readlines()
    irrelevant_words_set = {word.strip() for word in lines}
    irrelevant_words_file.close()
    
    return irrelevant_words_set

irrelevant_words = get_irrelevant_words()

def remove_irrelevant_words(line):
    words = line.split()
    kept_words = [word for word in words if word not in irrelevant_words]
    return ' '.join(kept_words)

In [147]:
def get_words_set(df): 
    df.dropna()
    word_set = set()
    for index, row in df.iterrows():
        try:
            review_words = row['Review'].split()
        except:
            continue
        for word in review_words:
            word = word.strip()
            if word not in irrelevant_words:
                word_set.add(word)
                
    return word_set

rootcleaned = pd.read_csv('cleaned.csv')
wordset = get_words_set(rootcleaned)
wordset.add('_Freshness') 

def create_row_dict(index, row, word_set):
    
    if index % 10000 == 0:
        print('processing index ', index, '.')
    
    try:
        row_words = set(row['Review'].split())
    except:
        row_words = set()
    
    return_dict = {header: (0, 1)[header in row_words] for header in word_set}
    #return_dict['_Freshness'] = row['Freshness']
    return return_dict


def vectorize(df):
           
    dict_list = [create_row_dict(index, row, wordset) for index, row in df.iterrows()]

    return_df = pd.DataFrame(dict_list)

    print(return_df.head())
    return return_df

In [143]:
def clean_data(df):
    df['Review'] = df['Review'].apply(lower_case)
    #print('Finished, lower_case: ')
    #get_time()
    df['Review'] = df['Review'].apply(remove_stop_words)
    #print('Finished, remove_stop_words: ')
    #get_time()
    df['Review'] = df['Review'].apply(remove_special_characters_and_numbers)
    #print('Finished, remove_special_characters_and_numbers: ')
    #get_time()
    df['Review'] = df['Review'].apply(stem_words)
    #print('Finished, stem_words: ')
    #get_time()
    
    df['Review'] = df['Review'].apply(remove_irrelevant_words)
    #print('Finished, remove_irrelevant_words: ')
    #get_time()
    
    df['Review'].replace('', np.nan, inplace=True)
    df.dropna(subset=['Review'], inplace=True)
    return df

    #df.to_csv('cleaned.csv', index=False)
    #print('Finished, cleaned to csv: ')
    #get_time()
    
#raw_df = pd.read_csv('truncated.csv')
# raw_df = pd.read_csv('rotten_tomatoes_reviews.csv')

#clean_data(raw_df)

In [135]:
def prepare_input_text(in_df):
    # This method will get the data in the correct format for testing the model
    cd = clean_data(in_df)
    vectorized = vectorize(cd)
    return vectorized


In [161]:
dfTest = pd.DataFrame()

dfTest = dfTest.append({"Review":"fail"}, ignore_index=True)
##stringTestNegative = "This is a test string to see how terrible our model does when guessing a negative outcome"

#### Clean the data before passing it into the model.  This will remove stop words, take out stubs and put the text into numerical values for the test

In [162]:
scrubbedInputTest = prepare_input_text(dfTest)

word_set size:  1118
processing index  0 .
   bittersweet  centuri  unnecessari  invent  exhilar  chadwick  limp  \
0            0        0            0       0        0         0     0   

   blackandwhit  tens  savour  ...  explor  utmost  loneli  meticul  classic  \
0             0     0       0  ...       0       0       0        0        0   

   lurch  frothi  stir  mirthless  societi  
0      0       0     0          0        0  

[1 rows x 1118 columns]


In [160]:
scrubbedInputTest.to_csv('scrubbedinput.csv')

In [163]:
result = clf.predict(scrubbedInputTest)

In [164]:
result[0]

0