# Introduction

The goal from this assignment is to make 5 classifier versions and see which performs best. The time it takes to prepare each model, the time it takes to run it, and the accuracy all count towards the final decision.

# Model 1 

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy
from sklearn.decomposition import PCA
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler

import seaborn as sns

amazon = pd.read_csv(r'amazon_cells_labelled.txt', delimiter= '\t', header=None)
amazon.columns = ['text', 'sentiment']

# load positive & negative word corpora
positive, negative = [open(file,'r').read() for file in ['positive-words.txt','negative-words.txt']]

# remove description and make it a list
positive, negative = [corpus[corpus.rfind(';')+1:] for corpus in [positive, negative]]
positive, negative = [corpus.split('\n') for corpus in [positive, negative]]

# Remove whitespace characters
for corpus in [positive, negative]:
    for i in range(corpus.count('')):
        corpus.remove('')    

# add a couple of positive words    
positive.append('cool')
positive.append('decent')
    
# load list of stop words
with open('stop_words.txt','r') as file:
    stop = file.read()
stop = stop.split('\n')

# remove punctuation
def remove_punctuation(word):
    """ removes punctuation from a word"""
    
    punctuation = ''.join(['.',',',';',':','-','?','!','*'])
    TRANSDICT = str.maketrans(punctuation,' '*len(punctuation))
    return word.translate(TRANSDICT).strip().replace(' ','').replace('* ','')


def percent_positive(review):
    """ Tokenizes each sentence, checks for membership in positive words,
        makes sure positive words are not preceded by 'not'
    """
    
    # tokenize a sentence and remove punctuation
    tokenized = review.lower().split(' ')
    tokenized = [remove_punctuation(word) for word in tokenized]
    pcnt = 0
    
    # check for membership in poitive words list, making sure 'not' doesn't precede
    for word in tokenized:
        if tokenized.index(word) == 0 and (word in positive):
            pcnt += 1/len(tokenized)
        elif tokenized.index(word) == 1:
            if word in positive and (tokenized[tokenized.index(word)-1] != 'not'):
                pcnt += 1/len(tokenized)
        elif tokenized.index(word) > 1:
            if word in positive and (tokenized[tokenized.index(word)-1] != 'not') and (tokenized[tokenized.index(word)-2] != 'not'):
                pcnt += 1/len(tokenized)
    return pcnt




def percent_negative(review):
    """ Tokenizes each sentence, checks for membership in positive words,
        makes sure positive words are not perceded by 'not'
    """
    
    # tokenize a sentence and remove punctuation
    tokenized = review.lower().split(' ')
    tokenized = [remove_punctuation(word) for word in tokenized]
    pcnt = 0
    
    # check for membership in negative words list, making sure 'not' doesn't precede
    for word in tokenized:
        if tokenized.index(word) == 0 and word in negative:
            pcnt += 1/len(tokenized)
        elif tokenized.index(word) == 1:
            if word in negative and (tokenized[tokenized.index(word)-1] != 'not'):
                pcnt += 1/len(tokenized)
        elif tokenized.index(word) > 1:
            if word in negative and (tokenized[tokenized.index(word)-1] != 'not') and (tokenized[tokenized.index(word)-2] != 'not'):
                pcnt += 1/len(tokenized)
    return pcnt

# Apply percent_positive to the text column in our dataframe
amazon['positive'] = amazon['text'].apply(percent_positive)

# Apply percent_negative to the text column in our dataframe
amazon['negative'] = amazon['text'].apply(percent_negative)

# Load the Yelp dataset
yelp = pd.read_csv(r'yelp_labelled.txt', delimiter= '\t', header=None)
yelp.columns = ['text', 'sentiment']
yelp.head()

# Apply percent_positive to the text column in our dataframe
yelp['positive'] = yelp['text'].apply(percent_positive)

# Apply percent_negative to the text column in our dataframe
yelp['negative'] = yelp['text'].apply(percent_negative)

yelp.head()

# Initialize a model object
classifier = BernoulliNB()

# Fit our model to the data.
classifier.fit(amazon[['positive','negative']], amazon['sentiment'])

# Classify, storing the result in a new variable.
y_pred = classifier.predict(yelp[['positive','negative']])

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    yelp.shape[0],
    (yelp['sentiment'] != y_pred).sum()
))




Number of mislabeled points out of a total 1000 points : 185


# Model 2

In [2]:
amazon.head()

Unnamed: 0,text,sentiment,positive,negative
0,So there is no way for me to plug it in here i...,0,0.0,0.0
1,"Good case, Excellent value.",1,0.5,0.0
2,Great for the jawbone.,1,0.25,0.0
3,Tied to charger for conversations lasting more...,0,0.0,0.090909
4,The mic is great.,1,0.25,0.0


In [3]:
yelp.head()

Unnamed: 0,text,sentiment,positive,negative
0,Wow... Loved this place.,1,0.5,0.0
1,Crust is not good.,0,0.0,0.0
2,Not tasty and the texture was just nasty.,0,0.0,0.125
3,Stopped by during the late May bank holiday of...,1,0.133333,0.0
4,The selection on the menu was great and so wer...,1,0.083333,0.0


In [4]:
amazon2 = amazon.copy()
yelp2 = yelp.copy()
keywords = positive+negative+stop

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    amazon2[str(key)] = amazon2.text.str.contains(str(key),case=False)
    yelp2[str(key)] = yelp2.text.str.contains(str(key),case=False)
    
# Initialize a model object
classifier = BernoulliNB()

# Fit our model to the data.
classifier.fit(amazon2[[i for i in amazon2.columns if i not in ['text','sentiment']]], amazon2['sentiment'])

# Classify, storing the result in a new variable.
y_pred = classifier.predict(amazon2[[i for i in amazon2.columns if i not in ['text','sentiment']]])

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    yelp.shape[0],
    (yelp['sentiment'] != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 488


It took quite some time to load about 7000 columns into a dataframe. although we have kept the positive and negative columns, the accuracy went down. Let's take a look at the features more closely

# Model 3

In [5]:
# Get rid of categorical columns
amazon2_pca = amazon2[[i for i in amazon2.columns if i not in ['text','sentiment']]]

# Convert bool to binary numerical
def mapVals(val):
    if str(val)=='True':
        return 1
    return 0


for col in amazon2_pca.columns:
    amazon2_pca[col] = amazon2_pca[col].apply(mapVals)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [6]:
# PCA 
X = StandardScaler().fit_transform(amazon2_pca)
sklearn_pca = PCA(n_components=10,svd_solver='full')
Y_sklearn = sklearn_pca.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [7]:
pca_df = pd.DataFrame({'target':amazon['sentiment']})

for i in range(1,11):
    pca_df['pca{}'.format(i)] = Y_sklearn[:,i-1]

In [8]:
pca_df.corr().sort_values(by=['target'],ascending=False)

Unnamed: 0,target,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10
target,1.0,-0.08106893,-0.0238546,0.03199387,0.04827053,-0.06172985,0.03772506,0.05922058,0.05508795,-0.03228601,-0.02110798
pca7,0.059221,-6.409783e-17,-5.2153850000000004e-17,2.614631e-16,-1.139876e-16,-7.493917e-17,-2.038653e-16,1.0,4.256411e-16,4.258721e-17,-2.440636e-16
pca8,0.055088,3.397882e-16,-2.047047e-16,5.916474e-16,4.503321e-17,-4.838666e-16,1.208278e-16,4.256411e-16,1.0,-1.508837e-16,-1.5118940000000003e-17
pca4,0.048271,5.791732e-18,-1.933894e-17,-2.054222e-16,1.0,5.2714070000000004e-17,-8.623413000000001e-17,-1.139876e-16,4.503321e-17,3.984172e-17,-9.987099e-18
pca6,0.037725,-2.235223e-17,2.5750440000000003e-17,-4.742253e-17,-8.623413000000001e-17,3.648804e-16,1.0,-2.038653e-16,1.208278e-16,-1.069905e-16,2.125987e-18
pca3,0.031994,7.770912000000001e-17,3.2786890000000004e-17,1.0,-2.054222e-16,-1.460058e-17,-4.742253e-17,2.614631e-16,5.916474e-16,1.045243e-16,3.087695e-17
pca10,-0.021108,5.856695e-17,-2.2258980000000002e-17,3.087695e-17,-9.987099e-18,-3.963376e-16,2.125987e-18,-2.440636e-16,-1.5118940000000003e-17,-3.505914e-16,1.0
pca2,-0.023855,-1.907503e-16,1.0,3.2786890000000004e-17,-1.933894e-17,3.82622e-17,2.5750440000000003e-17,-5.2153850000000004e-17,-2.047047e-16,-2.1029250000000002e-17,-2.2258980000000002e-17
pca9,-0.032286,7.956466e-17,-2.1029250000000002e-17,1.045243e-16,3.984172e-17,2.204879e-17,-1.069905e-16,4.258721e-17,-1.508837e-16,1.0,-3.505914e-16
pca5,-0.06173,-3.171678e-17,3.82622e-17,-1.460058e-17,5.2714070000000004e-17,1.0,3.648804e-16,-7.493917e-17,-4.838666e-16,2.204879e-17,-3.963376e-16


In [9]:
#repeat classification with new df

# Fit our model to the data.
classifier.fit(pca_df[[i for i in pca_df.columns if i != 'target']], pca_df['target'])

# Classify, storing the result in a new variable.
y_pred = classifier.predict(pca_df[[i for i in pca_df.columns if i != 'target']])

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    yelp.shape[0],
    (yelp['sentiment'] != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 496


# Model 4

That did not help. Would it if we limit PCA components to 3?

In [10]:
# # PCA 
X = StandardScaler().fit_transform(amazon2_pca)
sklearn_pca = PCA(n_components=2,svd_solver='full')
Y_sklearn = sklearn_pca.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [11]:
pca_df = pd.DataFrame({'target':amazon['sentiment']})

for i in range(1,3):
    pca_df['pca{}'.format(i)] = Y_sklearn[:,i-1]
    
pca_df.head()

Unnamed: 0,target,pca1,pca2
0,0,2.377261,-1.61957
1,1,-3.279472,1.721639
2,1,-1.204156,0.817193
3,0,1.297146,-1.067836
4,1,-1.41559,0.65902


In [12]:
#repeat classification with new df

# Fit our model to the data.
classifier.fit(pca_df[[i for i in pca_df.columns if i != 'target']], pca_df['target'])

# Classify, storing the result in a new variable.
y_pred = classifier.predict(pca_df[[i for i in pca_df.columns if i != 'target']])

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    yelp.shape[0],
    (yelp['sentiment'] != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 533


It seems like the model is becoming worse as proceed in feature engineering. This may be due to the original features (word columns) not being fit as features for prediction, especially with their large number. What if we create a simple positivity score calculated from words in a sentence being a positive/negative word list?

In [18]:
def cal_perc(review):
    perc = 0
    for word in review.split(' '):
        if word in positive:
            perc += 1/len(review.split(' '))
        elif word in negative:
            perc -= 1/len(review.split(' '))
    return perc

# remove the original positive and negative columns
amazon_5 = amazon[['text','sentiment']]
yelp_5 = yelp[['text','sentiment']]

# Calculate percentage of word membership in pos & neg lists
amazon_5['perc'] = amazon_5['text'].apply(cal_perc)
yelp_5['perc'] = yelp_5['text'].apply(cal_perc)
yelp_5.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,text,sentiment,perc
0,Wow... Loved this place.,1,0.0
1,Crust is not good.,0,0.0
2,Not tasty and the texture was just nasty.,0,0.0
3,Stopped by during the late May bank holiday of...,1,0.133333
4,The selection on the menu was great and so wer...,1,0.083333


In [22]:
# Initialize a model object
classifier = BernoulliNB()

# Fit our model to the data.
classifier.fit(amazon_5[['perc']], amazon_5['sentiment'])

# Classify, storing the result in a new variable.
y_pred = classifier.predict(yelp_5[['perc']])

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    yelp.shape[0],
    (yelp['sentiment'] != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 321


Do any of your classifiers seem to overfit?

Yes, I did not implement the function to count for words like "not" and "not so" counting before a positive and negative word. The models had learned from each of the 7000 words being in columns that the word's existence in the sentence means positive or negative, which is not true.

Which seem to perform the best? Why?

The final model had higher accuracy with only 1 feature compared to models with thousands of features. This may be due to the other models learning from the noise caused by unnecessary data. I believe the last model encompasses the negativity and positivity of a sentence better than the others, having understood the real sentiment better than having many columns do the same job.


Which features seemed to be most impactful to performance?

They were the features where positivity and negativity was calculated or *engineered* to show one value that describes these measurements.