# Text Analytics Group Assignment 2

Nicole Erich, Vishwa Bhuta, Caroline Nelson, Erik Honore, Lindsay Tober

---
# Pre-work
---

### [Setup]

In [1]:
# Import Statements - Basic
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# Import Statements - nltk
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import collocations
from nltk import sentiment
from nltk.stem import WordNetLemmatizer
from nltk.chunk.regexp import *

# # Just in case
# nltk.download()

# Import Statements - sklearn
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Import Statements - PMI calculations
import re
import math
from math import log
from decimal import Decimal
from collections import defaultdict
from django.utils.encoding import smart_str, smart_unicode
import string

# Import Statements - Modeling & Other
from patsy import dmatrices
import scipy
from scipy import sparse
from scipy.sparse import coo_matrix, hstack
from collections import Counter
import os
import random
from operator import itemgetter
import csv
import functools32

# Import Statements - Warnings
import warnings
warnings.filterwarnings('ignore')

%pylab inline



Populating the interactive namespace from numpy and matplotlib


In [4]:
data=pd.read_csv('Yelp Data Restaurant Reviews Ratings.csv')

---
# Task A.

### Ignore the text (reviews) and run a classification model with the numeric data (you can use standard methods like logistic regression, k-nearest neighbors or anything else). What is the best accuracy of your model with numeric data?

---


To understand the data better, we first wanted to know if the different cuisine types are a dummy variable of "cuisine," or if a restaurant could be categorized into multiple.  Finding that the second option was often true, we allowed for each cuisine type to act as its own feature.  Similarly, we also confirmed that each review could only select one price range.  We confirmed that there were no null values to contend with, and created a new "Target" column that assigned a value of "high" or "low" to each review based on the star rating.

### [Data Validation]

In [5]:
data.columns.values

array(['stars', 'votes_cool', 'votes_funny', 'votes_useful', 'Cheap',
       'Moderate', 'Expensive', 'VeryExpensive', 'American', 'Chinese',
       'French', 'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean',
       'Mexican', 'Thai', 'Vietnamese', 'Others', 'Review'], dtype=object)

In [6]:
data['stars'].value_counts()

4    7395
5    6158
3    3353
2    1940
1    1153
Name: stars, dtype: int64

In [7]:
# Check number of cuisines listed per review
cuisine_list = list(data.iloc[:,8:20])
data['sumcuisine'] = data[cuisine_list].sum(axis=1)
checkcuisinesperreview = data['sumcuisine'].value_counts()
print 'Cuisine Validation'
print checkcuisinesperreview

Cuisine Validation
1    17307
2     2343
3      331
4       18
Name: sumcuisine, dtype: int64


In [8]:
# Check number of prices listed per review
price_list = list(data.iloc[:,4:8])
data['sumprice'] = data[price_list].sum(axis=1)
checkpriceperreview = data['sumprice'].value_counts()
print 'Price Validation'
print checkpriceperreview

Price Validation
1    19999
Name: sumprice, dtype: int64


In [9]:
# Check for null values
print 'Null Validation'
for i in range(0,len(data.columns.values)):
    print "Column", i,
    print sum(pd.isnull(data.ix[:,i]))

Null Validation
Column 0 0
Column 1 0
Column 2 0
Column 3 0
Column 4 0
Column 5 0
Column 6 0
Column 7 0
Column 8 0
Column 9 0
Column 10 0
Column 11 0
Column 12 0
Column 13 0
Column 14 0
Column 15 0
Column 16 0
Column 17 0
Column 18 0
Column 19 0
Column 20 0
Column 21 0
Column 22 0


In [10]:
# Set 'target' to 1 for 4* and 5* reviews and 0 for 3* and below
# Validate results
data['target'] = 0.0
data['target'][data['stars'] > 3] = 1.0
print data['target'].value_counts()
check_target = pd.crosstab(data['stars'],data['target'])
check_target

1.0    13553
0.0     6446
Name: target, dtype: int64


target,0.0,1.0
stars,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1153,0
2,1940,0
3,3353,0
4,0,7395
5,0,6158


### [Model]

In [11]:
# Create dmatrices, select only numerical values for X and target column for Y
X_A = data.ix[:,1:20]
y_A = data['target']
X_A.columns.values

array(['votes_cool', 'votes_funny', 'votes_useful', 'Cheap', 'Moderate',
       'Expensive', 'VeryExpensive', 'American', 'Chinese', 'French',
       'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean',
       'Mexican', 'Thai', 'Vietnamese', 'Others'], dtype=object)

In [12]:
# Split test / train data
X_train_LR, X_test_LR, y_train_LR, y_test_LR = train_test_split(X_A, y_A, 
                                                                test_size=0.5, 
                                                                random_state=20)

In [13]:
# Create and fit a logistic regression model
regr = linear_model.LogisticRegression()
regr.fit(X_train_LR, y_train_LR)

# Use model to make predictions
predictions_LR = regr.predict(X_test_LR)

In [14]:
# Get predictions
predictions_LR

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [15]:
# Print Confusion matrix
confusion_matrix_LR = metrics.confusion_matrix(y_test_LR, predictions_LR, labels=unique(y_A))
print confusion_matrix_LR

[[ 160 3065]
 [ 101 6674]]


In [16]:
# Accuracy score
print metrics.accuracy_score(y_test_LR, predictions_LR)

0.6834


In [17]:
# Baseline accuracy
baseline_LR = round(1.0*sum([y_train_LR == 1])/len(y_train_LR),4)

print "There are", sum([y_train_LR == 1]) - sum([y_train_LR == 0]), \
    "more high than low, so we predict \"High.\""

print "Baseline accuracy would be", baseline_LR

There are 3557 more high than low, so we predict "High."
Baseline accuracy would be 0.6779


In [18]:
# Logistic regression model comparison to baseline
print "The logistic regression model beats the baseline model by", \
    metrics.accuracy_score(y_test_LR, predictions_LR) - baseline_LR

The logistic regression model beats the baseline model by 0.0055


## | Task A - Results |
We trained a logistic regression model on a subset of data to predict either high or low.  On a test set, we achieved an accuracy of approximately 68.34% using this method.  For a comparison, a baseline accuracy would be to only predict the majority class ('High').  This baseline model on a given test set achieves approximately 67.75% accuracy.  The logistic regression model using only numeric data (ignoring the review text) performs only slightly better than the baseline.

---
# Task B.

### Perform a supervised classification on a subset of the corpus using the reviews only. You can write your code in Python or R. What accuracy do you get from this text mining exercise?

---



### [Unigram Method]

In [19]:
# Sample for faster processing
random.seed(20)
data_sample = data.sample(n=5000,replace=False)

In [20]:
# Create dmatrices, select only reviews for X and target column for Y
X_B = data_sample['Review']
y_B = data_sample['target']

In [21]:
# Split train and test
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, 
                                                            test_size=0.25, 
                                                            random_state=20)

# Create arrays for vectorizer input
X_train_B = np.array(X_train_B)
y_train_B = np.array(y_train_B)
X_test_B = np.array(X_test_B)
y_test_B = np.array(y_test_B)

In [22]:
# Build vectorizor to get TF-IDF scores
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))

In [23]:
# Transform X's for training and test sets
X_train_UM = vectorizer.fit_transform(X_train_B)
X_test_UM = vectorizer.transform(X_test_B)

In [24]:
X_train_UM.shape

(3750, 16803)

In [25]:
# Fit Naive Bayes Multinomial
NBclassifier_UM = MultinomialNB().fit(X_train_UM, y_train_B)

In [26]:
# Use NB to get predictions
predictions_nb_UM = NBclassifier_UM.predict(X_test_UM)

In [27]:
# Print Confusion matrix
confusion_matrix_UM = metrics.confusion_matrix(y_test_B, predictions_nb_UM, labels=unique(y_B))
print confusion_matrix_UM

[[  2 399]
 [  0 849]]


In [28]:
# Accuracy for NB all words
accuracy_UM = metrics.accuracy_score(y_test_B, predictions_nb_UM)
print accuracy_UM

0.6808


### [Lemmatization]

In [29]:
# Building tokenizer and lemmatizer
lemmatizer=WordNetLemmatizer()

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    lemmas = []
    for item in tokens:
        lemmas.append(lemmatizer.lemmatize(item))
    return lemmas

In [30]:
# Tokenize training and test set X's
tfidf = TfidfVectorizer(tokenizer=tokenize)
X_train_TFIDF = tfidf.fit_transform(X_train_B)
X_test_TFIDF = tfidf.transform(X_test_B)

In [31]:
# Fit naive bayes multinomial model with tokenized training set
NBclassifier_TFIDF = MultinomialNB().fit(X_train_TFIDF, y_train_B)
predictions_nb_TFIDF = NBclassifier_TFIDF.predict(X_test_TFIDF)

In [32]:
# Print confusion matrix for NB with tokenization
confusion_matrix_TFIDF = metrics.confusion_matrix(y_test_B, predictions_nb_TFIDF)
print confusion_matrix_TFIDF

[[  1 400]
 [  0 849]]


In [33]:
# Get accuracy score for NB with tokenization
accuracy_TFIDF = metrics.accuracy_score(y_test_B, predictions_nb_TFIDF)
print accuracy_TFIDF

0.68


### [Remove Stop-Words]

In [34]:
# Create vectorizer that removes stop words
vectorizer_stop = TfidfVectorizer(min_df=0.05, 
                                  ngram_range=(1, 1), 
                                  stop_words='english', 
                                  strip_accents='unicode', 
                                  norm='l2')

In [35]:
# Transform X's for training and test sets (same but with stop words)
X_train_RSW = vectorizer_stop.fit_transform(X_train_B)
X_test_RSW = vectorizer_stop.transform(X_test_B)

In [36]:
# Fit Naive Bayes Multinomial
NBclassifier_RSW = MultinomialNB().fit(X_train_RSW, y_train_B)

In [37]:
# Use NB to get predictions
predictions_nb_RSW = NBclassifier_RSW.predict(X_test_RSW)

In [38]:
# Print Confusion matrix
confusion_matrix_RSW = metrics.confusion_matrix(y_test_B, predictions_nb_RSW, 
                                                labels=unique(y_B))
print confusion_matrix_RSW

[[ 83 318]
 [ 17 832]]


In [39]:
# Accuracy for NB no stop words
accuracy_RSW = metrics.accuracy_score(y_test_B, predictions_nb_RSW)
print accuracy_RSW

0.732


### [Part-of-Speech Bigrams]

In [40]:
# POS Tag Pattern
tag_pattern = "<NN.*><VB.*>|<JJ><NN>|<NNP><VB>|<NNP><VBP>|<NN><VB>|<VB><NNP>|<VB><NN>"
regexp_pattern = tag_pattern2re_pattern(tag_pattern)
regexp_pattern

u'(<(NN[^\\{\\}<>]*)>)(<(VB[^\\{\\}<>]*)>)|(<(JJ)>)(<(NN)>)|(<(NNP)>)(<(VB)>)|(<(NNP)>)(<(VBP)>)|(<(NN)>)(<(VB)>)|(<(VB)>)(<(NNP)>)|(<(VB)>)(<(NN)>)'

In [41]:
# Tokenize Function
def tokenize1(text):
    tokens_unigrams = nltk.word_tokenize(text)
    bigram_tokenizer = nltk.tokenize.regexp.RegexpTokenizer(regexp_pattern)
    tokens_bigrams = bigram_tokenizer.tokenize(text)
    tokens= tokens_unigrams+tokens_bigrams
    return tokens

In [42]:
# Get POS Bigrams
tfidf_bigram = TfidfVectorizer(tokenizer=tokenize1)
X_train_POSBG = tfidf_bigram.fit_transform(X_train_B)
X_test_POSBG = tfidf_bigram.transform(X_test_B)

In [43]:
# Predict using POS Bigrams
NBclassifier_POSBG = MultinomialNB().fit(X_train_POSBG, y_train_B)
predictions_nb_POSBG = NBclassifier_POSBG.predict(X_test_POSBG)

In [44]:
# Print Confusion matrix
confusion_matrix_POSBG = metrics.confusion_matrix(y_test_B, predictions_nb_POSBG)
print confusion_matrix_POSBG

[[  1 400]
 [  0 849]]


In [45]:
# Accuracy for NB POS Bigrams
accuracy_POSBG = metrics.accuracy_score(y_test_B, predictions_nb_POSBG)
print accuracy_POSBG

0.68


## | Task B - Results |
For part B, we applied different approaches of text mining to build multiple models.  Overall, we found that Naive Bayes performed better on average than logistic regression, so each text mining example uses a NB to test the accuracy.  The methods tested were: all unigrams, lemmatization, removing stop words (and limiting min-df to 0.05), a stop words/limiting min-df and lemmatization model, and part-of-speech bigrams.  Of all of these methods, the removing stop words model consistently performed the best.  Adding lemmatization to this method complicated the model without increasing accuracy, so it is not the final choice.

We found that the biggest impact to accuracy was changing the min-df constraint in the stop-words model.  It was able to remove words that were only in 5% of all reviews, thereby reducing complexity as well as increasing accuracy.

---
# Task C. 

### Combine the numeric data and the text classification model (in task B) to create a “hybrid” model. It is your task to figure out how to do this. Now run this hybrid classification model and compare the results with those in A and B. Does the numeric data add to the predictive power relative to text?

---

In [46]:
# Set X and y for model
X_C = data_sample.ix[:,1:21]
y_C = data_sample['target']
X_C.columns.values

array(['votes_cool', 'votes_funny', 'votes_useful', 'Cheap', 'Moderate',
       'Expensive', 'VeryExpensive', 'American', 'Chinese', 'French',
       'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean',
       'Mexican', 'Thai', 'Vietnamese', 'Others', 'Review'], dtype=object)

In [47]:
# Split train and test
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, 
                                                            test_size=0.25, 
                                                            random_state=20)

# Create arrays for vectorizer input
X_train_C = np.array(X_train_C)
y_train_C = np.array(y_train_C)
X_test_C = np.array(X_test_C)
y_test_C = np.array(y_test_C)

In [48]:
# Transform X's for training and test sets with stop words
X_train_TXT = vectorizer_stop.fit_transform(X_train_C[:,-1])
X_test_TXT = vectorizer_stop.transform(X_test_C[:,-1])

In [49]:
# Validate X_train dimensions....
X_train_C[:,0:-1].shape

(3750, 19)

In [50]:
# is congruent with X_train3 dimensions (rows)
X_train_TXT.shape

(3750, 189)

In [51]:
# Turn all into sparse (coo) matrices

A = coo_matrix(X_train_C[:,0:-1])
B = coo_matrix(X_train_TXT)

C = coo_matrix(X_test_C[:,0:-1])
D = coo_matrix(X_test_TXT)

In [52]:
# Combine numeric and vectorized text data into one matrix
X_train_HYBRID = sparse.hstack([A.astype(float),B]).toarray()
X_test_HYBRID = sparse.hstack([C.astype(float),D]).toarray()

In [53]:
# Check the resulting X_train_all dimensions for desired result
X_train_HYBRID.shape

(3750, 208)

### [NB]

In [54]:
# Fit and predict using a Multinomial NB approach
NBclassifier_HYBRID = MultinomialNB().fit(X_train_HYBRID, y_train_C)
predictions_HYBRID_nb = NBclassifier_HYBRID.predict(X_test_HYBRID)

In [55]:
# Print the confusion matrix for NB on HYBRID model
confusion_matrix_HYBRID_nb = metrics.confusion_matrix(y_test_C, predictions_HYBRID_nb)
print confusion_matrix_HYBRID_nb

[[107 294]
 [ 39 810]]


In [56]:
# Print accuracy score for NB on HYBRID model
accuracy_HYBRID_nb = metrics.accuracy_score(y_test_C, predictions_HYBRID_nb)
print accuracy_HYBRID_nb

0.7336


### [KNN]

In [57]:
# Fit and predict using a KNN model
KNNclassifier_HYBRID = KNeighborsClassifier().fit(X_train_HYBRID, y_train_C) 
predictions_HYBRID_knn = KNNclassifier_HYBRID.predict(X_test_HYBRID)

In [58]:
# Print the confusion matrix for KNN on HYBRID data
confusion_matrix_HYBRID_knn = metrics.confusion_matrix(y_test_C, predictions_HYBRID_knn)
print confusion_matrix_HYBRID_knn

[[108 293]
 [116 733]]


In [59]:
# Print the accuracy score for KNN on HYBRID data
accuracy_HYBRID_knn = metrics.accuracy_score(y_test_C, predictions_HYBRID_knn)
print accuracy_HYBRID_knn

0.6728


## | Task C - Results |
To create a hybrid model, we first decided to simply attach the review features as additional columns to the dataset of numerical values for each row.  After selecting the remove-stop-words model in Part B as the most accurate, this is how we transformed both the training and test set X's.

On this first hybrid model, we ran both a Naive Bayes model as well as a K-nearest-neighbors, to ensure that NB would still be a good selection for a hybrid dataset.  As expected, Naive Bayes performed much better. For a particular random split, Naive Bayes predicted with a 75.6% accuracy, while KNN only predicted 66.2% correctly.

To improve this model, we hypothesized that the approximately 200 columns generated by the text vectorizer may be "drowning" the data from the first 19 numerical columns.   We decided to use predictions generated from text data to form an additional column to the original 19.  To do this, we used the Naive Bayes fit from the removing-stop-words vectorizer to create the new prediction column.  With a new dataset of just 20 columns, we made our final predictions with both Naive Bayes and a Logistic Regression. In this case, Logistic Regression performed better.  Using this model, we achieved 74.24% accuracy on the test set.  Unfortunately, it did not beat the previous method as expected.

It's possible that the second model under-weighted the effect of the text reviews in comparing it equally with the other feature columns.  To improve the model, we would need to find the optimal balance between the numeric and text features.

---
# Task D. 

### Use unsupervised sentiment analysis on the reviews (with SentiStrength or any other tool) and use the sentiment scores to predict high/low rating. Compare and contrast the results of tasks B and D. What can you conclude from your analysis?

---

In [3]:
# Import data with SentiStrength analysis ('Difference' column)
data_SS = pd.read_csv('Yelp Data Restaurant Reviews Ratings (1).csv')

# View columns to confirm 'Difference' imported
data_SS.columns.values

array(['stars', 'votes_cool', 'votes_funny', 'votes_useful', 'Cheap',
       'Moderate', 'Expensive', 'VeryExpensive', 'American', 'Chinese',
       'French', 'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean',
       'Mexican', 'Thai', 'Vietnamese', 'Others', 'Review', 'Difference'], dtype=object)

In [4]:
# Set 'target' to 1 for 4* and 5* reviews and 0 for 3* and below
# Validate results
data_SS['target'] = 0.0
data_SS['target'][data_SS['stars'] > 3] = 1.0
print data_SS['target'].value_counts()
check_target_SS = pd.crosstab(data_SS['stars'],data_SS['target'])
check_target_SS

1.0    13553
0.0     6446
Name: target, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


target,0.0,1.0
stars,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1153,0
2,1940,0
3,3353,0
4,0,7395
5,0,6158


### [Logistic Regression]

In [5]:
# Create test and train sets
train_SS, test_SS = train_test_split(data_SS, test_size=.33, random_state=11)

In [8]:
# Create matrices for test and train sets difference calculation variables
y_train_SS,X_train_SS = dmatrices('target~0+Difference', train_SS)
y_test_SS,X_test_SS = dmatrices('target~0+Difference', test_SS)

In [9]:
# Train Logistic Regression model and predict on test set
LRclassifier_SS = LogisticRegression().fit(X_train_SS,y_train_SS)
predictions_SS_lr = LRclassifier_SS.predict(X_test_SS)

In [10]:
# Print the confusion matrix for Logistic Regression on SentiStrength data
confusion_matrix_SS_lr = metrics.confusion_matrix(y_test_SS, predictions_SS_lr)
print confusion_matrix_SS_lr

[[   0 2168]
 [   0 4431]]


In [11]:
# Print the accuracy score for Logistic Regression on SentiStrength data
accuracy_SS_lr = metrics.accuracy_score(y_test_SS, predictions_SS_lr)
print accuracy_SS_lr

0.671465373541


## | Task D - Results |
We decided to use the difference between positive and negative sentiment scores to determine the overall positive or negative score of each review (column named 'Difference').  We then ran a logistic regression with only the 'Difference' column against high and low ratings, and saw an accuracy of 67.14% on the test subset.  In Part B, using Naive Bayes while removing stopwords, and removing words that did not appear in 5% of the documents, the accuracy score was between 68% and 74%.  The accuracy score could have dropped due to the lack of data behind the classification, and skewed by the number of high and low ratings in the original data set.  Using fewer words to calculate the overall sentiment score could have benefitted our logistic regression model.    

---
# Task E. 

### Implement the PMI approach to sentiment analysis (in either Python or R), and run the classification model with the sentiment scores. How do your results compare with those in Task D?

---

In [12]:
# Get subset for analysis
random.seed(20)
data_SS_subset = data_SS.sample(n=2000,replace=False)
data_SS_subset = data_SS_subset.reset_index()

In [13]:
# Bag of words for subset of data
list_of_words=[]
for i in range(len(data_SS_subset)):
    new_text = data_SS_subset['Review'][i].decode('utf-8')
    text = word_tokenize(new_text)
    list_of_words+=text;

### [Prepare Sentiment Reference Terms]

In [14]:
# Read in text files with reference terms for positive and negative sentiments
refpos = pd.read_csv('positive-words.txt',header=None)
refneg = pd.read_csv('negative-words.txt',header=None)

In [15]:
# Turn reference words into lists
refposList= refpos[0].tolist()
refnegList= refneg[0].tolist()

In [16]:
# Creates the lists of positive and negative reference words that occur in our corpus
posMatches = list(set(list_of_words).intersection(set(refposList)))
negMatches = list(set(list_of_words).intersection(set(refnegList)))

### [Prepare Part-of-Speech Bigrams]

In [17]:
# Get lowercase version of reviews for consistency
data_SS_subset['reviews'] = data_SS_subset['Review'].apply(lambda t: filter(lambda x: x in string.printable, t))
data_SS_subset['reviews'] = data_SS_subset['reviews'].apply(lambda t: t.lower())

In [18]:
# Function to tag each word with POS
def tag_review(review):
    new_text= review
    text = word_tokenize(new_text)
    tagged = nltk.pos_tag(text)
    #tagged = [word + "/" + tag for (word, tag) in tagged]
    #string = ' '.join(tagged)
    return tagged

In [19]:
# Apply POS tagger function to Review column
data_SS_subset['TaggedReview'] = data_SS_subset['reviews'].apply(tag_review)

In [20]:
# Function to get list of applicable bigrams
def get_bigrams(review):
    list_of_bigrams=[]
    for (w1,t1), (w2,t2) in nltk.bigrams(review):
        if (t1.startswith('J') and t2.startswith('N')):
            list_of_bigrams.append((w1, w2))
        elif (t1.startswith('R') and t2.startswith('J')):
            list_of_bigrams.append((w1, w2))
        elif (t1.startswith('N') and t2.startswith('J')):
            list_of_bigrams.append((w1, w2))
    return list_of_bigrams

In [21]:
# Apply Bigrams function to Tagged Reviews column
list_of_bigrams = data_SS_subset['TaggedReview'].apply(get_bigrams)

### [Prepare Training Set]

In [22]:
# Creates train and test data on SentiStrength subset
train_SSsubset, test_SSsubset = train_test_split(data_SS_subset,
                                                 test_size = .33,
                                                 random_state = 16)

train_SSsubset = train_SSsubset.reset_index()
test_SSsubset = test_SSsubset.reset_index()

In [23]:
# Pull values from 'Review' column
train_SSsubset_reviewslist = train_SSsubset['reviews'].values

In [24]:
# Removes words that are in less than 5% and more than 98% of docs
train_vect = CountVectorizer(min_df=0.05, max_df=0.98, stop_words='english')

In [25]:
# Creating count matrix for training set
train_CountVect = train_vect.fit_transform(train_SSsubset_reviewslist)

In [26]:
# Converts count matrix into count dataframe
train_count_df = pd.SparseDataFrame([pd.SparseSeries(train_CountVect[i].toarray().ravel()) 
                                     for i in np.arange(train_CountVect.shape[0])], 
                                    columns = train_vect.get_feature_names())

In [27]:
# Funtion to get counts of bigrams in each document
def createBinaryTerms(counts):
    x = 0.0
    if counts>0:
        x=1.0
    return x

train_binary_df = train_count_df.applymap(createBinaryTerms)

In [28]:
# Dictionary of how many documents each term occurs in
train_counts_dict = (train_binary_df.apply(sum)).to_dict()
train_unigramsList = train_counts_dict.keys()

In [29]:
# Get frequency of all ref. pos. and ref. neg words
train_vectAll = CountVectorizer(min_df=0, stop_words='english')

In [30]:
# Running code with all words (not removing words based on frequency) 
train_CountVectAll = train_vectAll.fit_transform(train_SSsubset_reviewslist)

In [31]:
# Create sparse dataframe using vectorizer
train_count_df_all = pd.SparseDataFrame([pd.SparseSeries(train_CountVectAll[i].toarray().ravel()) 
                                         for i in np.arange(train_CountVectAll.shape[0])], 
                                        columns = train_vectAll.get_feature_names())

In [32]:
# Create a dictionary with all term counts
train_binary_df_all = train_count_df_all.applymap(createBinaryTerms)
train_counts_dict_all = (train_binary_df_all.apply(sum)).to_dict()

In [33]:
# Creates list of positive and negative reference words that appear in the training set
posMatches_train = list(set(train_counts_dict_all.keys()).intersection(set(posMatches)))
negMatches_train = list(set(train_counts_dict_all.keys()).intersection(set(negMatches)))

In [34]:
# Dictionary of how many docs the reference negative and reference positive words appear in 
# Only within training set (vs whole corpus)
refWordsDictTrain = {}
for x in train_counts_dict_all.keys():
    if x in posMatches_train:
        refWordsDictTrain[x] = train_counts_dict_all[x]
    elif x in negMatches_train:
        refWordsDictTrain[x] = train_counts_dict_all[x]

In [35]:
# Creates new column with tagged review
train_SSsubset['TaggedReview'] = train_SSsubset['reviews'].apply(tag_review)

In [36]:
# Creates list of tagged bigrams
list_of_train_bigrams = train_SSsubset['TaggedReview'].apply(get_bigrams)
list_of_train_bigrams

0       [(so, good), (pretty, good), (more, items), (s...
1       [(chelsea, s), (short, rib), (tacos, green), (...
2       [(first, time), (here.my, husband), (good, foo...
3                           [(so, glad), (smash, burger)]
4       [(nothing, huge), (nice, alternative), (ever, ...
5                        [(cute, place), (tiny, parking)]
6       [(mexican, food), (mexican, joints), (cheese, ...
7       [(i, wish), (right, things), (very, high), (hi...
8       [(good, restaurant), (southside, counterpart),...
9       [(awesome, weekend), (brunch, best), (best, bl...
10      [(italian, deli), (deli, counter), (entire, ba...
11      [(super, l), (kong, express), (sam, woo), (3-i...
12      [(authentic, hole), (so, make), (make, sure), ...
13                                    [(very, impressed)]
14      [(really, impressed), (il, posto), (mediocre, ...
15      [(ubiquitous, gringos), (american-mexican, joi...
16      [(bad, food), (food, bad), (bad, service), (so...
17      [(favo

In [37]:
# Turning series of lists of bigrams into one list
trainBigramsAsList = []
def addBigrams(bigramList):
    for i in bigramList:
        trainBigramsAsList.append(i)

list_of_train_bigrams.map(addBigrams)
trainBigramsAsList

[('so', 'good'),
 ('pretty', 'good'),
 ('more', 'items'),
 ('so', 'good.kind'),
 ('small', 'place'),
 ('good', 'thing'),
 ('chelsea', 's'),
 ('short', 'rib'),
 ('tacos', 'green'),
 ('chile', 'burger'),
 ('salted', 'chocolate'),
 ('way', 'out.i'),
 ('out.i', 'd'),
 ('only', 'gripe'),
 ('strong', 'menu'),
 ('first', 'time'),
 ('here.my', 'husband'),
 ('good', 'food'),
 ('fried', 'onion'),
 ('red', 'pepper'),
 ('decent', 'selection'),
 ('good', 'tenderloin'),
 ('just', 'ok.'),
 ('i', 'm'),
 ('basic', 'mustard'),
 ('onion.my', 'husband'),
 ('next', 'time'),
 ('time', 'i'),
 ('i', 'll'),
 ('particular', 'day'),
 ('pretty', 'slow'),
 ('very', 'friendly'),
 ('understandable.the', 'prices'),
 ('pretty', 'reasonable'),
 ('too', 'bad'),
 ('very', 'full'),
 ('totally', 'comfortable'),
 ('pretty', 'much'),
 ('so', 'glad'),
 ('smash', 'burger'),
 ('nothing', 'huge'),
 ('nice', 'alternative'),
 ('ever', 'crap-tastic'),
 ('pita', 'jungle'),
 ('cute', 'place'),
 ('tiny', 'parking'),
 ('mexican', 'food

In [38]:
# List of unique bigrams (deduping bigramsAsList)
bigramsListTrain= list(set((trainBigramsAsList)))
len(bigramsListTrain)

9744

In [39]:
# Dictionary with how many docs the bigrams appear in within training set
bigramDict_train={}
bigramDict_train=bigramDict_train.fromkeys(bigramsListTrain,0)

for bigram in bigramsListTrain:
    for cell in list_of_train_bigrams:
        if bigram in cell:
            bigramDict_train[bigram] = bigramDict_train[bigram]+1        

In [40]:
# Removing bigrams that appear in less than 10 docs
finalTrainBigramDict = {k:v for k, v in bigramDict_train.items() if v>10}
finalTrainBigramsList = finalTrainBigramDict.keys() 

In [41]:
# Get length of bigrams list
len(finalTrainBigramsList)

76

### [Prepare Test Set]

In [42]:
# Pull values from 'Review' column
test_SSsubset_reviewslist = test_SSsubset['reviews'].values

In [43]:
# Removes words that are in less than 5% and more than 98% of all docs
test_vect = CountVectorizer(min_df=0.05, max_df=0.98, stop_words='english')

In [44]:
# Creating count matrix for testing set
test_CountVect = test_vect.fit_transform(test_SSsubset_reviewslist)

In [45]:
# Converts count matrix into count dataframe
test_count_df = pd.SparseDataFrame([pd.SparseSeries(test_CountVect[i].toarray().ravel()) 
                                    for i in np.arange(test_CountVect.shape[0])], 
                                   columns = test_vect.get_feature_names())

In [46]:
# Get counts of bigrams in each document
test_binary_df = test_count_df.applymap(createBinaryTerms)

In [47]:
# Dictionary of how many documents each term occurs in
test_count_dict = (test_binary_df.apply(sum)).to_dict()
testunigramsList = test_count_dict.keys()

In [48]:
# Get frequency of all ref. pos. and ref. neg words
test_vectAll = CountVectorizer(min_df=0, stop_words='english')

In [49]:
# Running code with all words (not removing words based on frequency)
test_CountVectAll = test_vectAll.fit_transform(test_SSsubset_reviewslist)

In [50]:
# Create sparse dataframe using vectorizer
test_count_df_all = pd.SparseDataFrame([pd.SparseSeries(test_CountVectAll[i].toarray().ravel()) 
                                        for i in np.arange(test_CountVectAll.shape[0])], 
                                       columns = test_vectAll.get_feature_names())

In [51]:
# Create a dictionary with all term counts
test_binary_df_all = test_count_df_all.applymap(createBinaryTerms)
test_counts_dict_all = (test_binary_df_all.apply(sum)).to_dict()

In [52]:
# Creates list of positive and negative reference words that appear in the test set
posMatches_test = list(set(test_counts_dict_all.keys()).intersection(set(posMatches)))
negMatches_test = list(set(test_counts_dict_all.keys()).intersection(set(negMatches)))

In [53]:
# Dictionary of how many docs the reference negative and reference positive words appear in 
# Only within test set (vs whole corpus)
refWordsDictTest = {}
for x in test_counts_dict_all.keys():
    if x in posMatches_test:
        refWordsDictTest[x] = test_counts_dict_all[x]
    elif x in negMatches_test:
        refWordsDictTest[x] = test_counts_dict_all[x]

In [54]:
# Creates new column with tokenized and tagged review
test_SSsubset['TaggedReview'] = test_SSsubset['reviews'].apply(tag_review)
list_of_test_bigrams = test_SSsubset['TaggedReview'].apply(get_bigrams)
list_of_test_bigrams

0      [(best, kobe), (very, trendy), (awesome, servi...
1                                                     []
2      [(awesome, experience), (friendly, staff), (st...
3      [(breakfast, time), (entire, menu), (carne, as...
4                                                     []
5      [(damn, good), (good, burger), (very, nice), (...
6      [(so, glad), (really, great), (really, beautif...
7      [(first, time), (brown, lettuce), (red, lobste...
8      [(countless, times), (urban, tiki), (simple, t...
9      [(crazy, jim), (other, night), (very, good), (...
10     [(different, locations), (new, york), (high, e...
11     [(few, years), (local, i), (t, fantastic), (fo...
12     [(completely, non-descript), (non-descript, be...
13     [(greek, spot), (so, worth), (f*, %), (hummus,...
14     [(i, chalk), (notable, features), (near, future)]
15     [(only, sign), (additional, parking), (casual,...
16                                        [(very, good)]
17     [(so, close), (good, thi

In [55]:
# Turning series of lists of bigrams into one list
testBigramsAsList = []
def addBigrams(bigramList):
    for i in bigramList:
        testBigramsAsList.append(i)
list_of_test_bigrams.map(addBigrams)
testBigramsAsList

[('best', 'kobe'),
 ('very', 'trendy'),
 ('awesome', 'service'),
 ('awesome', 'experience'),
 ('friendly', 'staff'),
 ('staff', 'great'),
 ('great', 'value'),
 ('quick', 'service'),
 ('fantastically', 'nummy'),
 ('breakfast', 'time'),
 ('entire', 'menu'),
 ('carne', 'asada'),
 ('chicken', 'soft'),
 ('soft', 'taco'),
 ('pretty', 'tasteless'),
 ('shredded', 'beef'),
 ('iceberg', 'lettuce'),
 ('not', 'super'),
 ('super', 'interesting'),
 ('hot', 'sauce'),
 ('enough', 'umph'),
 ('damn', 'good'),
 ('good', 'burger'),
 ('very', 'nice'),
 ('nice', 'place'),
 ('so', 'glad'),
 ('really', 'great'),
 ('really', 'beautiful'),
 ('open', 'spaces'),
 ('open', 'kitchens'),
 ('small', 'touches'),
 ('large', 'community'),
 ('great', 'atmosphere'),
 ('communal', 'dining'),
 ('first', 'time'),
 ('so', 'good'),
 ('favorite', 'appetizers'),
 ('dinner', 'next'),
 ('next', 'time'),
 ('time', 'i'),
 ('i', 'm'),
 ('flavor', 'dumplings'),
 ('perfect', 'i'),
 ('little', 'more'),
 ('more', 'chicken'),
 ('warmer', 

In [56]:
#list of unique bigrams (deduping bigramsAsList)
bigramsListTest= list(set((testBigramsAsList)))
len(bigramsListTest)

5581

In [57]:
# Dictionary with how many docs the bigrams appear in within test set
bigramDict_test={}
bigramDict_test=bigramDict_test.fromkeys(bigramsListTest,0)


for bigram in bigramsListTest:
    for cell in list_of_test_bigrams:
        if bigram in cell:
            bigramDict_test[bigram] = bigramDict_test[bigram]+1  

In [58]:
# Removing bigrams that appear in less than 10 docs
finalTestBigramDict = {k:v for k, v in bigramDict_test.items() if v>10}
finalTestBigramsList = finalTestBigramDict.keys()  

In [59]:
# Get length of bigrams list
len(finalTestBigramsList)

26

### [Transform Train Data for PMI Analysis]

In [60]:
# Make unigrams list into a series
unigramSeries_train = pd.Series(train_unigramsList)

In [61]:
# Turn training bigrams into list of strings to use in co-occurance counter
bigram_str_list_train = []

for i in range(0,len(finalTrainBigramsList)):
    s = str(finalTrainBigramsList[i][0]) +" "+ str(finalTrainBigramsList[i][1])
    bigram_str_list_train.append(s)

# Remove duplicate bigrams
bigram_str_list_train = list(set(bigram_str_list_train))
print "Number of training bigrams", len(bigram_str_list_train)

Number of training bigrams 76


In [62]:
# Update once ready for full lists
check_words_train = posMatches_train + negMatches_train
reviews_list_train = (train_SSsubset_reviewslist).tolist()

#Unigrams and bigrams to find co-occurances with the sentiment words
uniBiSeries_train = unigramSeries_train.append(pd.Series(bigram_str_list_train))

### [Co-Occurence]
The below blocks deals with finding the proximity-based co-occurrences of either bigrams or unigrams that we provide to the function. This is essentially needed for the joint probability portion of the PMI calculation of ngrams and the reference positive or negatives words that we also provide to the function. As you can see in the function definition parameters, we are declaring a proximity distance of 3. This means that we'll be checking for whether or not a reference word is within 3 words or tokens of the chosen ngram (bigram or unigram). The result of this function and the following blocks is to create a dictionary of dictionaries where the keys are the ngrams and the values are nested dictionaries for each ngram with reference word keys and the counts of the times where the reference word was in the correct proximity of that ngram. That result is then used for the PMI calculations later on in the code. 

Our methodology for this function, **coocur**, uses the following logical steps: 
We bring in the different parameters: a reference word list, a review list, and a list of ngrams (unigrams or bigrams). 

We then create a dictionary for the reference words and set them all the 0. This is used later on for the nested dictionary. 

The first major for-loop basically goes through the given review in the list of reviews, finds where the ngrams are, and puts them into a tuple list for further processing.

The second for-loop does the proximity analysis using the tokenized review and the ngrams list. It searches for a range around the ngram in the review, and then assesses whether or not each word in the reference list is within that proximity. If a reference word is in the range, the value for the word in the dictionary is incremented by 1. 

This process across all the reviews, ngrams, and reference words results in the nested dictionary where each (unigram or bigram) is the key, and the nested dictionary is the result of this **coocur** function.

In [63]:
def checkReview (review, w, ref_words, correct_tuples_list, dist):
    freq_count = {}
    
    w = w.lower()
    review = review.lower()
    review_formatted = smart_str((review))
    split_rev = word_tokenize(review_formatted)           # List of all words in a review (to use in indexing)
    
    w_list = []
    for i in range(0,len(w.split())):
        w_list.append(smart_str(w.split()[i]))            # If a bigram, splits into a list. Unigrams are just a list of 1
    
    indices1 = np.where(np.array(split_rev) == w_list[0])[0]
    
    if len(w_list) > 1:                                   # If it is a bigram
        indices2 = np.where(np.array(split_rev) == w_list[1])[0]
    else:
        indices2 = indices1                               # unigrams will be checked left and right of the same index
        
    for idx2_spot in indices2:
        for idx1_spot in indices1:
            if (idx2_spot - idx1_spot) <2 :               # complete the same task for 1 (bigrams) or 0 (unigrams)
                tuple_indices = (idx1_spot, idx2_spot)
                correct_tuples_list.append(tuple_indices)
        
    for a, b in correct_tuples_list: 
        if (a-dist) < 0:
            start = 0                          # if distance is too short, start at 0
        else:
            start = a-dist
        if (b+dist) > len(split_rev):
            end = len(split_rev)-1             # if distance is too long, end at the last index
        else:
            end = b + dist
        for j in ref_words:
            if j in split_rev[start:end+1]:    # check if the sentiment word is in the range
                freq_count[j] = 1
            else:
                freq_count[j] = 0
    return freq_count

In [64]:
def coocur (w, ref_words = check_words_train, reviews= reviews_list_train, dist = 3):
    w_coocur = dict.fromkeys(ref_words, 0)
    correct_tuples_list = []                  # Empty list to store tuple locations of bigrams/unigrams within review
    
    freq_list = map(functools32.partial(checkReview, w=w, ref_words = ref_words, \
                                      correct_tuples_list = correct_tuples_list, dist = dist), reviews)
        
    for item in range(0,len(freq_list)):
        for k in freq_list[item].keys():
            w_coocur[k] = w_coocur[k] + freq_list[item][k]
        
    return w_coocur

This cell below maps the above function onto a list of our ngrams. It basically replaces the ngrams in the series with the returned dictionary from the **coocur** function. 

This is the final piece of this proximity analysis. This final block below creates the nested dictionary needed for the PMI analysis. It basically connects the original list of ngrams we used for the **coocur** function with its dictionary of counts with the reference words. 

In [65]:
# Create a list of dictionaries for each unigram/bigram feature
list_dict_train = uniBiSeries_train.map(coocur)

KeyboardInterrupt: 

In [None]:
# Turn unigram/bigram Series into a list
uniBiList_train = uniBiSeries_train.tolist()

# Attach the unigram/bigram list as the keys for the list of dictionaries
count_both_train = dict(zip(uniBiList_train,list_dict_train))
count_both_train

In [None]:
# Function to get bigram strings by review
def join_review_bigrams(review_bigrams):
    review_bigram_str_list = []
    for i in range(0,len(review_bigrams)):
        s = review_bigrams[i][0] +" "+ review_bigrams[i][1]
        review_bigram_str_list.append(s.encode('utf-8'))
    return review_bigram_str_list

In [None]:
# Get list of bigram and unigram strings in each review
train_SSsubset['ReviewBigrams'] = list_of_train_bigrams   # Add column of bigram tuples in each review
train_SSsubset['ReviewBigramStrings'] = train_SSsubset['ReviewBigrams'].map(join_review_bigrams) # Get list of bigram strings in each review
train_SSsubset['ReviewUnigramStrings'] = train_SSsubset['reviews'].apply(lambda t: t.split()) # Get list of unigram strings in each review
train_SSsubset['ReviewStrings'] = train_SSsubset['ReviewBigramStrings'] + (train_SSsubset['ReviewUnigramStrings']) # Combine list of bigram and unigram strings in each review

In [None]:
# Update bigram dict to use strings
finalTrainBigramStringDict = defaultdict(lambda : defaultdict(int))
finalTrainBigramsStrings = join_review_bigrams(finalTrainBigramsList)
for i in range(0, len(finalTrainBigramsStrings)):
    finalTrainBigramStringDict[finalTrainBigramsStrings[i].lower()] = finalTrainBigramDict[finalTrainBigramsList[i]]

In [None]:
# Create full lookup term dictionary (unigrams + bigrams)
train_lookuptermDict = defaultdict(lambda : defaultdict(int))
train_lookuptermDict = train_counts_dict
train_lookuptermDict.update(finalTrainBigramStringDict)
train_lookuptermDict
train_lookupterms = train_lookuptermDict.keys()

### [Transform Test Data for PMI Analysis]

In [None]:
# Make unigrams list into a series
unigramSeries_test = pd.Series(testunigramsList)

In [None]:
# Turn training bigrams into list of strings to use in co-occurance counter
bigram_str_list_test = []

for i in range(0,len(finalTestBigramsList)):
    s = str(finalTestBigramsList[i][0]) +" "+ str(finalTestBigramsList[i][1])
    bigram_str_list_test.append(s)

# Remove duplicate bigrams
bigram_str_list_test = list(set(bigram_str_list_test))
len(bigram_str_list_test)

In [None]:
# Update once ready for full lists
check_words_test = posMatches_test + negMatches_test
reviews_list_test = (test_SSsubset_reviewslist).tolist()

#Unigrams and bigrams to find co-occurances with the sentiment words
uniBiSeries_test = unigramSeries_test.append(pd.Series(bigram_str_list_test))

In [None]:
def coocur_test (w, ref_words = check_words_test, reviews= reviews_list_test, dist = 3):
    w_coocur = dict.fromkeys(ref_words, 0)
    correct_tuples_list = []                  # Empty list to store tuple locations of bigrams/unigrams within review
    
    freq_list = map(functools32.partial(checkReview, w=w, ref_words = ref_words, \
                                      correct_tuples_list = correct_tuples_list, dist = dist), reviews)
        
    for item in range(0,len(freq_list)):
        for k in freq_list[item].keys():
            w_coocur[k] = w_coocur[k] + freq_list[item][k]
        
    return w_coocur

In [None]:
# Create a list of dictionaries for each unigram/bigram feature
list_dict_test = uniBiSeries_test.map(coocur_test)

In [None]:
# Turn unigram/bigram Series into a list
uniBiList_test = uniBiSeries_test.tolist()

# Attach the unigram/bigram list as the keys for the list of dictionaries
count_both_test = dict(zip(uniBiList_test,list_dict_test))
count_both_test

In [None]:
# Get list of bigram and unigram strings in each review
test_SSsubset['ReviewBigrams'] = list_of_train_bigrams   # Add column of bigram tuples in each review
test_SSsubset['ReviewBigramStrings'] = test_SSsubset['ReviewBigrams'].map(join_review_bigrams) # Get list of bigram strings in each review
test_SSsubset['ReviewUnigramStrings'] = test_SSsubset['reviews'].apply(lambda t: t.split()) # Get list of unigram strings in each review
test_SSsubset['ReviewStrings'] = test_SSsubset['ReviewBigramStrings'] + (test_SSsubset['ReviewUnigramStrings']) # Combine list of bigram and unigram strings in each review

In [None]:
# Update bigram dict to use strings
finalTestBigramStringDict = defaultdict(lambda : defaultdict(int))
finalTestBigramsStrings = join_review_bigrams(finalTestBigramsList)
for i in range(0, len(finalTestBigramsStrings)):
    finalTtestBigramStringDict[finalTestBigramsStrings[i].lower()] = finalTestBigramDict[finalTestBigramsList[i]]

### [Train PMI Calculations]

In [None]:
# Before calculating PMI, need inputs

# Input 1
# Total number of documents, used to calculate probability
train_tot_n_docs = train_SSsubset.shape[0]

# Input 2
# List of lookup terms, including unigrams and bi-grams
train_lookupterm_termlist = train_lookupterms

# Input 3
# Dictionary of lookup term document counts
train_lookupterm_dict = train_lookuptermDict

# Input 4
# List of positive sentiment terms
train_pos_sentiment_termlist = posMatches_train

# Input 5
# List of negative sentiment terms
train_neg_sentiment_termlist = negMatches_train

# Input 6
# Dictionary of sentiment term document counts
train_sentimentterm_dict = refWordsDictTrain

# Input 7
# Dictionary of cooccurance for each lookup term (unigrams and bigrams) to sentiment terms (positive and negative)
# Format: {LT1: {ST1: freq, ST2: freq, ST3: freq, ST4: freq}, LT2: {ST1: freq, ST2: freq, ST3: freq, ST4: freq}}
train_cooccur_dict = count_both_train

In [None]:
# This function calculates the PMI between two terms
# Inputs are term counts, joint counts, and total number of documents
def train_pmi_calc(term, sentimentword):
    prob_term = (train_lookupterm_dict[term] * 1.0) / train_tot_n_docs
    prob_sentiment = (train_sentimentterm_dict[sentimentword] * 1.0) / train_tot_n_docs
    prob_both = (train_cooccur_dict[term][sentimentword] * 1.0) / train_tot_n_docs
    pmi_pair = log2(prob_both / ((prob_term * prob_sentiment) + 0.001))
    return pmi_pair

In [None]:
# This function calculates the PMI scores for terms in a given list, 
#      compared against a list of positive sentiment terms and a list of negative sentiment terms
# This function references the pmi_calc function previously defined

so_scores = {}
def train_pmi(term_list, pos_sentiment_list, neg_sentiment_list):
    so_scores = defaultdict(lambda : defaultdict(int))
    for t in range(len(term_list)):
        pos_so = []
        neg_so = []
        for p in range(len(pos_sentiment_list)):
            pos_so.append(train_pmi_calc(term_list[t], pos_sentiment_list[p]) * 1.0)
        for n in range(len(neg_sentiment_list)):
            neg_so.append(train_pmi_calc(term_list[t], neg_sentiment_list[n]) * 1.0)
        pos_so_avg = mean(pos_so)
        neg_so_avg = mean(neg_so)
        so_scores[term_list[t]] = (pos_so_avg - neg_so_avg)
    return so_scores

In [None]:
# Generate PMI scores for our dataset
train_pmi_score_lookup = train_pmi(train_lookupterm_termlist, 
                                   train_pos_sentiment_termlist, 
                                   train_neg_sentiment_termlist)

# # Output
# train_pmi_score_lookup

In [None]:
# For a given list of terms (e.g., relevant terms from a given review), calculate overall PMI
# Overall PMI is considered the average PMI of each of the terms

def train_review_pmi(review_terms):
    review_pmi_scores = []
    for i in range(len(review_terms)):
        if review_terms[i] in train_pmi_score_lookup:
            review_pmi_scores.append(train_pmi_score_lookup[review_terms[i]])
    avg_pmi_score = mean(review_pmi_scores)
    return avg_pmi_score

In [None]:
# Map review_pmi function to datasubset

train_SSsubset['ReviewPMI'] = train_SSsubset['ReviewStrings'].map(review_pmi)

# # View Output
# train_SSsubset['ReviewPMI']

### [Test PMI Calculations]

In [None]:
# Before calculating PMI, need inputs

# Input 1
# Total number of documents, used to calculate probability
test_tot_n_docs = test_SSsubset.shape[0]

# Input 2
# List of lookup terms, including unigrams and bi-grams
test_lookupterm_termlist = test_lookupterms

# Input 3
# Dictionary of lookup term document counts
test_lookupterm_dict = test_lookuptermDict

# Input 4
# List of positive sentiment terms
test_pos_sentiment_termlist = posMatches_test

# Input 5
# List of negative sentiment terms
test_neg_sentiment_termlist = negMatches_test

# Input 6
# Dictionary of sentiment term document counts
test_sentimentterm_dict = refWordsDictTest

# Input 7
# Dictionary of cooccurance for each lookup term (unigrams and bigrams) to sentiment terms (positive and negative)
# Format: {LT1: {ST1: freq, ST2: freq, ST3: freq, ST4: freq}, LT2: {ST1: freq, ST2: freq, ST3: freq, ST4: freq}}
test_cooccur_dict = count_both_test

In [None]:
# This function calculates the PMI between two terms
# Inputs are term counts, joint counts, and total number of documents
def test_pmi_calc(term, sentimentword):
    prob_term = (test_lookupterm_dict[term] * 1.0) / test_tot_n_docs
    prob_sentiment = (test_sentimentterm_dict[sentimentword] * 1.0) / test_tot_n_docs
    prob_both = (test_cooccur_dict[term][sentimentword] * 1.0) / test_tot_n_docs
    pmi_pair = log2(prob_both / ((prob_term * prob_sentiment) + 0.001))
    return pmi_pair

In [None]:
# This function calculates the PMI scores for terms in a given list, 
#      compared against a list of positive sentiment terms and a list of negative sentiment terms
# This function references the pmi_calc function previously defined

so_scores = {}
def test_pmi(term_list, pos_sentiment_list, neg_sentiment_list):
    so_scores = defaultdict(lambda : defaultdict(int))
    for t in range(len(term_list)):
        pos_so = []
        neg_so = []
        for p in range(len(pos_sentiment_list)):
            pos_so.append(test_pmi_calc(term_list[t], pos_sentiment_list[p]) * 1.0)
        for n in range(len(neg_sentiment_list)):
            neg_so.append(test_pmi_calc(term_list[t], neg_sentiment_list[n]) * 1.0)
        pos_so_avg = mean(pos_so)
        neg_so_avg = mean(neg_so)
        so_scores[term_list[t]] = (pos_so_avg - neg_so_avg)
    return so_scores

In [None]:
# Generate PMI scores for our dataset
test_pmi_score_lookup = testn_pmi(test_lookupterm_termlist,
                                  test_pos_sentiment_termlist,
                                  test_neg_sentiment_termlist)

# # Output
# test_pmi_score_lookup

In [None]:
# For a given list of terms (e.g., relevant terms from a given review), calculate overall PMI
# Overall PMI is considered the average PMI of each of the terms

def test_review_pmi(review_terms):
    review_pmi_scores = []
    for i in range(len(review_terms)):
        if review_terms[i] in test_pmi_score_lookup:
            review_pmi_scores.append(test_pmi_score_lookup[review_terms[i]])
    avg_pmi_score = mean(review_pmi_scores)
    return avg_pmi_score

In [None]:
# Map review_pmi function to datasubset

test_SSsubset['ReviewPMI'] = test_SSsubset['ReviewStrings'].map(test_review_pmi)

# # View Output
# test_SSsubset['ReviewPMI']

### [Model]

In [None]:
# Create matrices for test and train sets
y_train_PMI,X_train_PMI = dmatrices('target~0+ReviewPMI', train_SSsubset)
y_test_PMI,X_test_PMI = dmatrices('target~0+ReviewPMI', test_SSsubset)

In [None]:
# Train Logistic Regression model and predict on test set
LRclassifier_PMI = LogisticRegression().fit(X_train_PMI,y_train_PMI)
predictions_PMI = LRclassifier_PMI.predict(X_test_PMI)

In [None]:
# Print the confusion matrix for Logistic Regression on PMI data
confusion_matrix_PMI = metrics.confusion_matrix(y_test_PMI, predictions_PMI)
print confusion_matrix_PMI

In [None]:
# Print the accuracy score for Logistic Regression on PMI data
accuracy_PMI = metrics.accuracy_score(y_test_PMI, predictions_PMI)
print accuracy_PMI

## | Task E - Results |
This was a multi-part problem. We had to: (a) figure out which unigrams and bigrams we would test against the reference words when calculating the PMI of the review, (b) calculate the marginal probabilities of the n-grams and the reference words, (c) set up a proximity factor within the PMI algorithm, (d) calculate the joint probability of the combinations of n-grams/reference words, (e) using the PMIs within a review to calculate the average sentiment score for each review, and (f) using the sentiment scores to classify the ratings. 

When choosing the unigrams to test, we decided to use words which appeared in at least 3 % of documents. We then used a count vectorizer to create a dictionary storing the unigrams as keys and the number of documents in which that unigram occurs as values. In creating bigrams, we used three POS tagging patterns (noun+adj, adv+adj, and adj+noun). This resulted in over 27,000 bigrams, most of which, we found, only occured in 1 or 2 documents. Therefore, we limited our bigrams to only those which occurred in more than 10 documents (out of our training set). We created a similar dictionary of counts to track the marginal probability of these bigrams. We repeated a similar process for reference words, using a lexicon of "positive" and "negative" words from the Internet (only keeping the words which actually appeared in our corpus). 

After obtaining marginal probabilities of individual n-grams and reference words, we built a function to get joint probability of the combinations of n-grams with reference words (more detail on this above by the function). The resulting nested dictionary is used for the PMI analysis. 

For calculating the PMI, we calculated averaged the PMI scores for positive reference words and for negative reference words for each n-gram term.  This was done to cancel out any bias from having more positive reference words than negative reference words.  Then, we subtracted the negative PMI average from the positive PMI average to get an overall PMI score on sentiment for the specific word.  Finally, we pulled PMI scores for relevant n-grams on each review and averaged those to get a PMI score for the review as a whole.  These scores were then used to predict rating using logistic regression.


---
# Task F. 

### What are the top 5 “attributes” of a restaurant that are associated with (i) high and (ii) low ratings? That is, when people rate a restaurant high or low, are they more likely to mention service, ambiance, etc.? 

---

In [None]:
# Transform X's without stop words
X_F = vectorizer_stop.fit_transform(X_B.values)
count_df = pd.SparseDataFrame([pd.SparseSeries(X_F[i].toarray().ravel()) 
                               for i in np.arange(X_F.shape[0])],
                              columns = vectorizer_stop.get_feature_names())

In [None]:
count_df['rating']=data_sample['target']

In [None]:
# Split dataframe between high and low
High_df=count_df[count_df['rating']==1]
Low_df=count_df[count_df['rating']==0]

In [None]:
# Get top 5 'Attributes' associated with High Ratings
df_HighRating=pd.DataFrame(High_df.apply(sum))
Top5High = df_HighRating.sort(ascending=False, columns=0)[:5]
print 'Top 5 Attributes Associated with High Ratings:'
print Top5High

In [None]:
# Get top 5 'Attributes' associated with Low Ratings
df_LowRating=pd.DataFrame(Low_df.apply(sum))
Top5Low = df_LowRating.sort(ascending=False, columns=0)[:5]
print 'Top 5 Attributes Associated with Low Ratings:'
print Top5Low

## | Task F - Results |
People that rated the restaurants with a high rating mentioned words like 'rating','food', 'good', 'place', and 'great'.  Reviews that gave a low rating mentioned the same attributes, but we can see that positive adjectives were mentioned much less frequently. 'Food' and 'place' seem to be top concerns for both parties. There is obviously lots of overlap of words used in good and bad restaurant reviews, as there should be, but context is key in determining sentiment. 