In [41]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame
from patsy import dmatrices
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import neighbors
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
#from sklearn.cross_validation import 
#from sklearn.cross_validation import 
import scipy.sparse
import sklearn.cluster
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
import itertools

In [6]:
yelp = pd.read_csv("yelp.csv")

In [8]:
high_mask = yelp['stars'] > 3
yelp['High'] = 0
yelp.ix[high_mask, 'High'] = 1

#Task A

In [4]:
formula = 'High ~ 0 + votes_cool + votes_funny + votes_useful + Cheap + Moderate + Expensive  ' + \
' + VeryExpensive + American + Chinese + French + Japanese + Indian + Italian + Greek ' + \
' + Mediterranean + Mexican + Thai + Vietnamese + Others'

In [5]:
Y, X = dmatrices(formula, yelp, return_type='dataframe')
y = Y['High'].values

In [6]:
index = StratifiedShuffleSplit(y, n_iter = 1, test_size = 0.3, train_size = 0.7)

In [7]:
index

StratifiedShuffleSplit(labels=[ 0.  0.  1. ...,  1.  0.  1.], n_iter=1, test_size=0.3, random_state=None)

In [8]:
DF_X = DataFrame(X)

In [9]:
for train_index, test_index in index:
    print(train_index)
    X_train, X_test = DF_X.iloc[train_index,], DF_X.iloc[test_index,]
    y_train, y_test = y[train_index], y[test_index]

[15237  6989 16386 ...,  5757 18356   478]


In [10]:
X_train.head()

Unnamed: 0,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,French,Japanese,Indian,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others
15237,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
6989,2,1,4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
16386,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1614,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9218,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0


###Logistic Regression

In [11]:
logistic_model = LogisticRegression()
logistic_result = logistic_model.fit(X_train, y_train)

In [12]:
logistic_train_prediction = logistic_model.predict(X_train)
print metrics.accuracy_score(y_train, logistic_train_prediction)

0.683977426959


In [13]:
logistic_test_prediction = logistic_model.predict(X_test)
print metrics.accuracy_score(y_test, logistic_test_prediction)

0.684


###KNN

In [14]:
knn_model = neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform', p=2)
knn_result = knn_model.fit(X_train, y_train)

In [15]:
knn_train_prediction = knn_model.predict(X_train)
print metrics.accuracy_score(y_train, knn_train_prediction)

0.695406814772


In [16]:
knn_test_prediction = knn_model.predict(X_test)
print metrics.accuracy_score(y_test, knn_test_prediction)

0.6815


#Task B: Classification on Text

###Random Sampling

In [17]:
np.random.seed(1234567)
train = yelp.sample(int(len(yelp)*0.7), replace=False)

In [18]:
test = yelp[~yelp.index.isin(train.index.values)]

In [19]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [20]:
vectorizer = TfidfVectorizer(min_df=0,smooth_idf=True, strip_accents='unicode', norm='l2')

In [21]:
def text_classification (v):
    X_transform=v.fit_transform(train_x)
    X_test=v.transform(test_x)
    
    nb_classifier = MultinomialNB().fit(X_transform, train_y)
    y_nb_predicted = nb_classifier.predict(X_test)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [22]:
text_classification(vectorizer)

Percent Correct
69.617

Confusion Matrix
Predicted   0     1
Actual             
0          86  1821
1           2  4091

Proportion Table
Predicted         0         1
Actual                       
0          0.045097  0.954903
1          0.000489  0.999511


###Undersampling data set to ensure 50/50 split

In [23]:
highs=yelp[yelp['High']==1]
lows=yelp[yelp['High']==0]

In [24]:
sample_high=highs.sample(len(lows),replace=False).copy()

In [25]:
sample=sample_high.append(lows, ignore_index=True)

In [26]:
train=sample.sample(int(0.7*len(sample)),replace=False).copy()
test=sample[~sample.index.isin(train.index.values)].copy()

In [27]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [28]:
text_classification(vectorizer)

Percent Correct
81.334

Confusion Matrix
Predicted     0     1
Actual               
0          1652   280
1           442  1494

Proportion Table
Predicted         0         1
Actual                       
0          0.855072  0.144928
1          0.228306  0.771694


#Task C

In [29]:
X_transform=vectorizer.fit_transform(train_x) #just the reviews
sam=train.drop(["Review", "stars"], axis=1).copy()
samsparse=scipy.sparse.csr_matrix(sam.to_sparse()) #sparsing the numeric
surprise=scipy.sparse.hstack([X_transform, samsparse]) #combining numeric and text

In [30]:
X_test=vectorizer.transform(test_x) #repeating above for test
samt=test.drop(["Review", "stars"], axis=1).copy() #removing irrelevant
samsparset=scipy.sparse.csr_matrix(samt.to_sparse()) #sparsing numeric 
surpriset=scipy.sparse.hstack([X_test, samsparset]) #combining numeric and text

In [31]:
nb_classifier = MultinomialNB().fit(surprise, train_y)
y_nb_predicted = nb_classifier.predict(surpriset)
    
predict_y=Series(y_nb_predicted).reset_index()[0]
df=pd.DataFrame()
df['Predicted']=predict_y
df['Actual']=test_y.reset_index()['High']

In [32]:
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
97.854

Confusion Matrix
Predicted     0     1
Actual               
0          1849    83
1             0  1936

Proportion Table
Predicted         0         1
Actual                       
0          0.957039  0.042961
1          0.000000  1.000000


In [33]:
y_nb_predicted = nb_classifier.predict(surprise)
predict_y=Series(y_nb_predicted).reset_index()[0]
df=pd.DataFrame()
df['Predicted']=predict_y
df['Actual']=train_y.reset_index()['High']
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
98.692

Confusion Matrix
Predicted     0     1
Actual               
0          4396   118
1             0  4510

Proportion Table
Predicted         0         1
Actual                       
0          0.973859  0.026141
1          0.000000  1.000000


#Cross Validation (because this is too good to be true)

In [34]:
def yelper(removeList,sample=sample):
    train=sample.sample(int(0.7*len(sample)),replace=False).copy()
    test=sample[~sample.index.isin(train.index.values)].copy()
    
    train_x = train['Review']
    train_y = train['High']
    test_x = test['Review']
    test_y = test['High']
    
    X_transform=vectorizer.fit_transform(train_x) #just the reviews
    sam=train.drop(removeList, axis=1).copy()
    samsparse=scipy.sparse.csr_matrix(sam.to_sparse()) #sparsing the numeric
    surprise=scipy.sparse.hstack([X_transform, samsparse]) #combining numeric and text
    
    X_test=vectorizer.transform(test_x) #repeating above for test
    samt=test.drop(removeList, axis=1).copy() #removing irrelevant
    samsparset=scipy.sparse.csr_matrix(samt.to_sparse()) #sparsing numeric 
    surpriset=scipy.sparse.hstack([X_test, samsparset]) #combining numeric and text
    
    nb_classifier = MultinomialNB().fit(surprise, train_y)
    y_nb_predicted = nb_classifier.predict(surpriset)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [35]:
rL=["Review","stars","High"]
yelper(rL)

Percent Correct
79.421

Confusion Matrix
Predicted     0     1
Actual               
0          1548   353
1           443  1524

Proportion Table
Predicted         0         1
Actual                       
0          0.814308  0.185692
1          0.225216  0.774784


In [36]:
skf = StratifiedKFold(sample['High'], n_folds=3, random_state=45)

In [37]:
np.random.seed(12)

In [38]:
yelper(rL)

Percent Correct
80.688

Confusion Matrix
Predicted     0     1
Actual               
0          1596   313
1           434  1525

Proportion Table
Predicted         0         1
Actual                       
0          0.836040  0.163960
1          0.221542  0.778458


In [39]:
yelper(rL)

Percent Correct
80.093

Confusion Matrix
Predicted     0     1
Actual               
0          1577   390
1           380  1521

Proportion Table
Predicted         0         1
Actual                       
0          0.801729  0.198271
1          0.199895  0.800105


In [40]:
yelper(rL)

Percent Correct
80.222

Confusion Matrix
Predicted     0     1
Actual               
0          1588   382
1           383  1515

Proportion Table
Predicted         0         1
Actual                       
0          0.806091  0.193909
1          0.201791  0.798209


##Part D

In [16]:
rawsentiment = pd.read_excel('Yelp_Review_Data_Results.xlsx')

In [17]:
rawsentiment['True_Sentiment'] = rawsentiment['Pos Senti'] + rawsentiment['Neg Senti']

In [18]:
rawsentiment['Actual'] = yelp['High']

##Raw Sentiment, 0 defaults to low

In [19]:
rawsentiment['Predicted']=0
rawsentiment['Predicted'].ix[rawsentiment['True_Sentiment']>0]=1
df=rawsentiment
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
58.471

Confusion Matrix
Predicted     0     1
Actual               
0          2354  4092
1          4213  9339

Proportion Table
Predicted         0         1
Actual                       
0          0.365188  0.634812
1          0.310877  0.689123


#Raw Sentiment, 0 defaults to high

In [20]:
rawsentiment['Predicted']=0
rawsentiment['Predicted'].ix[rawsentiment['True_Sentiment']<0]=1
df=rawsentiment
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
35.914

Confusion Matrix
Predicted      0     1
Actual                
0           5457   989
1          11827  1725

Proportion Table
Predicted         0         1
Actual                       
0          0.846572  0.153428
1          0.872713  0.127287


##Part E

In [44]:
DTMReviews = vectorizer.fit_transform(sample['Review'])


In [45]:
#from sklearn.metrics.pairwise import cosine_similarity
#def new_euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
#    return cosine_similarity(X,Y)

# monkey patch (ensure cosine dist function is used)
#from sklearn.cluster import k_means_k_means_.euclidean_distances
#k_means_.euclidean_distances = new_euclidean_distances 

In [46]:
Cluster = sklearn.cluster.KMeans(n_clusters=2, random_state = 1)
clusterout = Cluster.fit(DTMReviews)

In [47]:
series_clusters = Series(clusterout.labels_)

In [48]:
sample['Cluster'] = series_clusters

In [49]:
test = pd.concat([sample, series_clusters], axis=1)

In [50]:
sample.head()

Unnamed: 0,stars,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,...,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others,Review,High,Cluster
0,4,1,0,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,Perfectly complimenting their slightly funky ...,1,1
1,5,1,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,reminds me of back home in Chicago. Love the a...,1,1
2,5,0,0,4,1,0,0,0,0,0,...,1,0,0,0,0,0,0,I LOVE this pizza! It s the best I have had i...,1,1
3,4,0,0,2,0,1,0,0,1,0,...,0,0,0,1,0,0,0,This was an interesting dinner. First it was...,1,0
4,5,1,1,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,I love this place and go there like a regular....,1,1


In [51]:
print "Confusion Matrix \n", pd.crosstab(index=sample['High'],columns=sample['Cluster'])
print "\nPercent Correct\n", round((sample['High']!=sample['Cluster']).mean()*100,3)
print "\nProportion Table\n", pd.crosstab(index=sample['High'],columns=sample['Cluster']).apply(lambda r: r/r.sum(), axis=1)

Confusion Matrix 
Cluster     0     1
High               
0        3422  3024
1        2295  4151

Percent Correct
41.258

Proportion Table
Cluster         0         1
High                       
0        0.530872  0.469128
1        0.356035  0.643965


##Part F

In [23]:
#from nltk import download
#from textblob_aptagger import PerceptronTagger
#from textblob import Blobber

In [27]:
reviews=yelp['Review']
reviews=reviews.str.decode("utf-8")
reviews_high = reviews[yelp['High'] == 1].copy()
reviews_low = reviews[yelp['High'] == 0].copy()
reviews=list(reviews)
reviews[:2]

[u'This location is out of business. I drove by it on my way to Costco and it just has a giant for lease sign.',
 u'= = = = = = CLOSED = = = = = =This JB s location  with it s poor building layout  poor service & mediocre food finally folded.  Good riddance!     :-)']

In [40]:
token_high = reviews_high.map(word_tokenize)
token_low = reviews_low.map(word_tokenize)
token_high = list(token_high)
token_low = list(token_low)

In [44]:
flat_high = list(itertools.chain.from_iterable(token_high))
flat_low = list(itertools.chain.from_iterable(token_low))
high_lower = [t.lower() for t in flat_high if t.isalpha()]
low_lower = [t.lower() for t in flat_low if t.isalpha()]

In [49]:
high_set = set(high_lower)
low_set = set(low_lower)

In [53]:
high_vc = Series(high_lower).value_counts()
low_vc = Series(low_lower).value_counts()

In [61]:
high_clean = [word for word in high_vc.index if word not in stopwords.words('english')]
low_clean = [word for word in low_vc.index if word not in stopwords.words('english')]

In [67]:
tag_high100 = pos_tag(high_clean[:100])
tag_low100 = pos_tag(low_clean[:100])

In [73]:
high_noun = [word for word,tag in tag_high100 if tag == 'NN']
low_noun = [word for word,tag in tag_low100 if tag == 'NN']

In [None]:
def noun_extractor(series):
    token = series.map(word_tokenize)
    tag = token.map(pos_tag)
    [x for x in tag]