In [18]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame
from patsy import dmatrices
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import neighbors
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
#from sklearn.cross_validation import 
#from sklearn.cross_validation import 
import scipy.sparse
import sklearn.cluster

In [19]:
yelp = pd.read_csv("yelp.csv")

In [20]:
high_mask = yelp['stars'] > 3
yelp['High'] = 0
yelp.ix[high_mask, 'High'] = 1

#Task A

In [21]:
formula = 'High ~ 0 + votes_cool + votes_funny + votes_useful + Cheap + Moderate + Expensive  ' + \
' + VeryExpensive + American + Chinese + French + Japanese + Indian + Italian + Greek ' + \
' + Mediterranean + Mexican + Thai + Vietnamese + Others'

In [22]:
Y, X = dmatrices(formula, yelp, return_type='dataframe')
y = Y['High'].values

In [23]:
index = StratifiedShuffleSplit(y, n_iter = 1, test_size = 0.3, train_size = 0.7)

In [24]:
index

StratifiedShuffleSplit(labels=[ 0.  0.  1. ...,  1.  0.  1.], n_iter=1, test_size=0.3, random_state=None)

In [25]:
DF_X = DataFrame(X)

In [26]:
for train_index, test_index in index:
    print(train_index)
    X_train, X_test = DF_X.iloc[train_index,], DF_X.iloc[test_index,]
    y_train, y_test = y[train_index], y[test_index]

[14215 15825 14171 ...,  4988 13549  5476]


In [27]:
X_train.head()

Unnamed: 0,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,French,Japanese,Indian,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others
14215,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
15825,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
14171,1,1,3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
12042,1,1,2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7100,2,2,3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0


###Logistic Regression

In [28]:
logistic_model = LogisticRegression()
logistic_result = logistic_model.fit(X_train, y_train)

In [29]:
logistic_train_prediction = logistic_model.predict(X_train)
print metrics.accuracy_score(y_train, logistic_train_prediction)

0.684691763697


In [30]:
logistic_test_prediction = logistic_model.predict(X_test)
print metrics.accuracy_score(y_test, logistic_test_prediction)

0.684


###KNN

In [31]:
knn_model = neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform', p=2)
knn_result = knn_model.fit(X_train, y_train)

In [32]:
knn_train_prediction = knn_model.predict(X_train)
print metrics.accuracy_score(y_train, knn_train_prediction)

0.67933423816


In [33]:
knn_test_prediction = knn_model.predict(X_test)
print metrics.accuracy_score(y_test, knn_test_prediction)

0.662333333333


#Task B: Classification on Text

###Random Sampling

In [34]:
np.random.seed(1234567)
train = yelp.sample(int(len(yelp)*0.7), replace=False)

In [35]:
test = yelp[~yelp.index.isin(train.index.values)]

In [36]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [37]:
vectorizer = TfidfVectorizer(min_df=0,smooth_idf=True, strip_accents='unicode', norm='l2')

In [38]:
def text_classification (v):
    X_transform=v.fit_transform(train_x)
    X_test=v.transform(test_x)
    
    nb_classifier = MultinomialNB().fit(X_transform, train_y)
    y_nb_predicted = nb_classifier.predict(X_test)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [39]:
text_classification(vectorizer)

Percent Correct
68.033

Confusion Matrix
Predicted   0     1
Actual             
0          66  1917
1           1  4016

Proportion Table
Predicted         0         1
Actual                       
0          0.033283  0.966717
1          0.000249  0.999751


###Undersampling data set to ensure 50/50 split

In [40]:
highs=yelp[yelp['High']==1]
lows=yelp[yelp['High']==0]

In [41]:
sample_high=highs.sample(len(lows),replace=False).copy()

In [42]:
sample=sample_high.append(lows, ignore_index=True)

In [43]:
train=sample.sample(int(0.7*len(sample)),replace=False).copy()
test=sample[~sample.index.isin(train.index.values)].copy()

In [44]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [45]:
text_classification(vectorizer)

Percent Correct
81.593

Confusion Matrix
Predicted     0     1
Actual               
0          1686   254
1           458  1470

Proportion Table
Predicted         0         1
Actual                       
0          0.869072  0.130928
1          0.237552  0.762448


#Task C

In [46]:
X_transform=vectorizer.fit_transform(train_x) #just the reviews
sam=train.drop(["Review", "stars"], axis=1).copy()
samsparse=scipy.sparse.csr_matrix(sam.to_sparse()) #sparsing the numeric
surprise=scipy.sparse.hstack([X_transform, samsparse]) #combining numeric and text

In [47]:
X_test=vectorizer.transform(test_x) #repeating above for test
samt=test.drop(["Review", "stars"], axis=1).copy() #removing irrelevant
samsparset=scipy.sparse.csr_matrix(samt.to_sparse()) #sparsing numeric 
surpriset=scipy.sparse.hstack([X_test, samsparset]) #combining numeric and text

In [48]:
nb_classifier = MultinomialNB().fit(surprise, train_y)
y_nb_predicted = nb_classifier.predict(surpriset)
    
predict_y=Series(y_nb_predicted).reset_index()[0]
df=pd.DataFrame()
df['Predicted']=predict_y
df['Actual']=test_y.reset_index()['High']

In [49]:
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
98.035

Confusion Matrix
Predicted     0     1
Actual               
0          1864    76
1             0  1928

Proportion Table
Predicted         0         1
Actual                       
0          0.960825  0.039175
1          0.000000  1.000000


In [50]:
y_nb_predicted = nb_classifier.predict(surprise)
predict_y=Series(y_nb_predicted).reset_index()[0]
df=pd.DataFrame()
df['Predicted']=predict_y
df['Actual']=train_y.reset_index()['High']
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
98.881

Confusion Matrix
Predicted     0     1
Actual               
0          4405   101
1             0  4518

Proportion Table
Predicted         0         1
Actual                       
0          0.977585  0.022415
1          0.000000  1.000000


#Cross Validation (because this is too good to be true)

In [51]:
def yelper(removeList,sample=sample):
    train=sample.sample(int(0.7*len(sample)),replace=False).copy()
    test=sample[~sample.index.isin(train.index.values)].copy()
    
    train_x = train['Review']
    train_y = train['High']
    test_x = test['Review']
    test_y = test['High']
    
    X_transform=vectorizer.fit_transform(train_x) #just the reviews
    sam=train.drop(removeList, axis=1).copy()
    samsparse=scipy.sparse.csr_matrix(sam.to_sparse()) #sparsing the numeric
    surprise=scipy.sparse.hstack([X_transform, samsparse]) #combining numeric and text
    
    X_test=vectorizer.transform(test_x) #repeating above for test
    samt=test.drop(removeList, axis=1).copy() #removing irrelevant
    samsparset=scipy.sparse.csr_matrix(samt.to_sparse()) #sparsing numeric 
    surpriset=scipy.sparse.hstack([X_test, samsparset]) #combining numeric and text
    
    nb_classifier = MultinomialNB().fit(surprise, train_y)
    y_nb_predicted = nb_classifier.predict(surpriset)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [60]:
rL=["Review","stars","High"]
yelper(rL)

Percent Correct
79.524

Confusion Matrix
Predicted     0     1
Actual               
0          1541   398
1           394  1535

Proportion Table
Predicted         0         1
Actual                       
0          0.794740  0.205260
1          0.204251  0.795749


In [61]:
skf = StratifiedKFold(sample['High'], n_folds=3, random_state=45)

In [62]:
np.random.seed(12)

In [63]:
yelper(rL)

Percent Correct
80.222

Confusion Matrix
Predicted     0     1
Actual               
0          1572   344
1           421  1531

Proportion Table
Predicted         0         1
Actual                       
0          0.820459  0.179541
1          0.215676  0.784324


In [64]:
yelper(rL)

Percent Correct
80.119

Confusion Matrix
Predicted     0     1
Actual               
0          1565   426
1           343  1534

Proportion Table
Predicted         0         1
Actual                       
0          0.786037  0.213963
1          0.182738  0.817262


In [65]:
yelper(rL)

Percent Correct
80.3

Confusion Matrix
Predicted     0     1
Actual               
0          1572   369
1           393  1534

Proportion Table
Predicted         0         1
Actual                       
0          0.809892  0.190108
1          0.203944  0.796056


##Part D

In [66]:
rawsentiment = pd.read_excel(r'C:\Users\beins_000\Documents\GitHub\Text_Mining_MIS184N\Group_Assignment_2\Yelp_Review_Data_Results.xlsx')

In [67]:
rawsentiment.head(20)

Unnamed: 0,Review,Pos Senti,Neg Senti
0,CLOSED This JB s locati...,2,-2
1,This is just a basic (albeit mini) chain greas...,2,-2
2,Whenever I offer to take my mom out to lunch s...,4,-4
3,If I say it wasn t as bad as I was expecting i...,3,-3
4,I ve always said if the guacamole chips and s...,3,-3
5,Had the signature Black Chile entree. It was ...,3,-1
6,After hitting up the bank to sign some paper w...,4,-4
7,Great happy hour deals here! I loved the Cotij...,4,-5
8,Fine. Just fine. C /B- average-- all around. ...,4,-3
9,beautiful atmosphere...good prices (for the bi...,3,-1


In [68]:
rawsentiment['True_Sentiment'] = rawsentiment['Pos Senti'] + rawsentiment['Neg Senti']

##Part E

In [75]:
DTMReviews = vectorizer.fit_transform(sample['Review'])


KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [111]:
#from sklearn.metrics.pairwise import cosine_similarity
#def new_euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
#    return cosine_similarity(X,Y)

# monkey patch (ensure cosine dist function is used)
#from sklearn.cluster import k_means_k_means_.euclidean_distances
#k_means_.euclidean_distances = new_euclidean_distances 

SyntaxError: invalid syntax (<ipython-input-111-543d08913994>, line 6)

In [95]:
Cluster = sklearn.cluster.KMeans(n_clusters=2, random_state = 1)
clusterout = Cluster.fit(DTMReviews)

In [96]:
series_clusters = Series(clusterout.labels_)

In [97]:
sample['Cluster'] = series_clusters

In [98]:
test = pd.concat([sample, series_clusters], axis=1)

In [99]:
sample.head()

Unnamed: 0,stars,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,...,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others,Review,High,Cluster
0,4,1,0,2,0,1,0,0,0,0,...,1,0,0,0,0,0,0,I can t believe I m the first person to review...,1,1
1,5,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,We were told that this is the best place to ea...,1,0
2,5,15,11,15,0,1,0,0,0,0,...,0,0,0,0,0,0,1,After having visited the Tempe location I was...,1,0
3,5,2,1,3,0,1,0,0,1,0,...,0,0,0,1,0,0,0,Dick s and the related restaurant Richardson ...,1,1
4,5,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,My husband and I went to Roma Garden for dinn...,1,0


In [100]:
print "Confusion Matrix \n", pd.crosstab(index=sample['High'],columns=sample['Cluster'])
print "\nPercent Correct\n", round((sample['High']!=sample['Cluster']).mean()*100,3)
print "\nProportion Table\n", pd.crosstab(index=sample['High'],columns=sample['Cluster']).apply(lambda r: r/r.sum(), axis=1)

Confusion Matrix 
Cluster     0     1
High               
0        3425  3021
1        2287  4159

Percent Correct
41.173

Proportion Table
Cluster         0         1
High                       
0        0.531337  0.468663
1        0.354794  0.645206


##Part F