In [2]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import neighbors
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit

In [3]:
yelp = pd.read_csv("yelp.csv")

In [4]:
high_mask = yelp['stars'] > 3
yelp['High'] = 0
yelp.ix[high_mask, 'High'] = 1

In [5]:
formula = 'High ~ 0 + votes_cool + votes_funny + votes_useful + Cheap + Moderate + Expensive  ' + \
' + VeryExpensive + American + Chinese + French + Japanese + Indian + Italian + Greek ' + \
' + Mediterranean + Mexican + Thai + Vietnamese + Others'

In [6]:
Y, X = dmatrices(formula, yelp, return_type='dataframe')
y = Y['High'].values

In [7]:
index = StratifiedShuffleSplit(y, n_iter = 1, test_size = 0.3, train_size = 0.7)

In [8]:
index

StratifiedShuffleSplit(labels=[ 0.  0.  1. ...,  1.  0.  1.], n_iter=1, test_size=0.3, random_state=None)

In [9]:
DF_X = DataFrame(X)

In [10]:
for train_index, test_index in index:
    print(train_index)
    X_train, X_test = DF_X.iloc[train_index,], DF_X.iloc[test_index,]
    y_train, y_test = y[train_index], y[test_index]

[16802  5766  1304 ...,  5371 10461  3175]


In [11]:
X_train.head()

Unnamed: 0,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,French,Japanese,Indian,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others
16802,2,1,3,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5766,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1304,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9595,4,3,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1866,0,0,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [12]:
# Logistic Regression

logistic_model = LogisticRegression()
logistic_result = logistic_model.fit(X_train, y_train)

In [13]:
logistic_train_prediction = logistic_model.predict(X_train)
print metrics.accuracy_score(y_train, logistic_train_prediction)

0.685048932067


In [14]:
logistic_test_prediction = logistic_model.predict(X_test)
print metrics.accuracy_score(y_test, logistic_test_prediction)

0.681


In [15]:
# KNN

knn_model = neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform', p=2)
knn_result = knn_model.fit(X_train, y_train)

In [16]:
knn_train_prediction = knn_model.predict(X_train)
print metrics.accuracy_score(y_train, knn_train_prediction)

0.651617972712


In [17]:
knn_test_prediction = knn_model.predict(X_test)
print metrics.accuracy_score(y_test, knn_test_prediction)

0.630166666667


In [18]:
# Classification on text

np.random.seed(1234567)
train = yelp.sample(int(len(yelp)*0.7), replace=False)

In [19]:
test = yelp[~yelp.index.isin(train.index.values)]

In [20]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [21]:
vectorizer = TfidfVectorizer(min_df=2, smooth_idf=True, strip_accents='unicode', norm='l2')

In [22]:
def text_classification (v):
    X_transform=v.fit_transform(train_x)
    X_test=v.transform(test_x)
    
    nb_classifier = MultinomialNB().fit(X_transform, train_y)
    y_nb_predicted = nb_classifier.predict(X_test)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [23]:
text_classification(vectorizer)

Percent Correct
71.933

Confusion Matrix
Predicted    0     1
Actual              
0          312  1679
1            5  4004

Proportion Table
Predicted         0         1
Actual                       
0          0.156705  0.843295
1          0.001247  0.998753


#Julia's addition

Oversampling data set to ensure 50/50 split

In [24]:
highs=yelp[yelp['High']==1]
lows=yelp[yelp['High']==0]

In [26]:
sample_high=highs.sample(len(lows),replace=False).copy()

In [28]:
sample=sample_high.append(lows, ignore_index=True)

In [30]:
sample.head(10)

Unnamed: 0,stars,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,...,Indian,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others,Review,High
0,5,0,0,2,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,My favorite pizza place. My girlfriend is alwa...,1
1,5,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,I went for my birthday and I enjoyed the atmos...,1
2,4,1,0,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,So my boyfriend and I went here after a horrib...,1
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,Mmmmmm... mmmmm... mmmmmmm...That was the soun...,1
4,4,6,4,7,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,At the recommendation of a friend I met a cli...,1
5,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,One of my favorite places to go when I eat out...,1
6,4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,Gotta respond to Dee M. and her critique of th...,1
7,5,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,So great that Potbelly has opened in Phoenix. ...,1
8,4,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,Modern Burger at the airport! Great breakfast ...,1
9,4,0,0,3,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,I love this Chinese restaurant. Being raised i...,1


In [32]:
train=sample.sample(int(0.7*len(sample)),replace=False).copy()
test=sample[~sample.index.isin(train.index.values)].copy()

In [33]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [34]:
text_classification(vectorizer)

Percent Correct
82.575

Confusion Matrix
Predicted     0     1
Actual               
0          1665   289
1           385  1529

Proportion Table
Predicted         0         1
Actual                       
0          0.852098  0.147902
1          0.201149  0.798851
