In [5]:
import pandas as pd
from pandas import Series
from pandas import DataFrame
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import neighbors
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [8]:
# Load the review data from yelp.csv (path is relative to the working directory).
yelp = pd.read_csv("yelp.csv")

In [9]:
# Binary target: High = 1 for reviews rated above 3 stars, 0 otherwise.
high_mask = yelp['stars'] > 3
yelp['High'] = 0
# .loc is the label/boolean-mask indexer; the older .ix indexer is deprecated
# and no longer exists in current pandas releases.
yelp.loc[high_mask, 'High'] = 1

In [10]:
# patsy formula for the tabular models: High is the response; the predictors
# are the three vote counts, the price-tier indicators and the cuisine indicators.
# Adjacent string literals inside the parentheses are joined by the parser.
formula = (
    'High ~ 0 + votes_cool + votes_funny + votes_useful + Cheap + Moderate + Expensive  '
    ' + VeryExpensive + American + Chinese + French + Japanese + Indian + Italian + Greek '
    ' + Mediterranean + Mexican + Thai + Vietnamese + Others'
)

In [11]:
# Build the response (Y) and predictor (X) design matrices from the formula,
# returned as pandas DataFrames.
Y, X = dmatrices(formula, yelp, return_type='dataframe')
# Plain array of the High column, used as the label vector for the models.
y = Y['High'].values

In [12]:
# Hold out 30% of the rows for testing; random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [13]:
# Logistic Regression
# Fit a LogisticRegression with default settings on the training split.

logistic_model = LogisticRegression()
logistic_result = logistic_model.fit(X_train, y_train)

In [14]:
# Accuracy of the logistic model on the rows it was fitted to.
logistic_train_prediction = logistic_model.predict(X_train)
# Parenthesised single-argument print gives the same output on Python 2 and
# also runs on Python 3.
print(metrics.accuracy_score(y_train, logistic_train_prediction))

0.684977498393


In [15]:
# Accuracy of the logistic model on the held-out rows.
logistic_test_prediction = logistic_model.predict(X_test)
# Parenthesised single-argument print gives the same output on Python 2 and
# also runs on Python 3.
print(metrics.accuracy_score(y_test, logistic_test_prediction))

0.685


In [16]:
# KNN
# k-nearest-neighbours classifier: 15 neighbours, uniform weights, p=2,
# fitted on the same training split as the logistic model.

knn_model = neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform', p=2)
knn_result = knn_model.fit(X_train, y_train)

In [17]:
# Accuracy of the KNN model on the rows it was fitted to.
knn_train_prediction = knn_model.predict(X_train)
# Parenthesised single-argument print gives the same output on Python 2 and
# also runs on Python 3.
print(metrics.accuracy_score(y_train, knn_train_prediction))

0.687406243303


In [18]:
# Accuracy of the KNN model on the held-out rows.
knn_test_prediction = knn_model.predict(X_test)
# Parenthesised single-argument print gives the same output on Python 2 and
# also runs on Python 3.
print(metrics.accuracy_score(y_test, knn_test_prediction))

0.6665


In [19]:
# Classification on text
# Seed NumPy's global RNG before drawing the training rows.
# NOTE(review): this only makes the draw repeatable if DataFrame.sample uses the
# global NumPy RNG when no random_state is passed -- confirm for the installed pandas.

np.random.seed(1234567)
# Training set: 70% of the rows (rounded down), drawn without replacement.
train = yelp.sample(int(len(yelp)*0.7), replace=False)

In [20]:
# Test set: every row of yelp whose index label was not drawn into train.
test = yelp[~yelp.index.isin(train.index.values)]

In [21]:
# Separate the review text (model input) from the High label for each partition.
# text_classification reads these four names from the notebook namespace.
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [22]:
# TF-IDF vectorizer configuration: min_df=2, smoothed IDF, unicode accent
# stripping, L2 normalisation. It is fitted later, inside text_classification.
vectorizer = TfidfVectorizer(min_df=2, smooth_idf=True, strip_accents='unicode', norm='l2')

In [23]:
def text_classification(v, train_x=None, train_y=None, test_x=None, test_y=None):
    """Fit a multinomial Naive Bayes text classifier and print its test report.

    The vectorizer is fitted on the training documents and then applied to the
    test documents. A MultinomialNB model is trained on the resulting features
    and its predictions on the test documents are compared with the true labels.

    Parameters
    ----------
    v : object
        Vectorizer exposing ``fit_transform`` and ``transform``
        (e.g. a TfidfVectorizer).
    train_x, train_y : optional
        Training documents and their labels.
    test_x, test_y : optional
        Test documents and their labels.

    Any of the four data arguments left as ``None`` is looked up in the
    notebook's global namespace under the same name, so existing calls of the
    form ``text_classification(vectorizer)`` behave as before.

    Returns
    -------
    None
        The report is printed: percent correct, confusion matrix, and the
        confusion matrix with each row divided by its row total.
    """
    # Fall back to the notebook-level splits for callers that pass only `v`.
    notebook_ns = globals()
    if train_x is None:
        train_x = notebook_ns['train_x']
    if train_y is None:
        train_y = notebook_ns['train_y']
    if test_x is None:
        test_x = notebook_ns['test_x']
    if test_y is None:
        test_y = notebook_ns['test_y']

    train_features = v.fit_transform(train_x)
    test_features = v.transform(test_x)

    nb_classifier = MultinomialNB().fit(train_features, train_y)
    y_nb_predicted = nb_classifier.predict(test_features)

    # Put predictions and true labels on the same 0..n-1 index so they compare
    # position by position. drop=True discards the original row labels without
    # relying on the label Series being named 'High'.
    df = pd.DataFrame()
    df['Predicted'] = Series(y_nb_predicted)
    df['Actual'] = Series(test_y).reset_index(drop=True)

    percent_correct = round((df['Predicted'] == df['Actual']).mean() * 100, 3)
    # Build the confusion matrix once and derive the proportions from it.
    confusion = pd.crosstab(index=df['Actual'], columns=df['Predicted'])
    proportions = confusion.div(confusion.sum(axis=1), axis=0)

    # One value per print call: identical output on Python 2, valid on Python 3.
    print("Percent Correct")
    print(percent_correct)
    print("\nConfusion Matrix")
    print(confusion)
    print("\nProportion Table")
    print(proportions)

In [24]:
# Evaluate Naive Bayes on TF-IDF features using the 70/30 split of the full data.
text_classification(vectorizer)

Percent Correct
72.083

Confusion Matrix
Predicted    0     1
Actual              
0          309  1668
1            7  4016

Proportion Table
Predicted         0         1
Actual                       
0          0.156297  0.843703
1          0.001740  0.998260


In [25]:
# Class balance of the training labels.
train_y.value_counts()

1    9530
0    4469
dtype: int64

In [26]:
# Class balance of the test labels.
test_y.value_counts()

1    4023
0    1977
dtype: int64

In [69]:
# Text Classification with 50/50 splits
# Partition the reviews by label so each class can be sampled separately.

highs = yelp[yelp['High']==1]
lows = yelp[yelp['High']==0]

In [77]:
# Draw the same number of rows from each class so the combined sample is 50/50.
# The count is the size of the smaller class rather than a hard-coded row
# count, so it stays valid if the data set changes.
n_per_class = min(len(highs), len(lows))
sample_highs = highs.sample(n_per_class, replace=False).copy()
sample_lows = lows.sample(n_per_class, replace=False).copy()

In [78]:
# Stack the two class samples, keeping the original row labels because the
# train/test split is done by index label. pd.concat replaces
# DataFrame.append, which is deprecated and absent from pandas 2.x.
sample = pd.concat([sample_highs, sample_lows], ignore_index=False)

In [79]:
# 70/30 split of the balanced sample. This rebinds `train` and `test`, replacing
# the partitions of the full data set created earlier in the notebook.
train = sample.sample(int(0.7*len(sample)), replace=False).copy()
# Test set: rows of the balanced sample whose index label is not in train.
test = sample[~sample.index.isin(train.index.values)]

In [80]:
# Rebind the text/label globals to the balanced partitions;
# text_classification reads these four names from the notebook namespace.
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [81]:
# Fresh, unfitted TF-IDF vectorizer for the balanced experiment
# (same settings as the one used for the full-data run).
vectorizer = TfidfVectorizer(min_df=2, smooth_idf=True, strip_accents='unicode', norm='l2')

In [82]:
def text_classification(v, train_x=None, train_y=None, test_x=None, test_y=None):
    """Fit a multinomial Naive Bayes text classifier and print its test report.

    The vectorizer is fitted on the training documents and then applied to the
    test documents. A MultinomialNB model is trained on the resulting features
    and its predictions on the test documents are compared with the true labels.

    Parameters
    ----------
    v : object
        Vectorizer exposing ``fit_transform`` and ``transform``
        (e.g. a TfidfVectorizer).
    train_x, train_y : optional
        Training documents and their labels.
    test_x, test_y : optional
        Test documents and their labels.

    Any of the four data arguments left as ``None`` is looked up in the
    notebook's global namespace under the same name, so existing calls of the
    form ``text_classification(vectorizer)`` behave as before.

    Returns
    -------
    None
        The report is printed: percent correct, confusion matrix, and the
        confusion matrix with each row divided by its row total.
    """
    # Fall back to the notebook-level splits for callers that pass only `v`.
    notebook_ns = globals()
    if train_x is None:
        train_x = notebook_ns['train_x']
    if train_y is None:
        train_y = notebook_ns['train_y']
    if test_x is None:
        test_x = notebook_ns['test_x']
    if test_y is None:
        test_y = notebook_ns['test_y']

    train_features = v.fit_transform(train_x)
    test_features = v.transform(test_x)

    nb_classifier = MultinomialNB().fit(train_features, train_y)
    y_nb_predicted = nb_classifier.predict(test_features)

    # Put predictions and true labels on the same 0..n-1 index so they compare
    # position by position. drop=True discards the original row labels without
    # relying on the label Series being named 'High'.
    df = pd.DataFrame()
    df['Predicted'] = Series(y_nb_predicted)
    df['Actual'] = Series(test_y).reset_index(drop=True)

    percent_correct = round((df['Predicted'] == df['Actual']).mean() * 100, 3)
    # Build the confusion matrix once and derive the proportions from it.
    confusion = pd.crosstab(index=df['Actual'], columns=df['Predicted'])
    proportions = confusion.div(confusion.sum(axis=1), axis=0)

    # One value per print call: identical output on Python 2, valid on Python 3.
    print("Percent Correct")
    print(percent_correct)
    print("\nConfusion Matrix")
    print(confusion)
    print("\nProportion Table")
    print(proportions)

In [83]:
# Evaluate Naive Bayes on TF-IDF features using the class-balanced split.
text_classification(vectorizer)

Percent Correct
81.903

Confusion Matrix
Predicted     0     1
Actual               
0          1608   323
1           377  1560

Proportion Table
Predicted         0         1
Actual                       
0          0.832729  0.167271
1          0.194631  0.805369
