In [69]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import neighbors
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit

In [14]:
yelp = pd.read_csv("yelp.csv")

In [15]:
high_mask = yelp['stars'] > 3
yelp['High'] = 0
yelp.ix[high_mask, 'High'] = 1

In [16]:
formula = 'High ~ 0 + votes_cool + votes_funny + votes_useful + Cheap + Moderate + Expensive  ' + \
' + VeryExpensive + American + Chinese + French + Japanese + Indian + Italian + Greek ' + \
' + Mediterranean + Mexican + Thai + Vietnamese + Others'

In [17]:
Y, X = dmatrices(formula, yelp, return_type='dataframe')
y = Y['High'].values

In [55]:
index = StratifiedShuffleSplit(y, n_iter = 1, test_size = 0.3, train_size = 0.7)

In [51]:
index

StratifiedShuffleSplit(labels=[ 0.  0.  1. ...,  1.  0.  1.], n_iter=1, test_size=0.3, random_state=None)

In [56]:
DF_X = DataFrame(X)

In [60]:
for train_index, test_index in index:
    print(train_index)
    X_train, X_test = DF_X.iloc[train_index,], DF_X.iloc[test_index,]
    y_train, y_test = y[train_index], y[test_index]

[18639  9783  2568 ...,  9408 10554  7261]


In [61]:
X_train.head()

Unnamed: 0,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,French,Japanese,Indian,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others
18639,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9783,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2568,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
12730,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
18368,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [62]:
# Logistic Regression

logistic_model = LogisticRegression()
logistic_result = logistic_model.fit(X_train, y_train)

In [63]:
logistic_train_prediction = logistic_model.predict(X_train)
print metrics.accuracy_score(y_train, logistic_train_prediction)

0.684048860633


In [64]:
logistic_test_prediction = logistic_model.predict(X_test)
print metrics.accuracy_score(y_test, logistic_test_prediction)

0.682


In [65]:
# KNN

knn_model = neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform', p=2)
knn_result = knn_model.fit(X_train, y_train)

In [66]:
knn_train_prediction = knn_model.predict(X_train)
print metrics.accuracy_score(y_train, knn_train_prediction)

0.69312093721


In [67]:
knn_test_prediction = knn_model.predict(X_test)
print metrics.accuracy_score(y_test, knn_test_prediction)

0.674333333333


In [70]:
# Classification on text

np.random.seed(1234567)
train = yelp.sample(int(len(yelp)*0.7), replace=False)

In [113]:
test = yelp[~yelp.index.isin(train.index.values)]

In [71]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [72]:
vectorizer = TfidfVectorizer(min_df=2, smooth_idf=True, strip_accents='unicode', norm='l2')

In [73]:
def text_classification (v):
    X_transform=v.fit_transform(train_x)
    X_test=v.transform(test_x)
    
    nb_classifier = MultinomialNB().fit(X_transform, train_y)
    y_nb_predicted = nb_classifier.predict(X_test)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [74]:
text_classification(vectorizer)

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.