In [2]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import neighbors
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit
import scipy.sparse

In [3]:
yelp = pd.read_csv("yelp.csv")

In [4]:
high_mask = yelp['stars'] > 3
yelp['High'] = 0
yelp.ix[high_mask, 'High'] = 1

#Task A

In [5]:
formula = 'High ~ 0 + votes_cool + votes_funny + votes_useful + Cheap + Moderate + Expensive  ' + \
' + VeryExpensive + American + Chinese + French + Japanese + Indian + Italian + Greek ' + \
' + Mediterranean + Mexican + Thai + Vietnamese + Others'

In [6]:
Y, X = dmatrices(formula, yelp, return_type='dataframe')
y = Y['High'].values

In [7]:
index = StratifiedShuffleSplit(y, n_iter = 1, test_size = 0.3, train_size = 0.7)

In [8]:
index

StratifiedShuffleSplit(labels=[ 0.  0.  1. ...,  1.  0.  1.], n_iter=1, test_size=0.3, random_state=None)

In [9]:
DF_X = DataFrame(X)

In [10]:
for train_index, test_index in index:
    print(train_index)
    X_train, X_test = DF_X.iloc[train_index,], DF_X.iloc[test_index,]
    y_train, y_test = y[train_index], y[test_index]

[16802  5766  1304 ...,  5371 10461  3175]


In [11]:
X_train.head()

Unnamed: 0,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,French,Japanese,Indian,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others
16802,2,1,3,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5766,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1304,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9595,4,3,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1866,0,0,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


###Logistic Regression

In [35]:
logistic_model = LogisticRegression()
logistic_result = logistic_model.fit(X_train, y_train)

In [13]:
logistic_train_prediction = logistic_model.predict(X_train)
print metrics.accuracy_score(y_train, logistic_train_prediction)

0.685048932067


In [14]:
logistic_test_prediction = logistic_model.predict(X_test)
print metrics.accuracy_score(y_test, logistic_test_prediction)

0.681


###KNN

In [36]:
knn_model = neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform', p=2)
knn_result = knn_model.fit(X_train, y_train)

In [16]:
knn_train_prediction = knn_model.predict(X_train)
print metrics.accuracy_score(y_train, knn_train_prediction)

0.651617972712


In [17]:
knn_test_prediction = knn_model.predict(X_test)
print metrics.accuracy_score(y_test, knn_test_prediction)

0.630166666667


#Task B: Classification on Text

###Random Sampling

In [None]:
np.random.seed(1234567)
train = yelp.sample(int(len(yelp)*0.7), replace=False)

In [19]:
test = yelp[~yelp.index.isin(train.index.values)]

In [20]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [129]:
vectorizer = TfidfVectorizer(min_df=0,smooth_idf=True, strip_accents='unicode', norm='l2')

In [80]:
def text_classification (v):
    X_transform=v.fit_transform(train_x)
    X_test=v.transform(test_x)
    
    nb_classifier = MultinomialNB().fit(X_transform, train_y)
    y_nb_predicted = nb_classifier.predict(X_test)
    
    predict_y=Series(y_nb_predicted).reset_index()[0]
    df=pd.DataFrame()
    df['Predicted']=predict_y
    df['Actual']=test_y.reset_index()['High']
    
    print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
    print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
    print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

In [23]:
text_classification(vectorizer)

Percent Correct
71.933

Confusion Matrix
Predicted    0     1
Actual              
0          312  1679
1            5  4004

Proportion Table
Predicted         0         1
Actual                       
0          0.156705  0.843295
1          0.001247  0.998753


###Undersampling data set to ensure 50/50 split

In [24]:
highs=yelp[yelp['High']==1]
lows=yelp[yelp['High']==0]

In [26]:
sample_high=highs.sample(len(lows),replace=False).copy()

In [28]:
sample=sample_high.append(lows, ignore_index=True)

In [32]:
train=sample.sample(int(0.7*len(sample)),replace=False).copy()
test=sample[~sample.index.isin(train.index.values)].copy()

In [37]:
train_x = train['Review']
train_y = train['High']
test_x = test['Review']
test_y = test['High']

In [34]:
text_classification(vectorizer)

Percent Correct
82.575

Confusion Matrix
Predicted     0     1
Actual               
0          1665   289
1           385  1529

Proportion Table
Predicted         0         1
Actual                       
0          0.852098  0.147902
1          0.201149  0.798851


#Task C

In [130]:
X_transform=vectorizer.fit_transform(train_x) #just the reviews
sam=train.drop(["Review", "stars"], axis=1).copy()
samsparse=scipy.sparse.csr_matrix(sam.to_sparse()) #sparsing the numeric
surprise=scipy.sparse.hstack([X_transform, samsparse]) #combining numeric and text

In [131]:
X_test=vectorizer.transform(test_x) #repeating above for test
samt=test.drop(["Review", "stars"], axis=1).copy() #removing irrelevant
samsparset=scipy.sparse.csr_matrix(samt.to_sparse()) #sparsing numeric 
surpriset=scipy.sparse.hstack([X_test, samsparset]) #combining numeric and text

In [132]:
nb_classifier = MultinomialNB().fit(surprise, train_y)
y_nb_predicted = nb_classifier.predict(surpriset)
    
predict_y=Series(y_nb_predicted).reset_index()[0]
df=pd.DataFrame()
df['Predicted']=predict_y
df['Actual']=test_y.reset_index()['High']

In [133]:
print "Percent Correct\n",round((df['Predicted']==df['Actual']).mean()*100,3)
print "\nConfusion Matrix\n",pd.crosstab(index=df['Actual'],columns=df['Predicted'])
print "\nProportion Table\n", pd.crosstab(index=df['Actual'],columns=df['Predicted']).apply(lambda r: r/r.sum(), axis=1)

Percent Correct
97.828

Confusion Matrix
Predicted     0     1
Actual               
0          1870    84
1             0  1914

Proportion Table
Predicted         0         1
Actual                       
0          0.957011  0.042989
1          0.000000  1.000000


In [None]:
y_nb_predicted = nb_classifier.predict(surprise)
predict_y=Series(y_nb_predicted).reset_index()[0]
df=pd.DataFrame()
df['Predicted']=predict_y
df['Actual']=trai_y.reset_index()['High']