In [1]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


%matplotlib notebook

from sklearn.cross_validation import train_test_split

# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### Read the yelp_labelled data, splitting columns on the tab character (`\t`)

In [2]:
# Location of the raw review file on GitHub; it is read as tab-separated text.
url = (
    "https://raw.githubusercontent.com/ga-students/DS-SF-24/"
    "master/Data/yelp_labelled.txt"
)


#### Put your yelp data into a dataframe and drop na values.

In [3]:
# Read the tab-separated file with explicitly supplied column names, then
# discard every row that contains a missing value.
yelp_data = (
    pd.read_csv(url, sep="\t", names=["text", "sentiment"])
    .dropna()
)

# Preview the first rows of the cleaned frame.
yelp_data.head()

Unnamed: 0,text,sentiment
0,Wow... Loved this place.,1.0
3,Crust is not good.,0.0
4,Not tasty and the texture was just nasty.,0.0
10,Stopped by during the late May bank holiday of...,1.0
11,The selection on the menu was great and so wer...,1.0


In [4]:
# Number of reviews remaining after rows with missing values were dropped.
yelp_data.shape[0]

1000

#### Using Pipeline, RandomForestClassifier, and GridSearchCV, play with min_df and max_df on your yelp data. Split your data into test and training sets. You can use either CountVectorizer or TfidfVectorizer.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV is imported from sklearn.model_selection: the older
# sklearn.grid_search module was deprecated in scikit-learn 0.18 and removed
# in 0.20, so importing from it fails on current installations.
from sklearn.model_selection import GridSearchCV

In [6]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [7]:
# Hold out 20% of the reviews for the final evaluation. The fixed random_state
# makes the shuffle deterministic, so the same rows land in the training and
# test sets every time the notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(yelp_data['text'],
                                                    yelp_data['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42)

In [8]:
# Bag-of-words counts feeding a random forest. The step names 'vect' and 'clf'
# are the prefixes used by the grid-search parameter keys, so they must not be
# renamed. The forest is seeded so that repeated fits on the same data produce
# the same model instead of a different one on every run.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', RandomForestClassifier(random_state=42))])

In [9]:
# Search space, keyed with the Pipeline "<step>__<parameter>" convention:
# five min_df values crossed with six max_df values, with the forest size
# held at a single setting.
parameters = dict(
    vect__min_df=list(range(1, 6)),
    vect__max_df=[10, 100, 200, 500, 750, 1000],
    clf__n_estimators=[1000],
)

# Exhaustive search over the grid with 10-fold cross-validation; n_jobs=-1
# lets the fits run in parallel.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)

In [10]:
# Run the grid search on the training split only; fit() returns the fitted
# search object, which is stored under the name the following cells use.
fit_grid = gs_clf.fit(X=X_train, y=y_train)

# Score the search's best estimator on the held-out reviews.
fit_grid.score(X=X_test, y=y_test)

0.71999999999999997

In [14]:
# Parameter combination selected by the grid search for the CountVectorizer pipeline.
fit_grid.best_params_

{'clf__n_estimators': 1000, 'vect__max_df': 200, 'vect__min_df': 1}

In [15]:
# Now with the TF-IDF vectorizer in place of raw counts. The step names 'vect'
# and 'clf' are the prefixes used by the grid-search parameter keys, so they
# must not be renamed. The forest is seeded so that repeated fits on the same
# data produce the same model instead of a different one on every run.
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(random_state=42))])

In [16]:
# Search space, keyed with the Pipeline "<step>__<parameter>" convention:
# five min_df values crossed with six max_df values, with the forest size
# held at a single setting.
parameters = dict(
    vect__min_df=list(range(1, 6)),
    vect__max_df=[10, 100, 200, 500, 750, 1000],
    clf__n_estimators=[1000],
)

# Exhaustive search over the grid with 10-fold cross-validation; n_jobs=-1
# lets the fits run in parallel.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=10, n_jobs=-1)

In [17]:
# Run the grid search on the training split only; fit() returns the fitted
# search object, which is stored under the name the following cells use.
fit_grid = gs_clf.fit(X=X_train, y=y_train)

# Score the search's best estimator on the held-out reviews.
fit_grid.score(X=X_test, y=y_test)

0.745

In [18]:
# Parameter combination selected by the grid search for the TfidfVectorizer pipeline.
fit_grid.best_params_

{'clf__n_estimators': 1000, 'vect__max_df': 100, 'vect__min_df': 2}

#### How much test error do you get based on the optimizer you found above?

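About 28% test error with CountVectorizer (1 − 0.72 accuracy) and about 25.5% with TfidfVectorizer (1 − 0.745 accuracy)... which doesn't seem all that great. *sigh*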

#### Look over a few (the first 5) X_test instances and compare the category predicted for each observation with the actual review sentence.

In [11]:
# Predict once for the whole test set. The original called predict() inside
# the loop, pushing every test review through the fitted forest again on each
# of the five iterations just to read a single element of the result.
test_predictions = fit_grid.predict(X_test)

# Print each of the first five predicted labels followed by the review text
# it was predicted for.
for predicted_label, review_text in zip(test_predictions[:5], X_test.values[:5]):
    print(predicted_label)
    print(review_text)

0.0
I ate there twice on my last visit, and especially enjoyed the salmon salad.
0.0
The guys all had steaks, and our steak loving son who has had steak at the best and worst places said it was the best steak he's ever eaten.
0.0
This place is overpriced, not consistent with their boba, and it really is OVERPRICED!
1.0
DELICIOUS!!
0.0
I'm not really sure how Joey's was voted best hot dog in the Valley by readers of Phoenix Magazine.


## Bonus Questions: Can you find the test instances that are correctly classified and those that are misclassified?

In [12]:
#Misclassified instances



In [13]:
#Correctly Classified instances
