In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


data = pd.read_csv('reviews.csv', delimiter='\t')

In [28]:
data['RatingValue'].value_counts()

5    828
4    637
3    297
2     86
1     72
Name: RatingValue, dtype: int64

In [29]:
data['Sentiment'] = np.where(data['RatingValue']>=4, 2, np.where(data['RatingValue']==3, 1, 0))

In [30]:
data['Sentiment'].value_counts()

2    1465
1     297
0     158
Name: Sentiment, dtype: int64

In [31]:
data_remove = data[data.Sentiment==2].sample(frac=.8)

In [32]:
data_undersample=data.drop(data_remove.index)

In [33]:
data_undersample['Sentiment'].value_counts()

1    297
2    293
0    158
Name: Sentiment, dtype: int64

In [34]:
data_undersample = data_undersample[['Sentiment','Review']]
print(data_undersample)

      Sentiment                                             Review
0             2  I was tasked with finding a spot for a group d...
1             1  Went here with my friends and family. I liked ...
2             1  Surprisingly good Flautas! They came as 3 roll...
3             2  As a Mexican I always crave authentic Mexican ...
12            1  My friend and I decided on Mexican food for di...
...         ...                                                ...
1893          2  Yes I gave four stars for AYCE for the most af...
1895          0  NEVER ORDER FROM THIS RESTAURANT! We order fro...
1901          2  Got delivery from here. Ordered the Malaysian ...
1907          2  Friendly service, quick delivery and delicious...
1915          0  Maybe I didn't know what to expect but the win...

[748 rows x 2 columns]


In [35]:
train,valid = train_test_split(data_undersample, test_size=0.2, random_state=42)
train=train.reset_index(drop='True')
valid=valid.reset_index(drop='True')

train.to_csv('train.csv')
valid.to_csv('valid.csv')

In [36]:
data_train = pd.read_csv('train.csv')
data_valid = pd.read_csv('valid.csv')

data_train = data_train[['Sentiment','Review']]
data_valid = data_valid[['Sentiment','Review']]

In [37]:
print(data_valid)

     Sentiment                                             Review
0            2  Amazing food, really flavourful and especially...
1            2  My girlfriend and I have never been to Toronto...
2            1  I came here to try out some tacos.  The tacos ...
3            2  I wanted to give this pace 5* because of their...
4            1  The only reason I visited was because I had a ...
..         ...                                                ...
145          2  What I got: Eggplant with beef ($14), Chili fi...
146          1  We went on a Friday night and the wait was 40 ...
147          2  This ramen place is always reliable when you'r...
148          0  Came here to celebrate my Friends Birthday, Fo...
149          1  I came here for a birthday dinner on Saturday ...

[150 rows x 2 columns]


In [23]:
data_train['Sentiment'].value_counts()

1    239
2    228
0    131
Name: Sentiment, dtype: int64

In [24]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

text_clf.fit(data_train['Review'], data_train['Sentiment'])
prediction = text_clf.predict(data_valid['Review'])

In [25]:
accuracy=np.mean(prediction == data_valid['Sentiment'])

In [26]:
f1score=metrics.f1_score(data_valid['Sentiment'], prediction, average='weighted')

In [27]:
confusion=pd.DataFrame(metrics.confusion_matrix(data_valid['Sentiment'], prediction), index=['negative', 'neutral', 'positive'],columns=['negative', 'neutral', 'positive'])

In [28]:
print("accuracy:", accuracy)
print("")
print("F1_score:", f1score)
print("")
print("Confusion_matrix:")
print(confusion)

accuracy: 0.68

F1_score: 0.6792477505754342

Confusion_matrix:
          negative  neutral  positive
negative        17       10         4
neutral          6       37        13
positive         0       15        48


In [29]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [30]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [31]:
gs_clf = gs_clf.fit(data_train['Review'], data_train['Sentiment'])

In [32]:
gs_clf.best_score_

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [33]:
gs_clf.best_score_

0.6304201680672269

In [34]:
prediction2 = gs_clf.predict(data_valid['Review'])

In [35]:
accuracy2=np.mean(prediction2 == data_valid['Sentiment'])

In [36]:
f1score2=metrics.f1_score(data_valid['Sentiment'], prediction2, average='weighted')

In [37]:
confusion2=pd.DataFrame(metrics.confusion_matrix(data_valid['Sentiment'], prediction2), index=['negative', 'neutral', 'positive'],columns=['negative', 'neutral', 'positive'])

In [38]:
print("accuracy:", accuracy2)
print("")
print("F1_score:", f1score2)
print("")
print("Confusion_matrix:")
print(confusion2)

accuracy: 0.6666666666666666

F1_score: 0.6592418300653594

Confusion_matrix:
          negative  neutral  positive
negative        12       13         6
neutral          2       46         8
positive         0       21        42
