# Using review to predict Rating
In this notebook, I will try to use the reviews in the "Amazon_Unlocked_Mobile.csv" to predict the Ratings

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

In [3]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [4]:
len(df)

413840

In [5]:
df.dropna(inplace = True)

In [6]:
len(df)

334335

In [7]:
#regard rating 3 as neutral and drop it
df=df[df['Rating']!= 3]
len(df)

308277

In [8]:
df['Positive Review']= df['Rating']>3

In [9]:
df['Positive Review'] = df['Positive Review'].apply(int)

In [10]:
X = df['Reviews']
y = df['Positive Review']

In [11]:
y.mean()

0.7482686025879323

# Use CountVectorizer and LogisticRegression

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 42)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vect = CountVectorizer().fit(X_train)

In [16]:
X_train_vectorized = vect.transform(X_train)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
lr_clf = LogisticRegression()
lr_clf.fit(X_train_vectorized,y_train)
y_pred = lr_clf.predict(vect.transform(X_test))
print("AUC score: ", roc_auc_score(y_pred,y_test))

AUC score:  0.940570489366771


In [18]:
lr_clf.predict(vect.transform(['good','not good','bad','not bad']))

array([1, 1, 0, 0], dtype=int64)

we can see that our model will ignore 'not', so maybe we can use 2-gram to improve the performance of our model
# Use 2-gram

In [37]:
lr_clf1 = LogisticRegression()
vect = CountVectorizer(ngram_range=(1,2)).fit(X_train)
X_train_vect = vect.transform(X_train)
lr_clf1.fit(X_train_vect,y_train)
y_pred1= lr_clf1.predict(vect.transform(X_test))
print("AUC score using 2-gram :", roc_auc_score(y_test,y_pred1))

AUC score using 2-gram : 0.9650655031770067


In [20]:
lr_clf.predict(vect.transform(['good','not good','bad','not bad']))

array([1, 0, 0, 1], dtype=int64)

Can  we use the min_df para in CountVectorizer to improve the performance?

In [21]:
vect = CountVectorizer(ngram_range=(1,2),min_df = 5).fit(X_train)

In [22]:
X_train_vect2 = vect.transform(X_train)
lr_clf.fit(X_train_vect2,y_train)
y_pred = lr_clf.predict(vect.transform(X_test))
print("AUC score using 2-gram and min_df =5 :", roc_auc_score(y_test,y_pred))

AUC score using 2-gram and min_df =5 : 0.964181812584712


In [23]:
print("AUC score using 2-gram and min_df =5 :", roc_auc_score(y_train,lr_clf.predict(X_train_vect2)))

AUC score using 2-gram and min_df =5 : 0.9866816810439873


# Use TFIDF to Do Feature Extraction 

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
lr_clf2 = LogisticRegression()
vect1 = TfidfVectorizer(ngram_range=(1,2),min_df= 5)
X_train_tfidf = vect1.fit_transform(X_train)
lr_clf2.fit(X_train_tfidf,y_train)
y_pred2 = lr_clf2.predict(vect1.transform(X_test))
print("AUC score using 2-gram and min_df =5 (TFIDF):", roc_auc_score(y_test,y_pred2))

AUC score using 2-gram and min_df =5 (TFIDF): 0.9488645714453975


# Combine CountVectorizer and TfidfVectorizer

In [29]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [40]:
from scipy.sparse import hstack
X_combined = hstack((vect.transform(X_train),vect1.transform(X_train)))

In [41]:
lr_clf3 = LogisticRegression()
lr_clf3.fit(X_combined,y_train)
X_test_combined = hstack((vect.transform(X_test),vect1.transform(X_test)))
y_pred3 = lr_clf3.predict(X_test_combined)
print("AUC score using combined feature:", roc_auc_score(y_test,y_pred3))

AUC score using combined feature: 0.9705983578252696


# Ensemble All Three Classifier

In [44]:
y = y_pred1+y_pred2+y_pred3

In [46]:
y_voting = [int(x>1) for x in y]

In [48]:
print("AUC score using voting classifer:", roc_auc_score(y_test,y_voting))

AUC score using voting classifer: 0.9673473488657627


# the combined feature classifier outperforms other classifies