In [11]:
import pandas as pd 
import numpy as np 

path = '../Datasets/cleaned_data.csv'
df = pd.read_csv(path)

df.head()

Unnamed: 0,rating,feedback,clean_reviews,Positive,Negative,Neutral,reviews_length
0,5,1,love echo,0.808,0.0,0.192,9
1,5,1,love,1.0,0.0,0.0,4
2,4,1,sometim play game answer question correct alex...,0.223,0.141,0.636,99
3,5,1,lot fun thing yr old learn dinosaur control l...,0.564,0.0,0.436,101
4,5,1,music,0.0,0.0,1.0,5


### Splitting

In [38]:
from sklearn.model_selection import train_test_split

X = df.loc[:,'clean_reviews']
y = df.loc[:,'rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=0)
X_train.shape

(2520,)

### Vectorization

<img src='https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/_images/text-representation-bow.gif' width='300' height='200' style="float: left;margin:5px 20px 5px 1px">  
blablablablablablablablablablablablablablab

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0., max_df=1.)
X_train_vector = cv.fit_transform(X_train.values.astype('U'))
X_test_vector = cv.transform(X_test)
X_train_vector.shape, X_test_vector.shape

((2520, 2880), (630, 2880))

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidfTransformer = TfidfTransformer()
tfidfTransformer.fit(X_train_vector)
tfidfTransformer.transform(X_train_vector)

<2520x2880 sparse matrix of type '<class 'numpy.float64'>'
	with 28880 stored elements in Compressed Sparse Row format>

### Building Machine Learning Model

In [43]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

### Fitting and predicting the model

In [44]:
%time nb.fit(X_train_vector, y_train)

CPU times: total: 31.2 ms
Wall time: 4.99 ms


In [45]:
y_preds = nb.predict(X_test_vector)

### Accuracy and Error Metrics

In [59]:
from sklearn import metrics

accuracy = metrics.accuracy_score(y_test, y_preds)
mae = metrics.mean_absolute_error(y_test, y_preds)
mape = metrics.mean_absolute_percentage_error(y_test, y_preds)

print(f'''
Sklearn Accuracy Score: {(accuracy*100):.2f} \n
Mean Absolute Root Error: {np.sqrt(mae*100):.2f} \n
Mean Absolute Percentage Error: {(mape*100):.2f}
''')


Sklearn Accuracy Score: 74.44 

Mean Absolute Root Error: 6.64 

Mean Absolute Percentage Error: 22.90



### Confusion Metrics

In [50]:
metrics.confusion_matrix(y_test, y_preds)

array([[ 12,   0,   1,   5,  15],
       [  1,   0,   1,   7,  12],
       [  1,   0,   6,   8,  24],
       [  1,   0,   0,  17,  72],
       [  0,   1,   1,  11, 434]], dtype=int64)

### Pipeline

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', MultinomialNB())])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)