# Explore here

In [3]:
import numpy as np
import pandas as pd
import re
import contractions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import GridSearchCV

In [4]:
# Load csv
reviews_data = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv')

In [5]:
reviews_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [6]:
reviews_data = reviews_data.drop('package_name', axis= 1)
reviews_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [7]:
reviews_data["review"] = reviews_data["review"].str.strip().str.lower()
reviews_data["review"] = reviews_data["review"].apply(contractions.fix)
reviews_data["review"] = reviews_data['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
print(reviews_data.head(10))

                                              review  polarity
0  privacy at least put some option appear offlin...         0
1  messenger issues ever since the last update in...         0
2  profile any time my wife or anybody has more t...         0
3  the new features suck for those of us who do n...         0
4  forced reload on uploading pic on replying com...         0
5  i do not know i cannot edit my posts things su...         0
6  major flaws constant updates and always gettin...         0
7  video issues since i was forced into this upda...         0
8  this update completely destroyed my facebook i...         0
9  posting issues for the last week there is been...         0


In [8]:
X = reviews_data['review']
y = reviews_data['polarity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)
X_train.head()

309    video capture for moment where did video momen...
469    help me out i am not able to send or receive a...
593    makes life better evernote has saved my bacon ...
62     following back it does not let me follow peopl...
500    very useful thank god i can finally reply fast...
Name: review, dtype: object

In [9]:
vec_model = CountVectorizer(stop_words = "english")

X_train = vec_model.fit_transform(X_train)
X_test = vec_model.transform(X_test)

In [10]:
X_train.shape

(712, 3375)

In [11]:
# Obtain the characteristics
characteristics = vec_model.get_feature_names_out()
characteristics


array(['000', '10', '100', ..., 'zespole', 'zoom', 'zooming'],
      dtype=object)

Train the model

In [12]:
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

y_pred = model_nb.predict(X_test)

In [13]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8268156424581006
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.89       133
           1       0.70      0.57      0.63        46

    accuracy                           0.83       179
   macro avg       0.78      0.74      0.76       179
weighted avg       0.82      0.83      0.82       179



Optimize hyperparameters

In [14]:
model_grid = MultinomialNB()

param_grid = {
    'force_alpha': [True, False],
    'alpha': np.linspace(0.01, 10.0, 200),
    'fit_prior': [True, False],
    'class_prior': [None, [0.5, 0.5], [0.3, 0.7], [0.2, 0.8], [0.1, 0.9]],
}

grid_search = GridSearchCV(model_grid, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)
print(f'Grid search best params: {grid_search.best_params_}')

Grid search best params: {'alpha': 8.242964824120603, 'class_prior': [0.3, 0.7], 'fit_prior': True, 'force_alpha': True}


In [15]:
model_grid_nb = MultinomialNB(alpha = 8.242964824120603, class_prior = [0.3, 0.7], fit_prior = True, force_alpha = True)
model_grid_nb.fit(X_train, y_train)

y_pred_grid = model_grid_nb.predict(X_test)

In [16]:
accuracy_grid = accuracy_score(y_test, y_pred)
report_grid = classification_report(y_test, y_pred)

print("New accuracy:", accuracy_grid)
print("New classification Report:\n", report_grid)

New accuracy: 0.8268156424581006
New classification Report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.89       133
           1       0.70      0.57      0.63        46

    accuracy                           0.83       179
   macro avg       0.78      0.74      0.76       179
weighted avg       0.82      0.83      0.82       179



## Conclusion:
After exploring other options, the multinomial was the best result.