# Naive Bayes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
import pickle

## Import data and create dataframe

In [None]:
# Read the raw Play Store reviews; the location is held in a variable so it is easy to find and change
raw_csv_path = "/workspaces/josefina-aispuro-merelles-machine-learning/data/raw/playstore_reviews.csv"
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


## Descriptive analysis

In [None]:
# Check the dimensions: (number of rows, number of columns)
df.shape

(891, 3)

In [None]:
# Data types and non-null counts per column (a quick way to spot missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [None]:
# Looking for duplicated data
# duplicated() flags every row that repeats an earlier one; summing the boolean mask
# yields a single count instead of a truncated per-row True/False listing
df.duplicated().sum()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

In [None]:
# Dropping irrelevant information
# Reassign instead of mutating in place, and ignore an already-missing column, so that
# re-running this cell does not raise a KeyError
df = df.drop(columns=["package_name"], errors="ignore")

In [None]:
# Normalize the review text: trim surrounding whitespace first, then lowercase it
trimmed_reviews = df["review"].str.strip()
df["review"] = trimmed_reviews.str.lower()

## Train / Test Split

In [None]:
# The review text is the single feature; the polarity column is the target
X, y = df["review"], df["polarity"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Vectorize the text

In [None]:
# Bag of Words: learn the vocabulary on the training reviews only (English stop words removed),
# then encode both splits with that same vocabulary
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Dense copies of the count matrices; both Naive Bayes sections below consume these arrays
X_train_vect = train_counts.toarray()
X_test_vect = test_counts.toarray()

## Creating model and training data - MultinomialNB

In [None]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training word counts
model_m = MultinomialNB().fit(X_train_vect, y_train)
model_m

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


## Predictions

In [None]:
# Make predictions on the test set with the fitted MultinomialNB model
y_pred = model_m.predict(X_test_vect)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [None]:
# Evaluate accuracy: fraction of test reviews whose predicted polarity matches the true label.
# NOTE: y_pred is reassigned by the GaussianNB prediction cell further down, so this cell
# must run right after the MultinomialNB prediction cell to report the right model.
model_m_accuracy = accuracy_score(y_test, y_pred)
model_m_accuracy

0.8156424581005587

### Observations
The MultinomialNB model achieved 81.56% accuracy on the test set.

## Optimization

In [None]:
# Search space: 200 evenly spaced alpha values in [0.01, 10], each tried with and without fit_prior
alpha_grid = np.linspace(0.01, 10.0, 200)
hyperparameters = {"alpha": alpha_grid, "fit_prior": [True, False]}

# 5-fold cross-validated grid search, scored by accuracy, with n_jobs=-1 for parallel fits
grid_search_m = GridSearchCV(
    estimator=model_m,
    param_grid=hyperparameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search_m

0,1,2
,estimator,MultinomialNB()
,param_grid,"{'alpha': array([ 0.01 ... 10. ]), 'fit_prior': [True, False]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [None]:
# Train Grid Search: cross-validate every alpha / fit_prior combination on the training set only
grid_search_m.fit(X_train_vect, y_train)

0,1,2
,estimator,MultinomialNB()
,param_grid,"{'alpha': array([ 0.01 ... 10. ]), 'fit_prior': [True, False]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(1.8172361809045228)
,force_alpha,True
,fit_prior,False
,class_prior,


In [None]:
# Looking for best parameters: the combination with the highest mean cross-validated accuracy
grid_search_m.best_params_

{'alpha': np.float64(1.8172361809045228), 'fit_prior': False}

In [None]:
# Take the estimator refit with the best parameters and measure its accuracy on the held-out test set
grid_model_m = grid_search_m.best_estimator_
tuned_predictions_m = grid_model_m.predict(X_test_vect)
test_accuracy = accuracy_score(y_test, tuned_predictions_m)
test_accuracy

0.8156424581005587

### Observations
After hyperparameter tuning (best `alpha` ≈ 1.82, `fit_prior=False`), the test accuracy remains the same: 81.56%.

## Creating model and training data - GaussianNB

In [None]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the same dense word-count features
model_g = GaussianNB().fit(X_train_vect, y_train)
model_g

0,1,2
,priors,
,var_smoothing,1e-09


## Predictions

In [None]:
# Make predictions on the test set with the fitted GaussianNB model.
# NOTE: this overwrites the y_pred produced earlier by the MultinomialNB section.
y_pred = model_g.predict(X_test_vect)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [None]:
# Evaluate accuracy of the GaussianNB predictions (y_pred was reassigned in the cell above)
model_g_accuracy = accuracy_score(y_test, y_pred)
model_g_accuracy

0.8044692737430168

### Observations
The GaussianNB model achieved 80.45% accuracy on the test set.

## Optimization

In [None]:
# Search space: 20 var_smoothing values spaced logarithmically from 1e-9 to 1e-1
smoothing_grid = np.logspace(-9, -1, 20)
hyperparameters = {"var_smoothing": smoothing_grid}

# 5-fold cross-validated grid search, scored by accuracy, with n_jobs=-1 for parallel fits
grid_search_g = GridSearchCV(
    estimator=model_g,
    param_grid=hyperparameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search_g

0,1,2
,estimator,GaussianNB()
,param_grid,{'var_smoothing': array([1.0000...00000000e-01])}
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,priors,
,var_smoothing,1e-09


In [None]:
# Train Grid Search: cross-validate every var_smoothing candidate on the training set only
grid_search_g.fit(X_train_vect, y_train)

0,1,2
,estimator,GaussianNB()
,param_grid,{'var_smoothing': array([1.0000...00000000e-01])}
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,priors,
,var_smoothing,np.float64(0....3514416313134)


In [None]:
# Looking for best parameters: the var_smoothing value with the highest mean cross-validated accuracy
grid_search_g.best_params_

{'var_smoothing': np.float64(0.00029763514416313134)}

In [None]:
# Take the estimator refit with the best var_smoothing and measure its accuracy on the held-out test set
grid_model_g = grid_search_g.best_estimator_
tuned_predictions_g = grid_model_g.predict(X_test_vect)
test_accuracy = accuracy_score(y_test, tuned_predictions_g)
test_accuracy

0.8044692737430168

### Observations
After hyperparameter tuning (best `var_smoothing` ≈ 3.0e-4), the test accuracy remains the same: 80.45%.

## Saving model

In [None]:
# Persist the tuned MultinomialNB model (same file name and content as before)
with open("/workspaces/josefina-aispuro-merelles-machine-learning/models/multinomial_nb_model.pkl", "wb") as file:
    pickle.dump(grid_model_m, file)

# The model was trained on features produced by the fitted CountVectorizer, so raw text can
# only be scored with that same vocabulary: save the vectorizer next to the model
with open("/workspaces/josefina-aispuro-merelles-machine-learning/models/count_vectorizer.pkl", "wb") as file:
    pickle.dump(vec_model, file)

## Final conclusion
After testing different Naive Bayes models on the dataset, MultinomialNB proved to be the most effective, achieving an accuracy of 81.56%, compared to GaussianNB with 80.45%. After optimizing both models using their respective hyperparameters, neither model showed any improvement on the test set: their accuracy remained the same. This indicates that MultinomialNB is the best choice for this task, and further hyperparameter tuning does not yield additional gains. Note that the test set contains only 179 reviews, so the gap between the two models corresponds to 2 reviews and should be read as a modest advantage.