### Imported libraries

In [85]:
# Data manipulation

import pandas as pd
import numpy as np

# Model preparation

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


# Modeling

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Warning management

import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning

# Silence noisy warning categories so the cell outputs stay readable.
# Filters are registered in the same order as before (each call prepends).
for ignored_category in (FutureWarning, DataConversionWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Message-based filter, followed by the blanket UserWarning filter.
warnings.filterwarnings("ignore", message="Inconsistent values: penalty=l1 with l1_ratio=0.0")
warnings.filterwarnings("ignore", category=UserWarning)


### Data cleaning

In [86]:
from pathlib import Path

# Load the Play Store reviews dataset directly from the course repository.
df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

# Keep a local copy of the raw data. The target folder is created first so the
# export does not fail on a checkout where ../data/raw does not exist yet.
raw_data_path = Path("../data/raw/df.xlsx")
raw_data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(raw_data_path, index=False)

df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [87]:
# Report the size of the dataset as loaded.
rows = df.shape[0]
columns = df.shape[1]
print(f"The dimensions of this dataset are {rows} rows and {columns} columns")

The dimensions of this dataset are 891 rows and 3 columns


In [88]:
# Drop the app identifier column. Rebinding the result (instead of
# inplace=True) together with errors="ignore" keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns=["package_name"], errors="ignore")

# Normalize the review text: trim surrounding whitespace and lowercase it.
df["review"] = df["review"].str.strip().str.lower()

rows, columns = df.shape
print(f"The dimensions of this dataset are {rows} rows and {columns} columns")

The dimensions of this dataset are 891 rows and 2 columns


In [89]:
# Preview the first rows of the frame after the cleaning step.
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


### Model preparation

In [90]:
# Feature (review text) and target (polarity label).
x, y = df["review"], df["polarity"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=17,
)

x_train.head()

873    such an awesome game love it a really fun game...
828                    dami xa hajur harule ni hernu hai
99     updated version is down not able to sent conne...
523    rubbish the amount of memory it gives for free...
132    every time i play the moon struck game, it doe...
Name: review, dtype: str

In [None]:
# Bag-of-words representation with English stop words removed.
vec_model = CountVectorizer(stop_words="english")

# Learn the vocabulary on the training reviews only, then reuse it for the test set.
train_counts = vec_model.fit_transform(x_train)
test_counts = vec_model.transform(x_test)

# Convert the count matrices to dense arrays for the models fitted below.
X_train, X_test = train_counts.toarray(), test_counts.toarray()

#### MultinomialNB 

In [108]:
# Fit a multinomial Naive Bayes classifier on the training counts and
# predict the held-out reviews.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

print("========  MultinomialNB  ========")
# Print the scalar metrics in a fixed order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, y_pred_mnb))

# sep="\n" puts each heading on its own line, directly above its table.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_mnb), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_mnb), sep="\n")

Accuracy: 0.7541899441340782
Precision: 0.7916666666666666
Recall: 0.5277777777777778
F1-score: 0.6333333333333333

Confusion Matrix:
[[97 10]
 [34 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.91      0.82       107
           1       0.79      0.53      0.63        72

    accuracy                           0.75       179
   macro avg       0.77      0.72      0.72       179
weighted avg       0.76      0.75      0.74       179



#### BernoulliNB 

In [107]:
# Fit a Bernoulli Naive Bayes classifier on the training counts and
# predict the held-out reviews.
bnb = BernoulliNB().fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

print("========  BernoulliNB  ========")
# Print the scalar metrics in a fixed order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, y_pred_bnb))

# sep="\n" puts each heading on its own line, directly above its table.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_bnb), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_bnb), sep="\n")

Accuracy: 0.7094972067039106
Precision: 0.7941176470588235
Recall: 0.375
F1-score: 0.5094339622641509

Confusion Matrix:
[[100   7]
 [ 45  27]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.93      0.79       107
           1       0.79      0.38      0.51        72

    accuracy                           0.71       179
   macro avg       0.74      0.65      0.65       179
weighted avg       0.73      0.71      0.68       179



#### GaussianNB 

In [105]:
# Fit a Gaussian Naive Bayes classifier on the dense training counts and
# predict the held-out reviews.
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

print("========  GaussianNB  ========")
# Print the scalar metrics in a fixed order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, y_pred_gnb))

# sep="\n" puts each heading on its own line, directly above its table.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_gnb), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_gnb), sep="\n")

Accuracy: 0.7262569832402235
Precision: 0.7674418604651163
Recall: 0.4583333333333333
F1-score: 0.5739130434782609

Confusion Matrix:
[[97 10]
 [39 33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.91      0.80       107
           1       0.77      0.46      0.57        72

    accuracy                           0.73       179
   macro avg       0.74      0.68      0.69       179
weighted avg       0.74      0.73      0.71       179



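**The best implementation for this case is MultinomialNB: on the test set it achieves the highest accuracy (0.75) and F1-score (0.63) of the three Naive Bayes variants.**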

### Hyperparameter optimization

In [95]:
# Hyperparameter search space: 100 evenly spaced smoothing values in
# [0.001, 1.0], each combined with both settings of fit_prior.
params = dict(
    alpha=np.linspace(0.001, 1.0, 100),
    fit_prior=[True, False],
)

#### Random Search

In [109]:
# Randomized search over the parameter grid: 50 sampled candidates,
# 5-fold cross-validation, ranked by F1; the seed makes the sampling repeatable.
search_settings = dict(scoring="f1", cv=5, n_iter=50, random_state=10, n_jobs=-1)
random_search = RandomizedSearchCV(MultinomialNB(), params, **search_settings)
random_search.fit(X_train, y_train)  # Train the optimizer on the winning dataset

# Keep a handle on the best estimator for the evaluation that follows.
best_model_random = random_search.best_estimator_

print("\n================ FINAL CONCLUSION ================")
print("Best params:", random_search.best_params_)
print("The best model is: ", best_model_random)
print("The score for this model is: ", random_search.best_score_)


Best params: {'fit_prior': False, 'alpha': np.float64(0.09181818181818183)}
The best model is:  MultinomialNB(alpha=np.float64(0.09181818181818183), fit_prior=False)
The score for this model is:  0.736465731213953


In [111]:
def report_metrics(split_name, y_true, y_pred):
    """Print and return the classification metrics for one data split.

    Parameters
    ----------
    split_name : str
        Label shown in the banner line (e.g. "TRAIN" or "TEST").
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.

    Returns
    -------
    tuple
        (accuracy, f1, precision, recall, confusion_matrix) for the split.
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    print(f"========== {split_name} METRICS ==========")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    return acc, f1, prec, rec, cm


# No explicit re-fit is needed here: the search is run with the default
# refit=True, so best_estimator_ is already fitted on the full training set.

# Training-set metrics (to compare against the test set and spot overfitting).
y_pred_train = best_model_random.predict(X_train)
acc_train, f1_train, prec_train, rec_train, cm_train = report_metrics(
    "TRAIN", y_train, y_pred_train
)

# Held-out test-set metrics.
y_pred_test = best_model_random.predict(X_test)
acc_test, f1_test, prec_test, rec_test, cm_test = report_metrics(
    "TEST", y_test, y_pred_test
)

Accuracy: 0.9902
F1-score: 0.9851
Precision: 0.9831
Recall: 0.9872
Confusion Matrix:
 [[473   4]
 [  3 232]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       477
           1       0.98      0.99      0.99       235

    accuracy                           0.99       712
   macro avg       0.99      0.99      0.99       712
weighted avg       0.99      0.99      0.99       712

Accuracy: 0.7654
F1-score: 0.6719
Precision: 0.7679
Recall: 0.5972
Confusion Matrix:
 [[94 13]
 [29 43]]

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.88      0.82       107
           1       0.77      0.60      0.67        72

    accuracy                           0.77       179
   macro avg       0.77      0.74      0.74       179
weighted avg       0.77      0.77      0.76       179



**After hyperparameter optimization of the Multinomial Naive Bayes model, performance on the test set improved. The F1-score rose from 0.633 to 0.672 (about 6% in relative terms), driven by a recall increase from 0.528 to 0.597 (about 13% in relative terms), indicating the model's improved ability to correctly identify positive feedback. Accuracy also rose slightly (from 0.754 to 0.765). Although precision decreased slightly (from 0.792 to 0.768), the overall balance between precision and recall improved, justifying the optimization process.**