### Imported libraries

In [12]:
# Data manipulation

import pandas as pd
import numpy as np

# Model preparation

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


# Modeling

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Warning management

import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning

# Silence whole warning categories so they do not clutter the cell outputs.
for _ignored_category in (FutureWarning, DataConversionWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Silence one specific warning message.
warnings.filterwarnings(
    "ignore",
    message="Inconsistent values: penalty=l1 with l1_ratio=0.0",
)

# NOTE(review): ignoring every UserWarning is very broad — confirm this is intended.
warnings.filterwarnings("ignore", category=UserWarning)


### Data cleaning

In [13]:
# Remote CSV to read, and the local path where a copy of it is saved.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
RAW_SNAPSHOT_PATH = "../data/raw/df.xlsx"

# Load the dataset, then store a copy on disk without writing the row index.
df = pd.read_csv(DATA_URL)
df.to_excel(RAW_SNAPSHOT_PATH, index=False)

# Preview the first rows.
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [14]:
# Report how many rows and columns the loaded frame has.
rows = df.shape[0]
columns = df.shape[1]
print(f"The dimensions of this dataset are {rows} rows and {columns} columns")

The dimensions of this dataset are 891 rows and 3 columns


In [15]:
# Drop the "package_name" column. Reassigning (instead of `inplace=True`) together
# with `errors="ignore"` keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the column is already gone.
df = df.drop(columns=["package_name"], errors="ignore")
rows, columns = df.shape

# Normalize the review text: trim surrounding whitespace and lowercase it.
df["review"] = df["review"].str.strip().str.lower()
print(f"The dimensions of this dataset are {rows} rows and {columns} columns")

The dimensions of this dataset are 891 rows and 2 columns


In [16]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


### Model preparation

In [17]:
# Features are the review texts; the target is the polarity label.
x, y = df["review"], df["polarity"]

# Hold out 20% of the rows for testing, with a fixed random_state for the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=17,
)

x_train.head()

873    such an awesome game love it a really fun game...
828                    dami xa hajur harule ni hernu hai
99     updated version is down not able to sent conne...
523    rubbish the amount of memory it gives for free...
132    every time i play the moon struck game, it doe...
Name: review, dtype: str

In [18]:
# Token-count vectorizer configured with the built-in English stop-word list.
vec_model = CountVectorizer(stop_words="english")

# Fit on the training texts only, then apply the fitted vectorizer to the test texts.
train_counts = vec_model.fit_transform(x_train)
test_counts = vec_model.transform(x_test)

# Convert the count matrices to arrays.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

In [None]:
# Candidate Naive Bayes classifiers, keyed by class name and built with default arguments.
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (GaussianNB, MultinomialNB, BernoulliNB)
}

def build_and_evaluate(models):
    """Fit every candidate model and tabulate its train/test metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place on the training data.

    Returns
    -------
    pandas.DataFrame
        One row per model: a ``Model`` column followed by ``Train_*`` / ``Test_*``
        columns for Accuracy, Precision, Recall and F1 (in that order).
    """
    # (train column, test column, scoring function), in output column order.
    scorers = (
        ("Train_Accuracy", "Test_Accuracy", accuracy_score),
        ("Train_Precision", "Test_Precision", precision_score),
        ("Train_Recall", "Test_Recall", recall_score),
        ("Train_F1", "Test_F1", f1_score),
    )

    records = []
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)

        train_pred = estimator.predict(X_train)
        test_pred = estimator.predict(X_test)

        record = {"Model": label}
        for train_col, test_col, scorer in scorers:
            record[train_col] = scorer(y_train, train_pred)
            record[test_col] = scorer(y_test, test_pred)
        records.append(record)

    return pd.DataFrame(records)

# Results comparison: rank the candidates by F1 on the test split, best first.

df_results = build_and_evaluate(models).sort_values(by="Test_F1", ascending=False)

print("====== MODEL COMPARISON ======")
print(df_results)

# Best model: the top row after sorting.
# NOTE(review): the winner is chosen using test-set F1, so the test metrics printed
# below are likely optimistic; consider selecting with cross-validation on the
# training split instead.

best_model_row = df_results.iloc[0]
best_model_name = best_model_row["Model"]
best_model = models[best_model_name]

print("\n====== BEST MODEL BASED ON TEST F1 ======")
print(best_model_row)

# Final evaluation: fit the selected estimator again and predict the test split.

best_model.fit(X_train, y_train)
y_pred_final = best_model.predict(X_test)

final_confusion = confusion_matrix(y_test, y_pred_final)
print("\n====== FINAL CONFUSION MATRIX ======")
print(final_confusion)

final_report = classification_report(y_test, y_pred_final)
print("\n====== FINAL CLASSIFICATION REPORT ======")
print(final_report)

           Model  Train_Accuracy  Test_Accuracy  Train_Precision  \
1  MultinomialNB        0.963483       0.754190         0.977169   
0     GaussianNB        0.988764       0.726257         0.967078   
2    BernoulliNB        0.915730       0.709497         0.983425   

   Test_Precision  Train_Recall  Test_Recall  Train_F1   Test_F1  
1        0.791667      0.910638     0.527778  0.942731  0.633333  
0        0.767442      1.000000     0.458333  0.983264  0.573913  
2        0.794118      0.757447     0.375000  0.855769  0.509434  

Model              MultinomialNB
Train_Accuracy          0.963483
Test_Accuracy            0.75419
Train_Precision         0.977169
Test_Precision          0.791667
Train_Recall            0.910638
Test_Recall             0.527778
Train_F1                0.942731
Test_F1                 0.633333
Name: 1, dtype: object

[[97 10]
 [34 38]]

              precision    recall  f1-score   support

           0       0.74      0.91      0.82       107
        

### Hyperparameter optimization

In [23]:
# Search space: 100 evenly spaced alpha values from 0.001 to 1.0,
# and both settings of fit_prior.
params = dict(
    alpha=np.linspace(start=0.001, stop=1.0, num=100),
    fit_prior=[True, False],
)

#### Random Search

In [24]:
# Randomized search over `params` for MultinomialNB: 50 sampled candidates,
# 5-fold cross-validation, scored by F1, run with n_jobs=-1.
random_search = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=params,
    n_iter=50,
    scoring="f1",
    cv=5,
    random_state=10,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)  # Train the optimizer on the winning dataset

best_model_random = random_search.best_estimator_

print("\n================ FINAL CONCLUSION ================")
print("Best params:", random_search.best_params_)
print("The best model is: ", best_model_random)
print("The score for this model is: ", random_search.best_score_)


Best params: {'fit_prior': False, 'alpha': np.float64(0.09181818181818183)}
The best model is:  MultinomialNB(alpha=np.float64(0.09181818181818183), fit_prior=False)
The score for this model is:  0.736465731213953


In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its metrics for both splits.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train : training features and labels, also used for fitting.
    X_test, y_test : test features and labels.

    Returns
    -------
    None. For TRAIN and then TEST it prints accuracy, F1, precision, recall,
    the confusion matrix and the classification report.
    """
    model.fit(X_train, y_train)

    # Evaluate the training split first, then the test split.
    splits = (
        ("TRAIN", X_train, y_train),
        ("TEST", X_test, y_test),
    )

    for name, X, y in splits:
        y_pred = model.predict(X)

        print(f"\n========== {name} METRICS ==========")
        print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
        print(f"F1-score: {f1_score(y, y_pred):.4f}")
        print(f"Precision: {precision_score(y, y_pred):.4f}")
        print(f"Recall: {recall_score(y, y_pred):.4f}")
        print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
        print("\nClassification Report:\n", classification_report(y, y_pred))

# Print train/test metrics for the best estimator found by the randomized search.
evaluate_model(
    model=best_model_random,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


Accuracy: 0.9902
F1-score: 0.9851
Precision: 0.9831
Recall: 0.9872
Confusion Matrix:
 [[473   4]
 [  3 232]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       477
           1       0.98      0.99      0.99       235

    accuracy                           0.99       712
   macro avg       0.99      0.99      0.99       712
weighted avg       0.99      0.99      0.99       712


Accuracy: 0.7654
F1-score: 0.6719
Precision: 0.7679
Recall: 0.5972
Confusion Matrix:
 [[94 13]
 [29 43]]

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.88      0.82       107
           1       0.77      0.60      0.67        72

    accuracy                           0.77       179
   macro avg       0.77      0.74      0.74       179
weighted avg       0.77      0.77      0.76       179



**Following hyperparameter optimization of the Multinomial Naive Bayes model, performance on the test set improved. Specifically, the F1 score rose by approximately 6% in relative terms (0.633 → 0.672), driven by a relative recall increase of over 13% (0.528 → 0.597), indicating the model's improved ability to correctly identify positive feedback. Accuracy also rose slightly (0.754 → 0.765). Although precision decreased slightly (0.792 → 0.768), the overall balance between precision and recall improved, justifying the optimization process.**