# Explore here

In [13]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Public CSV of Play Store reviews. Columns referenced below:
# "package_name", "review" (text) and "polarity" (target label).
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(url)

# The app identifier is not used as a feature. Assign the result instead of
# mutating in place so the statement stays chainable and easy to re-run.
df = df.drop(columns=["package_name"])

# Normalise the review text: trim surrounding whitespace and lowercase it.
df["review"] = df["review"].str.strip().str.lower()

X = df["review"]
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words counts; the vocabulary is learned from the training split only.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Convert to dense arrays for the Gaussian model.
X_train_dense = X_train_vec.toarray()
X_test_dense = X_test_vec.toarray()


def evaluate_split(model, features, labels):
    """Score an already-fitted classifier on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : feature matrix passed to ``model.predict``.
    labels : true labels for ``features``.

    Returns
    -------
    dict
        ``'Accuracy'`` (result of ``accuracy_score``) and
        ``'Classification Report'`` (text from ``classification_report``).
    """
    predictions = model.predict(features)
    return {
        'Accuracy': accuracy_score(labels, predictions),
        'Classification Report': classification_report(labels, predictions),
    }


# Keep the estimators under their own names; `mnb` is reused by later cells.
mnb = MultinomialNB()
gnb = GaussianNB()
bernoulli = BernoulliNB()

# (results-key prefix, estimator, train features, test features)
# GaussianNB is given the dense arrays, the other two the vectorizer output.
model_inputs = (
    ('MultinomialNB', mnb, X_train_vec, X_test_vec),
    ('GaussianNB', gnb, X_train_dense, X_test_dense),
    ('BernoulliNB', bernoulli, X_train_vec, X_test_vec),
)

# Insertion order matters: it is the order in which the reports are printed.
results = {}
for name, estimator, train_features, test_features in model_inputs:
    estimator.fit(train_features, y_train)
    results[f'{name}_train'] = evaluate_split(estimator, train_features, y_train)
    results[f'{name}_test'] = evaluate_split(estimator, test_features, y_test)

for model in results:
    print(f"\nResultados para {model}:")
    print(f"Accuracy: {results[model]['Accuracy']}")
    print(f"Clasificación:\n{results[model]['Classification Report']}")




Resultados para MultinomialNB_train:
Accuracy: 0.9606741573033708
Clasificación:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       458
           1       0.96      0.93      0.94       254

    accuracy                           0.96       712
   macro avg       0.96      0.95      0.96       712
weighted avg       0.96      0.96      0.96       712


Resultados para MultinomialNB_test:
Accuracy: 0.8156424581005587
Clasificación:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179


Resultados para GaussianNB_train:
Accuracy: 0.9859550561797753
Clasificación:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      

Pensé que el mejor sería el Bernoulli porque tenemos que predecir una respuesta binaria, pero, al trabajar con texto vectorizado, entiendo que el más adecuado es el MultinomialNB: es una clasificación con características que representan conteos o frecuencias discretas.

Resultados para MultinomialNB_test:
Accuracy: 0.8156424581005587
Clasificación:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179

Resultados para GaussianNB_test:
Accuracy: 0.8044692737430168
Clasificación:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

In [14]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

# Build a soft-voting ensemble from the Naive Bayes model and the random forest.
ensemble_members = [('mnb', mnb), ('rf', rf)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Train the ensemble on the vectorized training reviews.
voting_clf.fit(X_train_vec, y_train)

# Predict both splits up front; the reports are printed test first, then train.
y_pred_test_voting = voting_clf.predict(X_test_vec)
y_pred_train_voting = voting_clf.predict(X_train_vec)

voting_reports = (
    ("\nResultados para VotingClassifier - Test:", y_test, y_pred_test_voting),
    ("\nResultados para VotingClassifier - Train:", y_train, y_pred_train_voting),
)
for heading, y_true, y_hat in voting_reports:
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat)}")
    print(f"Clasificación:\n{classification_report(y_true, y_hat)}")




Resultados para VotingClassifier - Test:
Accuracy: 0.8100558659217877
Clasificación:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       126
           1       0.69      0.64      0.67        53

    accuracy                           0.81       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.81      0.81      0.81       179


Resultados para VotingClassifier - Train:
Accuracy: 0.9705056179775281
Clasificación:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       458
           1       0.97      0.95      0.96       254

    accuracy                           0.97       712
   macro avg       0.97      0.97      0.97       712
weighted avg       0.97      0.97      0.97       712



In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search for MultinomialNB with RandomizedSearchCV.
# Only `alpha` is tuned, over a fixed list of candidate values.
param_dist = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
}

# The search space is a finite list, so it cannot yield more distinct
# candidates than it has entries. Asking for more (the previous n_iter=10 with
# 5 values) only makes scikit-learn warn and fall back to the grid size, so
# derive n_iter from the grid itself.
n_candidates = len(param_dist['alpha'])

random_search_mnb = RandomizedSearchCV(
    mnb,
    param_distributions=param_dist,
    n_iter=n_candidates,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

random_search_mnb.fit(X_train_vec, y_train)

print("Mejores parámetros encontrados para MultinomialNB:", random_search_mnb.best_params_)

# Estimator exposed by the search as `best_estimator_`; reused by later cells.
best_mnb = random_search_mnb.best_estimator_

y_pred_mnb_test = best_mnb.predict(X_test_vec)
print("\nResultados para MultinomialNB optimizado - Test:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mnb_test)}")
print(f"Clasificación:\n{classification_report(y_test, y_pred_mnb_test)}")

y_pred_mnb_train = best_mnb.predict(X_train_vec)
print("\nResultados para MultinomialNB optimizado - Train:")
print(f"Accuracy: {accuracy_score(y_train, y_pred_mnb_train)}")
print(f"Clasificación:\n{classification_report(y_train, y_pred_mnb_train)}")

Fitting 3 folds for each of 5 candidates, totalling 15 fits




[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=1.0; total time=   0.0s
[CV] END ..........................................alpha=1.0; total time=   0.0s
[CV] END ..........................................alpha=1.0; total time=   0.0s
[CV] END ..........................................alpha=2.0; total time=   0.0s
[CV] END ..........................................alpha=2.0; total time=   0.0s
[CV] END ..........................................alpha=2.0; total time=   0.0s
[CV] END ...................

In [16]:
import pickle  

from pathlib import Path

# Rebuild the soft-voting ensemble, this time using the tuned MultinomialNB
# (`best_mnb`) together with the random forest, and train it.
voting_clf_opt = VotingClassifier(estimators=[('mnb', best_mnb), ('rf', rf)], voting='soft')
voting_clf_opt.fit(X_train_vec, y_train)

# NOTE(review): absolute, workspace-specific path kept as-is so the artefact
# lands where it did before; consider making it configurable.
model_path = Path("/workspaces/Finarosalina_Bayes_bueno_MlL/models/modelo_voting_classifier_opt.pkl")

# Opening a file for writing does not create missing directories, so make sure
# the target folder exists instead of failing after the model has been trained.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as f:
    pickle.dump(voting_clf_opt, f)


In [17]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Self-contained reload of the reviews data set followed by an XGBoost baseline.
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

# Read the CSV, drop the unused app identifier and normalise the review text
# (trim surrounding whitespace, then lowercase) in a single chain.
df = (
    pd.read_csv(url)
    .drop(columns=['package_name'])
    .assign(review=lambda frame: frame["review"].str.strip().str.lower())
)

X, y = df["review"], df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words counts; the vocabulary is fitted on the training split only.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Dense copies of the count matrices; these are what the XGBoost model is fed.
X_train_dense, X_test_dense = X_train_vec.toarray(), X_test_vec.toarray()

xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train_dense, y_train)

# Predict both splits up front; the reports are printed test first, then train.
y_pred_test_xgb = xgb_model.predict(X_test_dense)
y_pred_train_xgb = xgb_model.predict(X_train_dense)

xgb_reports = (
    ("\nResultados para XGBoost - Test:", y_test, y_pred_test_xgb),
    ("\nResultados para XGBoost - Train:", y_train, y_pred_train_xgb),
)
for heading, y_true, y_hat in xgb_reports:
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat)}")
    print(f"Clasificación:\n{classification_report(y_true, y_hat)}")



Resultados para XGBoost - Test:
Accuracy: 0.8100558659217877
Clasificación:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       126
           1       0.68      0.68      0.68        53

    accuracy                           0.81       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.81      0.81      0.81       179


Resultados para XGBoost - Train:
Accuracy: 0.9648876404494382
Clasificación:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       458
           1       0.98      0.92      0.95       254

    accuracy                           0.96       712
   macro avg       0.97      0.95      0.96       712
weighted avg       0.97      0.96      0.96       712



No ha mejorado, pero hay que ajustar hiperparámetros, porque con este método ha aumentado el overfitting, especialmente el recall en el train.

In [18]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Candidate values explored for each XGBoost hyper-parameter.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

xgb_model = xgb.XGBClassifier(random_state=42)

# Exhaustive hyper-parameter search with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_dense, y_train)

print("Mejores parámetros encontrados para XGBoost:", grid_search.best_params_)

# Estimator exposed by the search as `best_estimator_`.
best_xgb = grid_search.best_estimator_

# Predict both splits up front; the reports are printed test first, then train.
y_pred_test_xgb = best_xgb.predict(X_test_dense)
y_pred_train_xgb = best_xgb.predict(X_train_dense)

tuned_xgb_reports = (
    ("\nResultados para XGBoost optimizado - Test:", y_test, y_pred_test_xgb),
    ("\nResultados para XGBoost optimizado - Train:", y_train, y_pred_train_xgb),
)
for heading, y_true, y_hat in tuned_xgb_reports:
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat)}")
    print(f"Clasificación:\n{classification_report(y_true, y_hat)}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.9s
[CV] END 

Pues ha empeorado, además de requerir un tiempo de computación alto; quizás con una regresión o con otro método de boosting pueda mejorar.

In [19]:
import numpy as np

from pathlib import Path

# Save the dense feature matrices and the label vectors as CSV files.
# NOTE(review): absolute, workspace-specific folder kept as-is; consider
# making it configurable.
processed_dir = Path('/workspaces/Finarosalina_Bayes_bueno_MlL/data/processed')

# np.savetxt does not create missing directories.
processed_dir.mkdir(parents=True, exist_ok=True)

processed_outputs = (
    ('X_train.csv', X_train_dense),
    ('X_test.csv', X_test_dense),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
)
for file_name, values in processed_outputs:
    # The features are word counts and the labels are 0/1 classes. Writing
    # them as integers avoids np.savetxt's default '%.18e' float format, which
    # spends 25 characters on every cell of the matrix.
    np.savetxt(processed_dir / file_name, values, delimiter=',', fmt='%d')


In [20]:
import nbformat

# Load the .ipynb file.
# The encoding is given explicitly: the notebook contains non-ASCII text
# (e.g. "Clasificación"), and relying on the platform default encoding can
# fail or corrupt it on systems whose default is not UTF-8.
notebook_path = '/workspaces/Finarosalina_Bayes_bueno_MlL/src/explore.ipynb'
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook_content = nbformat.read(f, as_version=4)

# Extract the source text of every code cell, in notebook order.
code_cells = [cell['source'] for cell in notebook_content.cells if cell.cell_type == 'code']

# Write the collected code to a .py file, one cell after another.
# Same explicit encoding as above so the accented strings round-trip intact.
output_path = '/workspaces/Finarosalina_Bayes_bueno_MlL/src/app.py'
with open(output_path, 'w', encoding='utf-8') as f:
    for code in code_cells:
        f.write(code + '\n')
