In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Crear directorio para guardar los gráficos
os.makedirs('hyperParams2Balanced_2apply2OriginalDataset_plots', exist_ok=True)

# Archivo para guardar las salidas de texto
output_file = open('hyperParams2Balanced_2apply2OriginalDataset.txt', 'w')

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")

print("Path to dataset files:", path)

Path to dataset files: /Users/lianbaguebatlle/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2


In [4]:
import pandas as pd

# Specify the full path to the dataset CSV file
csv_file_path = f"{path}/Reviews.csv"

# Load the dataset into a DataFrame
dfBig = pd.read_csv(csv_file_path)

# Display the first few rows of the dataset
print(dfBig.head())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [5]:
#Quitamos columnas innecesarias de nuevo
dfSimple = dfBig.drop(columns=['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'])
dfSimple.head()

Unnamed: 0,Id,Score,Text
0,1,5,I have bought several of the Vitality canned d...
1,2,1,Product arrived labeled as Jumbo Salted Peanut...
2,3,4,This is a confection that has been around a fe...
3,4,2,If you are looking for the secret ingredient i...
4,5,5,Great taffy at a great price. There was a wid...


In [6]:
"""WITHOUT LEMMATIZATION"""

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# NLTK resources
nltk.download('stopwords') #Llista de stopwords + comunes
nltk.download('punkt')


def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Split text into words
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Join words back into a single string
    return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lianbaguebatlle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lianbaguebatlle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('stopwords') #Llista de stopwords + comunes
nltk.download('punkt')

def clean_text(text):
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Split text into words
    words = text.split()
    # Remove stopwords and apply lemmatization
    stop_words = set(stopwords.words('english'))
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join words back into a single string
    return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lianbaguebatlle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lianbaguebatlle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
#Hacemos las mismas limpiezas que antes
dfSimple.dropna(subset=["Text"], inplace=True)
dfSimple['Text'] = dfSimple['Text'].apply(clean_text)
dfSimple.head()

Unnamed: 0,Id,Score,Text
0,1,5,bought several vitality canned dog food produc...
1,2,1,product arrived labeled jumbo salted peanutsth...
2,3,4,confection around centuries light pillowy citr...
3,4,2,looking secret ingredient robitussin believe f...
4,5,5,great taffy great price wide assortment yummy ...


In [8]:
dfBigBinary = dfSimple.copy()
dfBigBinary['Score'] = dfBigBinary['Score'].apply(lambda x: 1 if x > 2.5 else 0)
dfBigBinary.head()

Unnamed: 0,Id,Score,Text
0,1,1,bought several vitality canned dog food produc...
1,2,0,product arrived labeled jumbo salted peanutsth...
2,3,1,confection around centuries light pillowy citr...
3,4,0,looking secret ingredient robitussin believe f...
4,5,1,great taffy great price wide assortment yummy ...


In [9]:
# A PARTIR D'AQUI ES LO DE BALANCED
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(dfBigBinary['Text'])
y = dfBigBinary['Score']

In [10]:
# Crear un conjunto de datos balanceado
df_positive = dfBigBinary[dfBigBinary['Score'] == 1]
df_negative = dfBigBinary[dfBigBinary['Score'] == 0]
df_balanced = pd.concat([df_positive.sample(len(df_negative), random_state=42), df_negative])

X_balanced = tfidf.transform(df_balanced['Text'])
y_balanced = df_balanced['Score']

# División de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)


In [11]:
# Print sizes of training and test sets
print(f"Training Set Size: {X_train.shape[0]}")
print(f"Test Set Size: {X_test.shape[0]}")

# Calculate and print class distributions
train_positive_percentage = (y_train[y_train == 1].count() / y_train.shape[0]) * 100
train_negative_percentage = (y_train[y_train == 0].count() / y_train.shape[0]) * 100

test_positive_percentage = (y_test[y_test == 1].count() / y_test.shape[0]) * 100
test_negative_percentage = (y_test[y_test == 0].count() / y_test.shape[0]) * 100

print("\nTraining Set Class Distribution:")
print(f"Positives (overall = 1): {train_positive_percentage:.2f}%")
print(f"Negatives (overall = 0): {train_negative_percentage:.2f}%")

print("\nTest Set Class Distribution:")
print(f"Positives (overall = 1): {test_positive_percentage:.2f}%")
print(f"Negatives (overall = 0): {test_negative_percentage:.2f}%")

Training Set Size: 131259
Test Set Size: 32815

Training Set Class Distribution:
Positives (overall = 1): 50.10%
Negatives (overall = 0): 49.90%

Test Set Class Distribution:
Positives (overall = 1): 49.59%
Negatives (overall = 0): 50.41%


In [25]:
# Definir los hiperparámetros a buscar para cada modelo
param_grid = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
        'solver': ['newton-cg', 'lbfgs']
    },
    "Random Forest": {
        'n_estimators': [50, 100],
        'max_features': ['auto', 'sqrt'],
        'max_depth': [None, 10, 20],
        'criterion': ['gini']
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean']
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }
}

In [26]:
# Crear los modelos base
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    #"Naive Bayes (Multinomial)": MultinomialNB(),
    #"Naive Bayes (Bernoulli)": BernoulliNB(),
    #"Naive Bayes (Gaussian)": GaussianNB(),
    # "Decision Tree": DecisionTreeClassifier(),
    # "SVM": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [29]:

# Buscar los mejores hiperparámetros
best_params = {}
for model_name, model in models.items():
    print(f"Buscando mejores hiperparámetros para: {model_name}\n")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid[model_name], cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params[model_name] = grid_search.best_params_
output_file.write(f"{best_params}\n")


Buscando mejores hiperparámetros para: Logistic Regression

Buscando mejores hiperparámetros para: Random Forest



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lianbaguebatlle/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lianbaguebatlle/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/lianbaguebatlle/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/lianbaguebatlle/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_para

Buscando mejores hiperparámetros para: K-Nearest Neighbors

Buscando mejores hiperparámetros para: Gradient Boosting



332

In [30]:
# Definir modelos con los mejores hiperparámetros
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, **best_params["Logistic Regression"]),
    "Random Forest": RandomForestClassifier(**best_params["Random Forest"]),
    "Naive Bayes (Multinomial)": MultinomialNB(),
    "Naive Bayes (Bernoulli)": BernoulliNB(),
    #"Naive Bayes (Gaussian)": GaussianNB(),
    #"Decision Tree": DecisionTreeClassifier(),
    #"SVM": SVC(probability=True, **best_params["SVM"]),
    "K-Nearest Neighbors": KNeighborsClassifier(**best_params["K-Nearest Neighbors"]),
    "Gradient Boosting": GradientBoostingClassifier(**best_params["Gradient Boosting"])
}

In [31]:
# Diccionario para almacenar los resultados después del ajuste de hiperparámetros
results_after = {}

for model_name, model in models.items():
    output_file.write(f"Entrenando modelo con mejores hiperparámetros: {model_name}\n")
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Predecir en el conjunto de prueba
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    # Evaluar el rendimiento
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    # Guardar los resultados
    results_after[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "FPR": fpr,
        "TPR": tpr
    }

In [12]:
dfSimpleBinary = pd.read_csv('amazon_reviews_SimpleBinary.csv')

In [13]:
dfSimpleBinary.head()

Unnamed: 0.1,Unnamed: 0,overall,reviewText
0,0,1,issues
1,1,1,purchased device worked advertised never much ...
2,2,1,works expected sprung higher capacity think ma...
3,3,1,think worked greathad diff bran gb card went s...
4,4,1,bought retail packaging arrived legit orange e...


In [14]:
# Apply TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for simplicity
X_SimpleBinary = tfidf.fit_transform(dfSimpleBinary['reviewText'])  

# y = Etiqueta
y_SimpleBinary = dfSimpleBinary['overall']  

In [39]:
output_file.write("\nResultados en el dataset dfSimpleBinary:\n")
print("\nResultados en el dataset dfSimple:")


Resultados en el dataset dfSimple:


In [40]:
results = {}

for model_name, model in models.items():
    print(f"Aplicando modelo: {model_name}")
    output_file.write(f"Aplicando modelo: {model_name}\n")
    
    # Predict labels and probabilities
    y_pred_new = model.predict(X_SimpleBinary)
    print(f"{model_name} prediction distribution:")
    print(pd.Series(y_pred_new).value_counts())

    y_pred_prob_new = model.predict_proba(X_SimpleBinary)[:, 1]
    
    # If labels are available, evaluate performance
    accuracy = accuracy_score(y_SimpleBinary, y_pred_new)
    precision = precision_score(y_SimpleBinary, y_pred_new, average='binary')
    recall = recall_score(y_SimpleBinary, y_pred_new, average='binary')
    f1 = f1_score(y_SimpleBinary, y_pred_new, average='binary')
    output_file.write(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "FPR": fpr,
        "TPR": tpr
    }


Aplicando modelo: Logistic Regression
Logistic Regression prediction distribution:
0    2688
1    2226
Name: count, dtype: int64
Accuracy: 0.4375, Precision: 0.9102, Recall: 0.4414, F1 Score: 0.5945
Aplicando modelo: Random Forest
Random Forest prediction distribution:
1    3683
0    1231
Name: count, dtype: int64
Accuracy: 0.7352, Precision: 0.9465, Recall: 0.7595, F1 Score: 0.8427
Aplicando modelo: Naive Bayes (Multinomial)
Naive Bayes (Multinomial) prediction distribution:
1    2969
0    1945
Name: count, dtype: int64
Accuracy: 0.5818, Precision: 0.9269, Recall: 0.5996, F1 Score: 0.7281
Aplicando modelo: Naive Bayes (Bernoulli)
Naive Bayes (Bernoulli) prediction distribution:
1    3541
0    1373
Name: count, dtype: int64
Accuracy: 0.6925, Precision: 0.9348, Recall: 0.7211, F1 Score: 0.8142
Aplicando modelo: K-Nearest Neighbors
K-Nearest Neighbors prediction distribution:
1    4908
0       6
Name: count, dtype: int64
Accuracy: 0.9328, Precision: 0.9340, Recall: 0.9987, F1 Score: 0.96

In [None]:
# SALE MAL, PROVO NOMES AMB RANDOM FOREST

In [15]:
# Best hyperparameters from Grid Search
best_random_forest_params = {
    'criterion': 'gini',
    'max_depth': None,
    'max_features': 'sqrt',
    'n_estimators': 100
}

# Initialize the Random Forest model with the best hyperparameters
random_forest = RandomForestClassifier(**best_random_forest_params, random_state=42)


In [16]:
# Train the Random Forest model
print("Training Random Forest...")
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)
print(f"Prediction distribution:")
print(pd.Series(y_pred).value_counts())

y_pred_prob = random_forest.predict_proba(X_test)[:, 1]

Training Random Forest...
Prediction distribution:
0    16449
1    16366
Name: count, dtype: int64


In [44]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Print evaluation metrics
print("Random Forest Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Random Forest Evaluation:
Accuracy: 0.8831
Precision: 0.8822
Recall: 0.8820
F1 Score: 0.8821
ROC AUC: 0.9561


In [None]:
y_pred_new = random_forest.predict(X_SimpleBinary)
print(f"{model_name} prediction distribution:")
print(pd.Series(y_pred_new).value_counts())
y_pred_prob_new = random_forest.predict_proba(X_SimpleBinary)[:, 1]

Gradient Boosting prediction distribution:
1    3588
0    1326
Name: count, dtype: int64


In [46]:
# Evaluate Model Performance on New Dataset
accuracy_new = accuracy_score(y_SimpleBinary, y_pred_new)
precision_new = precision_score(y_SimpleBinary, y_pred_new)
recall_new = recall_score(y_SimpleBinary, y_pred_new)
f1_new = f1_score(y_SimpleBinary, y_pred_new)
fpr_new, tpr_new, _ = roc_curve(y_SimpleBinary, y_pred_prob_new)
roc_auc_new = auc(fpr_new, tpr_new)

# Print Evaluation Metrics
print("\nRandom Forest Evaluation on dfSimpleBinary:")
print(f"Accuracy: {accuracy_new:.4f}")
print(f"Precision: {precision_new:.4f}")
print(f"Recall: {recall_new:.4f}")
print(f"F1 Score: {f1_new:.4f}")
print(f"ROC AUC: {roc_auc_new:.4f}")


Random Forest Evaluation on dfSimpleBinary:
Accuracy: 0.7200
Precision: 0.9479
Recall: 0.7410
F1 Score: 0.8317
ROC AUC: 0.6282
