In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt


In [None]:
# Cargar dataset
dfCleaned = pd.read_csv('amazon_reviews_Cleaned.csv')

In [3]:
dfCleaned.head()

Unnamed: 0.1,Unnamed: 0,overall,reviewText
0,0,4.0,issues
1,1,5.0,purchased device worked advertised never much ...
2,2,4.0,works expected sprung higher capacity think ma...
3,3,5.0,think worked greathad diff bran gb card went s...
4,4,5.0,bought retail packaging arrived legit orange e...


In [None]:
# Apply TF-IDF 
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(dfCleaned['reviewText'])

# Define the target variable as integer labels
y = dfCleaned['overall'].astype(int)

In [5]:
print(X)

  (0, 1811)	1.0
  (1, 371)	0.4658817477827855
  (1, 4010)	0.3197017006470681
  (1, 2004)	0.25280027831644336
  (1, 980)	0.3515768967137597
  (1, 3614)	0.2634346235438177
  (1, 2124)	0.16342700659146378
  (1, 2527)	0.15894729456975054
  (1, 2235)	0.22880268690799568
  (1, 2283)	0.23746924750824877
  (1, 65)	0.3079511880305262
  (1, 4814)	0.22041352975002462
  (1, 901)	0.2500956641441643
  (1, 2960)	0.2390691997557985
  (2, 580)	0.3599249077638274
  (2, 1995)	0.2724857527031145
  (2, 2453)	0.37010345303225173
  (2, 4490)	0.2923424981405928
  (2, 1032)	0.32830745830827224
  (2, 320)	0.22808251479726946
  (2, 2031)	0.22707996152754703
  (2, 4235)	0.20417737012953208
  (2, 460)	0.18410928375398103
  (2, 1617)	0.26622741443620596
  (2, 3834)	0.4017209024878068
  :	:
  (4912, 1308)	0.4263389161515658
  (4912, 3723)	0.28806210890236555
  (4912, 4566)	0.36638481557545366
  (4912, 1490)	0.23531007146235436
  (4912, 4010)	0.41436370746172485
  (4913, 3911)	0.4182065391329696
  (4913, 765)	0.39895

In [6]:
print(y)

0       4
1       5
2       4
3       5
4       5
       ..
4909    1
4910    5
4911    5
4912    5
4913    5
Name: overall, Length: 4914, dtype: int64


In [None]:
# Separar en Train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Modelos (algunos One-vs-Rest)
models = {
    "Logistic Regression (OvR)": OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM (OvR)": OneVsRestClassifier(SVC(probability=True)),
    "K-NN": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bernoulli Naive Bayes": BernoulliNB(),
    # "Gaussian Naive Bayes": GaussianNB(), # Gaussian Naive Bayes me daba problemas, por esto esta comentado
    "Multinomial Naive Bayes": MultinomialNB()
}

In [None]:
# Entrenar y evaluar modelos
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": report['weighted avg']['precision'],
        "Recall": report['weighted avg']['recall'],
        "F1 Score": report['weighted avg']['f1-score']
    }
    print(f"\nResults for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Results for Logistic Regression (OvR):
Accuracy: 0.8098
              precision    recall  f1-score   support

           1       0.47      0.16      0.24        44
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        30
           4       0.00      0.00      0.00       107
           5       0.82      1.00      0.90       790

    accuracy                           0.81       983
   macro avg       0.26      0.23      0.23       983
weighted avg       0.68      0.81      0.73       983

Confusion Matrix:
 [[  7   0   0   0  37]
 [  3   0   0   0   9]
 [  3   0   0   0  27]
 [  1   0   0   0 106]
 [  1   0   0   0 789]]

Results for Decision Tree:
Accuracy: 0.7284
              precision    recall  f1-score   support

           1       0.32      0.36      0.34        44
           2       0.17      0.17      0.17        12
           3       0.10      0.07      0.08        30
           4       0.14      0.11      0.12       107
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Results for Random Forest:
Accuracy: 0.8026
              precision    recall  f1-score   support

           1       0.20      0.02      0.04        44
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        30
           4       0.00      0.00      0.00       107
           5       0.81      1.00      0.89       790

    accuracy                           0.80       983
   macro avg       0.20      0.20      0.19       983
weighted avg       0.66      0.80      0.72       983

Confusion Matrix:
 [[  1   0   0   0  43]
 [  2   0   0   0  10]
 [  1   0   0   0  29]
 [  1   0   0   0 106]
 [  0   0   0   2 788]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Results for SVM (OvR):
Accuracy: 0.8138
              precision    recall  f1-score   support

           1       0.59      0.23      0.33        44
           2       0.00      0.00      0.00        12
           3       1.00      0.03      0.06        30
           4       0.00      0.00      0.00       107
           5       0.82      1.00      0.90       790

    accuracy                           0.81       983
   macro avg       0.48      0.25      0.26       983
weighted avg       0.71      0.81      0.74       983

Confusion Matrix:
 [[ 10   0   0   0  34]
 [  4   0   0   0   8]
 [  2   0   1   0  27]
 [  1   0   0   0 106]
 [  0   0   0   1 789]]

Results for K-NN:
Accuracy: 0.7935
              precision    recall  f1-score   support

           1       0.50      0.27      0.35        44
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        30
           4       0.10      0.03      0.04       107
           5       0.83     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
