<a href="https://colab.research.google.com/github/Aiadevop/training_model_wine_quality/blob/main/DS_NL_Clasificaci%C3%B3n_SELECCI%C3%93N_DEL_MODELO_Predicci%C3%B3n_calidad_del_vino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
# Raw-GitHub address of the CSV file this notebook works with.
url = "https://raw.githubusercontent.com/Aiadevop/training_model_wine_quality/refs/heads/main/data/df_cl_final.csv"

# Download and parse the CSV at `url` into the DataFrame that the following cells split.
df_cl = pd.read_csv(filepath_or_buffer=url)

In [2]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split

# Split the complete DataFrame 80/20 (test_size=0.2). random_state is the seed that
# makes the split reproducible; stratify keeps the 'quality' class proportions
# the same in both partitions.
df_train, df_test = train_test_split(
    df_cl, test_size=0.2, random_state=42, stratify=df_cl['quality']
)

# Then separate the features (X) from the target column (y) in each partition.
X_train = df_train.drop(columns='quality')
y_train = df_train['quality']

X_test = df_test.drop(columns='quality')
y_test = df_test['quality']

# Report the partition shapes and the class balance of each partition.
# (The labels below were previously mis-encoded and printed as garbled text.)
print(f"Tamaño df_train: {df_train.shape}")
print(f"Tamaño df_test: {df_test.shape}")
print(f"Distribución train: {y_train.value_counts(normalize=True)}")
print(f"Distribución test: {y_test.value_counts(normalize=True)}")

Tama√±o df_train: (1279, 12)
Tama√±o df_test: (320, 12)
Distribuci√≥n train: quality
1    0.534793
0    0.465207
Name: proportion, dtype: float64
Distribuci√≥n test: quality
1    0.534375
0    0.465625
Name: proportion, dtype: float64


In [3]:
# Build a baseline model with LogisticRegression, since this is a binary classification.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

# Create and train the baseline model.
# (User-facing strings in this cell were previously mis-encoded; they now carry the
# intended emoji and accented characters.)
print("🔄 Entrenando Logistic Regression (baseline)...")
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)

# Predict hard labels and the probability of class 1 on the test set.
y_pred_lr = lr.predict(X_test)
y_pred_proba_lr = lr.predict_proba(X_test)[:, 1]  # column 1 = probability of class 1

# Evaluate the model.
print("\n" + "="*50)
print("📊 RESULTADOS BASELINE - LOGISTIC REGRESSION")
print("="*50)

accuracy = accuracy_score(y_test, y_pred_lr)
roc_auc = roc_auc_score(y_test, y_pred_proba_lr)  # ROC-AUC is computed from probabilities, not labels

print(f"🎯 Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"📈 ROC-AUC: {roc_auc:.3f}")

# If precision and recall are similar for classes 0 and 1, the model is not
# biased towards either class.
print("\n📋 Reporte detallado:")
print(classification_report(y_test, y_pred_lr))

print("🔢 Matriz de confusión:")
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)

# confusion_matrix puts true labels on rows and predicted labels on columns,
# so for labels {0, 1}: [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
print("\n📊 Interpretación de la matriz:")
print(f"Verdaderos Negativos (TN): {cm[0,0]}")
print(f"Falsos Positivos (FP): {cm[0,1]}")
print(f"Falsos Negativos (FN): {cm[1,0]}")
print(f"Verdaderos Positivos (TP): {cm[1,1]}")

# Most influential variables, ranked by the absolute value of their coefficient.
# NOTE(review): coefficient magnitudes are only comparable across variables if the
# features share a common scale — confirm how df_cl was preprocessed.
print("\n🔍 Top 5 variables más influyentes:")
feature_importance = pd.DataFrame({
    'variable': X_train.columns,
    'coeficiente': lr.coef_[0],
    'importancia_abs': abs(lr.coef_[0])
}).sort_values('importancia_abs', ascending=False)

print(feature_importance.head())

print(f"\n✅ BASELINE ESTABLECIDO: {accuracy:.3f} accuracy - Esta es tu métrica a superar!")

üîÑ Entrenando Logistic Regression (baseline)...

üìä RESULTADOS BASELINE - LOGISTIC REGRESSION
üéØ Accuracy: 0.741 (74.1%)
üìà ROC-AUC: 0.824

üìã Reporte detallado:
              precision    recall  f1-score   support

           0       0.71      0.74      0.73       149
           1       0.77      0.74      0.75       171

    accuracy                           0.74       320
   macro avg       0.74      0.74      0.74       320
weighted avg       0.74      0.74      0.74       320

üî¢ Matriz de confusi√≥n:
[[111  38]
 [ 45 126]]

üìä Interpretaci√≥n de la matriz:
Verdaderos Negativos (TN): 111
Falsos Positivos (FP): 38
Falsos Negativos (FN): 45
Verdaderos Positivos (TP): 126

üîç Top 5 variables m√°s influyentes:
                variable  coeficiente  importancia_abs
10               alcohol     1.251893         1.251893
1       volatile_acidity    -0.818797         0.818797
6   total_sulfur_dioxide    -0.682987         0.682987
9              sulphates     0.538690    

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Train several different models to see which one returns the best metrics.
# NOTE(review): every candidate is scored on the same held-out test split, so this
# ranking uses test data for model selection — consider cross-validation on the
# training set instead.

# Candidate classifiers keyed by display name; seeds are fixed where the estimator
# accepts one. SVC needs probability=True to expose predict_proba.
modelos = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# One record (dict) per model, collected in a list.
resultados = []

for nombre, modelo in modelos.items():
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    y_proba = modelo.predict_proba(X_test)[:,1]  # probability of class 1

    acc = accuracy_score(y_test, y_pred)
    # Use a loop-specific name: rebinding `roc_auc` here would silently overwrite
    # the baseline ROC-AUC computed in the previous cell.
    roc_auc_modelo = roc_auc_score(y_test, y_proba)

    resultados.append({
        "Modelo": nombre,
        "Accuracy": acc,
        "ROC-AUC": roc_auc_modelo
    })

# Sort the results by ROC-AUC, best first.
resultados = sorted(resultados, key=lambda x: x["ROC-AUC"], reverse=True)

# Print a fixed-width comparison table.
print(f"{'Modelo':<20} {'Accuracy':<10} {'ROC-AUC':<10}")
print("-"*45)
for r in resultados:
    print(f"{r['Modelo']:<20} {r['Accuracy']:<10.3f} {r['ROC-AUC']:<10.3f}")


Modelo               Accuracy   ROC-AUC   
---------------------------------------------
Random Forest        0.806      0.903     
SVM                  0.741      0.832     
Logistic Regression  0.741      0.824     
KNN                  0.731      0.810     
Naive Bayes          0.722      0.788     
Decision Tree        0.759      0.758     


In [5]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Create both boosting models with a basic configuration and a fixed seed.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
lgbm = LGBMClassifier(random_state=42, verbose=-1)

# XGBoost: train, predict labels and class-1 probabilities, then score on the test set.
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:,1]
acc_xgb = accuracy_score(y_test, y_pred_xgb)
roc_xgb = roc_auc_score(y_test, y_proba_xgb)

# LightGBM: same steps as above, independent of the XGBoost run.
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
y_proba_lgbm = lgbm.predict_proba(X_test)[:,1]
acc_lgbm = accuracy_score(y_test, y_pred_lgbm)
roc_lgbm = roc_auc_score(y_test, y_proba_lgbm)

# Print the results. (The header string was previously mis-encoded and printed
# garbled characters instead of the intended emoji.)
print("📈 RESULTADOS BOOSTING 📈\n")
print(f"XGBoost      - Accuracy: {acc_xgb:.3f}, ROC-AUC: {roc_xgb:.3f}")
print(f"LightGBM     - Accuracy: {acc_lgbm:.3f}, ROC-AUC: {roc_lgbm:.3f}")


üìà RESULTADOS BOOSTING üìà

XGBoost      - Accuracy: 0.825, ROC-AUC: 0.896
LightGBM     - Accuracy: 0.816, ROC-AUC: 0.890


In [6]:
# These results show that the best-performing models are Random Forest and the two boosting models above (XGBoost and LightGBM).
# Next, I will tune the hyperparameters of all three models to see which one gives the best result.