## TUNNING DEL MEJOR MODELO


In [1]:
# ============================================================
# üß∞ LIBRER√çAS PRINCIPALES
# ============================================================
import os
import math
import warnings
from datetime import datetime

# ============================================================
# üìä MANEJO Y AN√ÅLISIS DE DATOS
# ============================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ============================================================
# ü§ñ MACHINE LEARNING - MODELOS
# ============================================================
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree

# ============================================================
# üìà M√âTRICAS Y EVALUACI√ìN DE MODELOS
# ============================================================
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    roc_curve,
    auc,
    roc_auc_score
)

from itertools import cycle


warnings.filterwarnings('ignore')

# Configuraci√≥n de visualizaci√≥n
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

## CARGA DE DATOS - PREPARACI√ìN DE VARIABLES y LABEL ENCODER

In [2]:
df = pd.read_csv('../data/processed/04_dataset_filtrado.csv')

# Convertir columnas a n√∫meros enteros
columnas_a_convertir = ['popularidad', 'a√±o', 'key', 'mode']

for columna in columnas_a_convertir:
    if columna in df.columns:
        # Primero rellenamos los NaN con 0 (o puedes usar otro valor)
        df[columna] = df[columna].fillna(0).astype('int64')

# Verificar los cambios
df[columnas_a_convertir].info()

features_audio = ['popularidad','acousticness', 'danceability', 'energy', 'instrumentalness',
                  'key', 'liveness', 'loudness', 'mode', 'speechiness',
                  'tempo', 'valence']

# Separar caracter√≠sticas (X) y objetivo (y)
X = df[features_audio]
y = df['genero']

# Codificar las etiquetas de g√©nero
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Mostrar la distribuci√≥n con nombres de g√©neros
generos = label_encoder.classes_


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 743 entries, 0 to 742
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   popularidad  743 non-null    int64
 1   a√±o          743 non-null    int64
 2   key          743 non-null    int64
 3   mode         743 non-null    int64
dtypes: int64(4)
memory usage: 23.3 KB


## DIVISI√ìN DE DATOS y NORMALIZACI√ìN DE DATOS

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Normalizaci√≥n completada")

Normalizaci√≥n completada


## ENTRENAMIENTO DE MODELOS

In [4]:
print("ENTRENAMIENTO DE MODELOS")
# Diccionario para almacenar resultados
resultados = {}


# ---------- MODELO 1: K-NEAREST NEIGHBORS (KNN) ----------
print("\nModelo 1: K-Nearest Neighbors (KNN)")
print("-" * 50)

# Buscar el mejor valor de k
print("Buscando el mejor valor de k...")
k_values = range(1, 31)
cv_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_scaled, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

best_k = k_values[np.argmax(cv_scores)]
print(f" Mejor k encontrado: {best_k} (accuracy: {max(cv_scores):.4f})")

# Entrenar KNN con el mejor k
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train_scaled, y_train)
y_pred_knn = knn_best.predict(X_test_scaled)

# M√©tricas
acc_knn = accuracy_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn, average='weighted')

resultados['KNN'] = {
    'modelo': knn_best,
    'accuracy': acc_knn,
    'f1_score': f1_knn,
    'predictions': y_pred_knn
}

print(f" Accuracy: {acc_knn:.4f}")
print(f" F1-Score: {f1_knn:.4f}")



# ---------- MODELO 2: Decision Tress ----------
print("\n Modelo 2: Decision Tress")
print("-" * 50)

dt = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)

acc_dt = accuracy_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt, average='weighted')

resultados['Decision Tress'] = {
    'modelo': dt,
    'accuracy': acc_dt,
    'f1_score': f1_dt,
    'predictions': y_pred_dt
}

print(f" Accuracy: {acc_dt:.4f}")
print(f" F1-Score: {f1_dt:.4f}")



# ---------- MODELO 2b: RANDOM FOREST ----------
print("\n Modelo 2b Random Forest")
print("-" * 50)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

resultados['Random Forest'] = {
    'modelo': rf,
    'accuracy': acc_rf,
    'f1_score': f1_rf,
    'predictions': y_pred_rf
}

print(f" Accuracy: {acc_rf:.4f}")
print(f" F1-Score: {f1_rf:.4f}")



# ---------- MODELO 3: SVM - RBF Kernel ----------
print("\n Modelo 3: Support Vector Machine (SVM) - RBF Kernel")
print("-" * 50)

svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train_scaled, y_train)
y_pred_svm_rbf = svm_rbf.predict(X_test_scaled)

acc_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
f1_svm_rbf = f1_score(y_test, y_pred_svm_rbf, average='weighted')

resultados['SVM (RBF)'] = {
    'modelo': svm_rbf,
    'accuracy': acc_svm_rbf,
    'f1_score': f1_svm_rbf,
    'predictions': y_pred_svm_rbf
}

print(f" Accuracy: {acc_svm_rbf:.4f}")
print(f" F1-Score: {f1_svm_rbf:.4f}")




# ---------- MODELO 4: SVM - Linear Kernel ----------
print("\n Modelo 4: Support Vector Machine (SVM) - Linear Kernel")
print("-" * 50)

svm_linear = SVC(kernel='linear', random_state=42)
svm_linear.fit(X_train_scaled, y_train)
y_pred_svm_linear = svm_linear.predict(X_test_scaled)

acc_svm_linear = accuracy_score(y_test, y_pred_svm_linear)
f1_svm_linear = f1_score(y_test, y_pred_svm_linear, average='weighted')

resultados['SVM (Linear)'] = {
    'modelo': svm_linear,
    'accuracy': acc_svm_linear,
    'f1_score': f1_svm_linear,
    'predictions': y_pred_svm_linear
}

print(f" Accuracy: {acc_svm_linear:.4f}")
print(f" F1-Score: {f1_svm_linear:.4f}")




# ---------- MODELO 5: SVM - Polynomial Kernel ----------
print("\n Modelo 5: Support Vector Machine (SVM) - Polynomial Kernel")
print("-" * 50)

svm_poly = SVC(kernel='poly', degree=3, random_state=42)
svm_poly.fit(X_train_scaled, y_train)
y_pred_svm_poly = svm_poly.predict(X_test_scaled)

acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)
f1_svm_poly = f1_score(y_test, y_pred_svm_poly, average='weighted')

resultados['SVM (Polynomial)'] = {
    'modelo': svm_poly,
    'accuracy': acc_svm_poly,
    'f1_score': f1_svm_poly,
    'predictions': y_pred_svm_poly
}

print(f" Accuracy: {acc_svm_poly:.4f}")
print(f" F1-Score: {f1_svm_poly:.4f}")





# ---------- MODELO 6: SVM - Sigmoid Kernel ----------
print("\n Modelo 6: Support Vector Machine (SVM) - Sigmoid Kernel")
print("-" * 50)

svm_sigmoid = SVC(kernel='sigmoid', random_state=42)
svm_sigmoid.fit(X_train_scaled, y_train)
y_pred_svm_sigmoid = svm_sigmoid.predict(X_test_scaled)

acc_svm_sigmoid = accuracy_score(y_test, y_pred_svm_sigmoid)
f1_svm_sigmoid = f1_score(y_test, y_pred_svm_sigmoid, average='weighted')

resultados['SVM (Sigmoid)'] = {
    'modelo': svm_sigmoid,
    'accuracy': acc_svm_sigmoid,
    'f1_score': f1_svm_sigmoid,
    'predictions': y_pred_svm_sigmoid
}

print(f" Accuracy: {acc_svm_sigmoid:.4f}")
print(f" F1-Score: {f1_svm_sigmoid:.4f}")




# ---------- MODELO 7: REGRESI√ìN LOG√çSTICA ----------
print("\n Modelo 7: Regresi√≥n Log√≠stica")
print("-" * 50)

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial', solver='lbfgs')
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)

acc_log_reg = accuracy_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg, average='weighted')

resultados['Regresi√≥n Log√≠stica'] = {
    'modelo': log_reg,
    'accuracy': acc_log_reg,
    'f1_score': f1_log_reg,
    'predictions': y_pred_log_reg
}

print(f" Accuracy: {acc_log_reg:.4f}")
print(f" F1-Score: {f1_log_reg:.4f}")

ENTRENAMIENTO DE MODELOS

Modelo 1: K-Nearest Neighbors (KNN)
--------------------------------------------------
Buscando el mejor valor de k...
 Mejor k encontrado: 17 (accuracy: 0.5943)
 Accuracy: 0.6174
 F1-Score: 0.5832

 Modelo 2: Decision Tress
--------------------------------------------------
 Accuracy: 0.5034
 F1-Score: 0.4591

 Modelo 2b Random Forest
--------------------------------------------------
 Accuracy: 0.7852
 F1-Score: 0.7755

 Modelo 3: Support Vector Machine (SVM) - RBF Kernel
--------------------------------------------------
 Accuracy: 0.7181
 F1-Score: 0.6972

 Modelo 4: Support Vector Machine (SVM) - Linear Kernel
--------------------------------------------------
 Accuracy: 0.6846
 F1-Score: 0.6525

 Modelo 5: Support Vector Machine (SVM) - Polynomial Kernel
--------------------------------------------------
 Accuracy: 0.5638
 F1-Score: 0.4864

 Modelo 6: Support Vector Machine (SVM) - Sigmoid Kernel
--------------------------------------------------
 Accura

## COMPARACI√ìN DE MODELOS

In [5]:
print("COMPARACI√ìN DE MODELOS")

comparacion = pd.DataFrame({
    'Modelo': list(resultados.keys()),
    'Accuracy': [resultados[m]['accuracy'] for m in resultados],
    'F1-Score': [resultados[m]['f1_score'] for m in resultados]
})

print("\n", comparacion.to_string(index=False))

mejor_modelo_nombre = comparacion.loc[comparacion['Accuracy'].idxmax(), 'Modelo']
mejor_modelo = resultados[mejor_modelo_nombre]['modelo']
mejor_predictions = resultados[mejor_modelo_nombre]['predictions']

print(f"\n MEJOR MODELO: {mejor_modelo_nombre}")
print(f" Accuracy: {resultados[mejor_modelo_nombre]['accuracy']:.4f}")
print(f" F1-Score: {resultados[mejor_modelo_nombre]['f1_score']:.4f}")



# Crear carpeta si no existe
os.makedirs("../reports/models", exist_ok=True)
# Crear nombre del archivo con fecha/hora
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
nombre_archivo = f"../reports/models/resultados_modelos_{timestamp}.txt"

# Guardar los resultados
with open(nombre_archivo, "w", encoding="utf-8") as f:
    f.write("COMPARACI√ìN DE MODELOS\n")
    f.write("="*60 + "\n\n")
    f.write(comparacion.to_string(index=False))
    
    # Escribir mejor modelo
    f.write(f"\n\nMEJOR MODELO: {mejor_modelo_nombre}\n")
    f.write(f"Accuracy: {resultados[mejor_modelo_nombre]['accuracy']:.4f}\n")
    f.write(f"F1-Score: {resultados[mejor_modelo_nombre]['f1_score']:.4f}\n")

print(f"Resultados guardados en: {nombre_archivo}")


COMPARACI√ìN DE MODELOS

              Modelo  Accuracy  F1-Score
                KNN  0.617450  0.583190
     Decision Tress  0.503356  0.459053
      Random Forest  0.785235  0.775460
          SVM (RBF)  0.718121  0.697175
       SVM (Linear)  0.684564  0.652470
   SVM (Polynomial)  0.563758  0.486408
      SVM (Sigmoid)  0.583893  0.571349
Regresi√≥n Log√≠stica  0.691275  0.662331

 MEJOR MODELO: Random Forest
 Accuracy: 0.7852
 F1-Score: 0.7755
Resultados guardados en: ../reports/models/resultados_modelos_2025-11-10_19-50-53.txt


## REPORTE DETALLADO DEL MEJOR MODELO

In [6]:
print(f"REPORTE DETALLADO - {mejor_modelo_nombre}")

print("\nReporte de Clasificaci√≥n:")
print(classification_report(y_test, mejor_predictions,target_names=label_encoder.classes_))

REPORTE DETALLADO - Random Forest

Reporte de Clasificaci√≥n:
              precision    recall  f1-score   support

       blues       1.00      0.75      0.86         4
     clasica       1.00      0.67      0.80         3
     country       0.82      0.82      0.82        11
 electronica       1.00      0.92      0.96        12
      hiphop       0.80      0.93      0.86        30
    hyperpop       1.00      0.56      0.71         9
        jazz       0.80      1.00      0.89         8
        kpop       0.47      0.57      0.52        14
         pop       0.33      0.20      0.25        10
    regueton       0.73      0.61      0.67        18
        rock       0.88      1.00      0.94        30

    accuracy                           0.79       149
   macro avg       0.80      0.73      0.75       149
weighted avg       0.79      0.79      0.78       149



## BUSQUEDA DE LOS MEJORES HIPERPARAMETROS PARA RANDOM FOREST

In [7]:
# üîß 1Ô∏è‚É£ Definir el modelo base
rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)

# üîç 2Ô∏è‚É£ Definir la grilla de hiperpar√°metros a probar
param_grid = {
    'n_estimators': [100,150],              # N√∫mero de √°rboles
    'max_depth': [13,14,15],              # Profundidad m√°xima de los √°rboles
    'min_samples_split': [4,5,6],              # M√≠nimo de muestras para dividir un nodo
    'min_samples_leaf': [1,2,3],                # M√≠nimo de muestras por hoja
    'class_weight': ['balanced'],              # Peso de las clases
    'criterion': ['gini'],          # Medida
    'bootstrap': [False]                    # Si usar muestreo con reemplazo
}

# ‚öôÔ∏è 3Ô∏è‚É£ Configurar la b√∫squeda con validaci√≥n cruzada
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=5,                      # 5 particiones de validaci√≥n cruzada
    scoring='f1_weighted',     # m√©trica para comparar
    n_jobs=-1,                 # usa todos los n√∫cleos disponibles
    verbose=2                  # muestra el progreso
)

# üöÄ 4Ô∏è‚É£ Ejecutar la b√∫squeda
grid_search.fit(X_train_scaled, y_train)

# üìà 5Ô∏è‚É£ Mostrar los mejores resultados
print("‚úÖ Mejor combinaci√≥n de hiperpar√°metros:")
print(grid_search.best_params_)
print(f"\nüèÜ Mejor F1-score promedio: {grid_search.best_score_:.4f}")

# üíæ 6Ô∏è‚É£ Guardar el mejor modelo
mejor_modelo = grid_search.best_estimator_

# üß© 7Ô∏è‚É£ Evaluar en el conjunto de prueba
y_pred_best = mejor_modelo.predict(X_test_scaled)
print("\nüìä Evaluaci√≥n del mejor modelo:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_best, average='weighted'):.4f}")
print("\nReporte de Clasificaci√≥n:")
print(classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 54 candidates, totalling 270 fits
‚úÖ Mejor combinaci√≥n de hiperpar√°metros:
{'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}

üèÜ Mejor F1-score promedio: 0.7268

üìä Evaluaci√≥n del mejor modelo:
Accuracy: 0.7987
F1-score: 0.7941

Reporte de Clasificaci√≥n:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       1.00      1.00      1.00         3
           2       1.00      0.82      0.90        11
           3       1.00      0.92      0.96        12
           4       0.80      0.93      0.86        30
           5       1.00      0.56      0.71         9
           6       0.89      1.00      0.94         8
           7       0.47      0.64      0.55        14
           8       0.56      0.50      0.53        10
           9       0.69      0.50      0.58        18
        

# MODELO FINAL

In [None]:
print("\n Mejor Modelo Modificado: Random Forest")
print("-" * 50)

rf = RandomForestClassifier(
    n_estimators=150,
    random_state=42,
    n_jobs=-1,
    max_depth=14,
    min_samples_leaf=1,
    min_samples_split=5,
    class_weight='balanced',
    criterion='gini',
    bootstrap = False

    )

rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

resultados['Random Forest'] = {
    'modelo': rf,
    'accuracy': acc_rf,
    'f1_score': f1_rf,
    'predictions': y_pred_rf
}

print(f" Accuracy: {acc_rf:.4f}")
print(f" F1-Score: {f1_rf:.4f}")


 Modelo 2: Random Forest
--------------------------------------------------
 Accuracy: 0.7987
 F1-Score: 0.7941
