# Fase 4: Modeling - Classification (Categorical)


In [None]:
# Celda 1: Importar librerías y preparar datos

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing  import StandardScaler
from sklearn.pipeline        import Pipeline
from sklearn.linear_model    import LogisticRegression
from sklearn.svm             import SVC
from sklearn.tree            import DecisionTreeClassifier
from sklearn.ensemble        import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network  import MLPClassifier
from sklearn.metrics         import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Load the engineered feature table.
data_path = '/Users/luissalamanca/Desktop/Duoc/Machine/ML_Proyecto_Semestral/data/03_features/engineered_data.csv'
data = pd.read_csv(data_path, sep=';')

# If reading with ';' left everything in a single column whose header contains
# commas, split that column on ',' and convert every resulting column to a
# numeric dtype (values that cannot be parsed become NaN).
if data.shape[1] == 1:
    column_name = data.columns[0]
    if ',' in column_name:
        new_columns = column_name.split(',')
        data_split = data[column_name].str.split(',', expand=True)
        data_split.columns = new_columns
        data_split = data_split.apply(pd.to_numeric, errors='coerce')
        data = data_split

# Build the multiclass target by binning EffectivenessScore into four levels.
level_names = ['Bajo', 'Medio', 'Alto', 'Experto']
score_as_float = data['EffectivenessScore'].astype(float)
data['EffectivenessLevel'] = pd.cut(
    score_as_float,
    bins=[-0.1, 0.5, 1.5, 5, np.inf],
    labels=level_names,
)

print(data['EffectivenessLevel'].value_counts())
print('\nEstadísticas de EffectivenessScore:\n', score_as_float.describe())

# Plot how many rows fall into each class, in level order.
plt.figure(figsize=(6, 3))
sns.countplot(x='EffectivenessLevel', data=data, order=level_names)
plt.title('Distribución de clases: EffectivenessLevel')
plt.show()

# Feature columns and target.
features = [
    'EconomicEfficiency',
    'EquipmentAdvantage',
    'KillAssistRatio',
    'StealthKillsRatio',
    'KDA',
]
X = data[features]
y = data['EffectivenessLevel']

# Stratified train/test split with 30% of the rows held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardise the features: statistics are learned on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames so columns stay addressable by name and
# rows keep the index of the split they came from.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)

print("Formas de los datos:")
print(f"  X_train:           {X_train.shape}")
print(f"  X_test:            {X_test.shape}")
print(f"  X_train_scaled_df: {X_train_scaled_df.shape}")
print(f"  X_test_scaled_df:  {X_test_scaled_df.shape}")


In [None]:
# Logistic Regression: grid search over C and solver, then evaluation on the
# held-out test split. The inputs are the already-standardised feature frames,
# so the pipeline contains only the classifier step.
pipe_lr = Pipeline([
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# NOTE(review): recent scikit-learn releases appear to warn about using
# 'liblinear' on multiclass targets — confirm against the installed version.
param_grid_lr = {
    'clf__C': [0.1, 1, 10],
    'clf__solver': ['lbfgs', 'liblinear']
}

# The target has four classes. The plain 'f1' scorer uses average='binary' and
# cannot score a multiclass target, so candidates could not be ranked.
# 'f1_macro' averages the per-class F1 with equal weight for every class.
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=5, scoring='f1_macro', n_jobs=-1)
grid_lr.fit(X_train_scaled_df, y_train)

# Predictions and metrics on the test split.
y_pred_lr = grid_lr.predict(X_test_scaled_df)

print("Mejores parámetros:", grid_lr.best_params_)
print(classification_report(y_test, y_pred_lr, digits=3))

# Fix the class order explicitly (taken from the categorical target) and show
# it on both axes; otherwise the heatmap only displays positional indices.
class_labels = list(y_test.cat.categories)
cm = confusion_matrix(y_test, y_pred_lr, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de confusión - Logistic Regression")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.show()


In [None]:
# Random Forest: grid search over the number of trees and the maximum depth,
# then evaluation on the held-out test split.
pipe_rf = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid_rf = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [None, 10, 20]
}

# The target has four classes. The plain 'f1' scorer uses average='binary' and
# cannot score a multiclass target, so candidates could not be ranked.
# 'f1_macro' averages the per-class F1 with equal weight for every class.
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=3, scoring='f1_macro', n_jobs=-1)
grid_rf.fit(X_train_scaled_df, y_train)

# Predictions and metrics on the test split.
y_pred_rf = grid_rf.predict(X_test_scaled_df)

print("Mejores parámetros:", grid_rf.best_params_)
print(classification_report(y_test, y_pred_rf, digits=3))

# Fix the class order explicitly (taken from the categorical target) and show
# it on both axes; otherwise the heatmap only displays positional indices.
class_labels = list(y_test.cat.categories)
cm = confusion_matrix(y_test, y_pred_rf, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de confusión - Random Forest")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.show()


In [None]:
# Support Vector Classifier: grid search over C and the kernel, then
# evaluation on the held-out test split. The inputs are the
# already-standardised feature frames.
pipe_svc = Pipeline([
    ('clf', SVC())
])

param_grid_svc = {
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf']
}

# The target has four classes. The plain 'f1' scorer uses average='binary' and
# cannot score a multiclass target, so candidates could not be ranked.
# 'f1_macro' averages the per-class F1 with equal weight for every class.
grid_svc = GridSearchCV(pipe_svc, param_grid_svc, cv=3, scoring='f1_macro', n_jobs=-1)
grid_svc.fit(X_train_scaled_df, y_train)

# Predictions and metrics on the test split.
y_pred_svc = grid_svc.predict(X_test_scaled_df)

print("Mejores parámetros:", grid_svc.best_params_)
print(classification_report(y_test, y_pred_svc, digits=3))

# Fix the class order explicitly (taken from the categorical target) and show
# it on both axes; otherwise the heatmap only displays positional indices.
class_labels = list(y_test.cat.categories)
cm = confusion_matrix(y_test, y_pred_svc, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de confusión - SVC")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.show()


In [None]:
# Gradient Boosting: grid search over the number of boosting stages and the
# learning rate, then evaluation on the held-out test split.
pipe_gb = Pipeline([
    ('clf', GradientBoostingClassifier(random_state=42))
])

param_grid_gb = {
    'clf__n_estimators': [50, 100],
    'clf__learning_rate': [0.05, 0.1, 0.2]
}

# The target has four classes. The plain 'f1' scorer uses average='binary' and
# cannot score a multiclass target, so candidates could not be ranked.
# 'f1_macro' averages the per-class F1 with equal weight for every class.
grid_gb = GridSearchCV(pipe_gb, param_grid_gb, cv=3, scoring='f1_macro', n_jobs=-1)
grid_gb.fit(X_train_scaled_df, y_train)

# Predictions and metrics on the test split.
y_pred_gb = grid_gb.predict(X_test_scaled_df)

print("Mejores parámetros:", grid_gb.best_params_)
print(classification_report(y_test, y_pred_gb, digits=3))

# Fix the class order explicitly (taken from the categorical target) and show
# it on both axes; otherwise the heatmap only displays positional indices.
class_labels = list(y_test.cat.categories)
cm = confusion_matrix(y_test, y_pred_gb, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de confusión - Gradient Boosting")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.show()


In [None]:
# Multi-layer perceptron: grid search over the hidden-layer size and the
# activation function, then evaluation on the held-out test split. The inputs
# are the already-standardised feature frames.
pipe_mlp = Pipeline([
    ('clf', MLPClassifier(max_iter=300, random_state=42))
])

param_grid_mlp = {
    'clf__hidden_layer_sizes': [(50,), (100,)],
    'clf__activation': ['relu', 'tanh']
}

# The target has four classes. The plain 'f1' scorer uses average='binary' and
# cannot score a multiclass target, so candidates could not be ranked.
# 'f1_macro' averages the per-class F1 with equal weight for every class.
grid_mlp = GridSearchCV(pipe_mlp, param_grid_mlp, cv=3, scoring='f1_macro', n_jobs=-1)
grid_mlp.fit(X_train_scaled_df, y_train)

# Predictions and metrics on the test split.
y_pred_mlp = grid_mlp.predict(X_test_scaled_df)

print("Mejores parámetros:", grid_mlp.best_params_)
print(classification_report(y_test, y_pred_mlp, digits=3))

# Fix the class order explicitly (taken from the categorical target) and show
# it on both axes; otherwise the heatmap only displays positional indices.
class_labels = list(y_test.cat.categories)
cm = confusion_matrix(y_test, y_pred_mlp, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de confusión - MLPClassifier")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.show()
