# ***Modelo Decision Tree Classifier***

In [None]:
%load_ext kedro.ipython 

In [None]:
# List the dataset names registered in the data catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the kedro.ipython extension loaded in the first cell — confirm.
catalog.keys()

In [None]:
# Preview only the first rows of the model input table; rendering the whole
# table as cell output bloats the notebook without adding information.
catalog.load("model_input_table").head()

In [None]:
# Load the model input table from the catalog. The cells below index it by
# column name, so it is expected to be a pandas DataFrame.
df_FIFA = catalog.load("model_input_table")

**Importaciones**

In [None]:
# -- Tratamiento de datos --
import numpy as np
import pandas as pd

# -- Gráficos -- 
import seaborn as sns
from matplotlib import style
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import seaborn as sb

# -- Procesado y modelado --
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import sklearn.tree # Árboles de decisión

# -- Metricas para modelos de clasificación --
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

#----  Curva ROC y PR ----
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay, average_precision_score

# -- GridSearchCV -- 
from sklearn.model_selection import GridSearchCV

#### **Selección de características**

In [None]:
# Predictor columns and the class-label column used to fit the classifier.
FEATURE_COLUMNS = ["Age", "Finishing", "Dribbling", "International Reputation"]
TARGET_COLUMN = "Overall_Class_Bin"

# Selecting with a list keeps X two-dimensional (DataFrame); selecting with a
# single label yields a one-dimensional Series for y, so double brackets are
# not needed for the target.
X = df_FIFA[FEATURE_COLUMNS]
y = df_FIFA[TARGET_COLUMN]

#### **División de los datos en entrenamiento y prueba**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

#### **Creación y entrenamiento del modelo Decision Tree Classifier**

In [None]:
# Decision tree with two size constraints: at most 5 levels deep and at least
# 100 training samples in every leaf. The seed makes training reproducible.
modelo = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=100,
    random_state=42,
)



In [None]:
# Fit the tree on the training split only; the test split is kept for evaluation.
modelo.fit(X_train, y_train)

#### **Predicciones sobre el conjunto de prueba**


In [None]:
# Predicted class labels for the held-out test rows; the metric cells below use them.
y_pred = modelo.predict(X_test)

### **Métricas de evaluación**

In [None]:
# Raw confusion matrix (rows = true class, columns = predicted class),
# followed by the per-class precision / recall / F1 report.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_confusion)
print(test_report)

In [None]:
# Heatmap of the test-set confusion matrix for the decision tree `modelo`.
plt.figure(figsize=(8, 6))
sb.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
# The title used to say "KNN Model", but the plotted model is a DecisionTreeClassifier.
plt.title("Confusion Matrix for Decision Tree Classification Model")
plt.show()

#### **Calculando Sensitivity y Specificity**

In [None]:
# For a binary target scikit-learn lays the matrix out as [[TN, FP], [FN, TP]],
# so flattening it yields the four counts in that order.
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# Sensitivity is the recall of the positive class (true-positive rate);
# specificity is the recall of the negative class (true-negative rate).
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

print(f"\nSensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")

#### **Visualizando el árbol de decisión**

In [None]:
# Draw the fitted tree. Class labels come from the model itself rather than
# '-' placeholders, so each node shows which class it predicts.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    modelo,
    # A plain list is used because some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    # classes_ is sorted in ascending order, which is the order plot_tree expects.
    class_names=[str(label) for label in modelo.classes_],
    filled=True,
)
plt.show()

#### **Curva ROC**

In [None]:
# Probability of the second class in `modelo.classes_`, used as the positive-class score.
y_pred_proba = modelo.predict_proba(X_test)[:, 1]

# ROC points across all score thresholds and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Draw on an explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

print(f"\nAUC: {roc_auc}")

#### **Curva PR**

In [None]:
# Precision/recall pairs across all score thresholds, plus the average
# precision that summarises the curve. Reuses `y_pred_proba` from the ROC cell.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
average_precision = average_precision_score(y_test, y_pred_proba)

# Draw on an explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(
    recall,
    precision,
    color='blue',
    lw=2,
    label='Precision-Recall curve (area = %0.2f)' % average_precision,
)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="lower left")
plt.show()

print(f"\nAverage Precision (AP): {average_precision}")


### **GridSearchCV**

In [None]:
# Modelo base
# Base estimator for the search; the seed keeps every candidate tree reproducible.
model = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid. The class_weight dictionaries assume the target labels
# are 0 and 1. The former {0: 1, 1: 1} entry was dropped: it weights both
# classes equally, exactly like None, so it only repeated the same fits.
param_grid = {
    'class_weight': [None, {0: 1, 1: 1.5}, {0: 1, 1: 2}, {0: 1, 1: 3}],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'  # or 'f1_macro'
)

# Fit every candidate on the training data.
grid_search.fit(X_train, y_train)

# Search results.
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)

print("\nMejor puntuación de validación cruzada:")
print(grid_search.best_score_)

# Best estimator found by the search (GridSearchCV refits it on the full training split by default).
best_model = grid_search.best_estimator_

# Evaluate the best estimator on the held-out test split.
y_pred_best = best_model.predict(X_test)
print("\nEvaluación en el conjunto de prueba:")
print("Classification Report:")
print(classification_report(y_test, y_pred_best))

# A dangling "Confusion Matrix:" header with nothing after it was removed;
# the single header below introduces the heatmap.
print("\nConfusion Matrix con los mejores parámetros:")
plt.figure(figsize=(8, 6))
sb.heatmap(confusion_matrix(y_test, y_pred_best), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Decision Tree Classification Model")
plt.show()

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred_best))

#### **Sensitivity y Specificity modelo con GridSearchCV**

In [None]:
# For a binary target scikit-learn lays the matrix out as [[TN, FP], [FN, TP]],
# so flattening it yields the four counts in that order.
cm_best = confusion_matrix(y_test, y_pred_best)
TN_best, FP_best, FN_best, TP_best = cm_best.ravel()

# True-positive rate and true-negative rate of the tuned model on the test split.
sensitivity_best = TP_best / (TP_best + FN_best)
specificity_best = TN_best / (TN_best + FP_best)

print(f"\nSensitivity (Best Model): {sensitivity_best:.4f}")
print(f"Specificity (Best Model): {specificity_best:.4f}")

#### **Gráfica del mejor árbol de decisión GridSearchCV**


In [None]:
# Draw the tuned tree. Class labels come from the model itself: the previous
# 'Loss'/'Win' names do not describe the Overall_Class_Bin target.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    best_model,
    # A plain list is used because some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    # classes_ is sorted in ascending order, which is the order plot_tree expects.
    class_names=[str(label) for label in best_model.classes_],
    filled=True,
)
plt.show()
     

#### **Curva ROC y PR GridSearchCV**

In [None]:
# Probability of the second class in `best_model.classes_`, used as the positive-class score.
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

# --- ROC curve for the tuned model ---
fpr_best, tpr_best, thresholds_best = roc_curve(y_test, y_pred_proba_best)
roc_auc_best = auc(fpr_best, tpr_best)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr_best, tpr_best, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_best)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve - Best Model')
roc_ax.legend(loc="lower right")
plt.show()

print(f"\nAUC for Best Model: {roc_auc_best}")

# --- Precision-Recall curve for the tuned model ---
precision_best, recall_best, _ = precision_recall_curve(y_test, y_pred_proba_best)
average_precision_best = average_precision_score(y_test, y_pred_proba_best)

pr_fig, pr_ax = plt.subplots()
pr_ax.plot(
    recall_best,
    precision_best,
    color='blue',
    lw=2,
    label='Precision-Recall curve (area = %0.2f)' % average_precision_best,
)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve - Best Model')
pr_ax.legend(loc="lower left")
plt.show()

print(f"\nAverage Precision (AP) for Best Model: {average_precision_best}")