# ***Modelo KNN (K-Vecinos Más Cercanos)***

In [None]:
# Load the Kedro IPython extension. `catalog` is used in the following cells
# without being defined in this notebook — presumably this extension injects it
# (TODO confirm against the Kedro version in use).
%load_ext kedro.ipython 

In [None]:
# Show the keys of the catalog (presumably the registered dataset names) as the
# cell output, to check which datasets can be loaded.
catalog.keys()

In [None]:
# Load "model_input_table" and display it as the cell output; the result is not
# stored in a variable here.
catalog.load("model_input_table")

In [None]:
# Load "model_input_table" from the catalog and keep it as the working table
# used by the rest of the notebook.
df_FIFA = catalog.load("model_input_table")

#### **Importaciones**

In [None]:
# -- Tratamiento de datos --
import numpy as np
import pandas as pd

# -- Gráficos -- 
import seaborn as sns
from matplotlib import style
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import seaborn as sb

# -- Procesado y modelado --
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# -- Métricas para modelos de clasificación --
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

#----  Curva ROC y PR ----
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay, average_precision_score

# -- GridSearchCV -- 
from sklearn.model_selection import GridSearchCV

In [None]:
# Correlation heatmap restricted to the numeric columns of the table.
numeric_df = df_FIFA.select_dtypes(include=np.number)
correlation_matrix = numeric_df.corr()

# Large canvas; every cell is annotated with its coefficient (two decimals).
fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix of Numeric Columns')
plt.show()

## **Desarrollo del Modelo KNN**

#### **Selección de características**

In [None]:
# Predictor columns used by the model.
X = df_FIFA[["Potential", 'Reactions', 'Composure']]

# Target selected with single brackets so that it is a 1-D Series. A
# single-column DataFrame (double brackets) is a column vector, and scikit-learn
# classifiers emit a DataConversionWarning on every fit when given one.
y = df_FIFA['Overall_Class_Bin']

# Other columns considered as possible classification targets:
#   Overall_Class
#   Overall_Class_Encoded
#   Best Position
#   Position
#   Preferred Foot
#   Work Rate
#   Body Type
#   Nationality
#   Club
#   Best_Position_Grouped_FW
#   Best_Position_Grouped_GK
#   Best_Position_Grouped_MF
#
# Open question: is Preferred Foot a sensible target? (probably not)

#### **División de los datos en entrenamiento y prueba**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# The scaler learns its min/max from the training split only and is then
# applied to both splits. From here on X_train and X_test are NumPy arrays.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

#### **Creación y entrenamiento del modelo KNN**

In [None]:
# Number of neighbours that vote on each prediction.
n_neighbors = 200

modelo_KNN = KNeighborsClassifier(n_neighbors=n_neighbors)

# scikit-learn expects a 1-D target. np.ravel flattens y_train whether it is a
# single-column DataFrame or a Series, which avoids the DataConversionWarning
# raised for column-vector targets.
modelo_KNN.fit(X_train, np.ravel(y_train))

#### **Predicciones sobre el conjunto de prueba**

In [None]:
# Predict a class label for every row of the (scaled) test set.
y_pred = modelo_KNN.predict(X_test)

### **Métricas de evaluación**

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report,
# each starting on its own line.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# Confusion matrix of the KNN model drawn as an annotated heatmap of integer counts.
fig, ax = plt.subplots(figsize=(8, 6))
sb.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt="d",
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix for KNN Model")
plt.show()

In [None]:
# Sensitivity and specificity derived from the confusion matrix.
# The unpacking below only works for a binary target (2x2 matrix): the first
# row holds (tn, fp) and the second row holds (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# Sensitivity (recall): share of actual positives that were predicted positive
sensitivity = tp / (fn + tp)

# Specificity: share of actual negatives that were predicted negative
specificity = tn / (fp + tn)

print(f"Sensitivity (Recall): {sensitivity:.2f}")
print(f"Specificity: {specificity:.2f}")

In [None]:

# Decision regions of modelo_KNN over the first two scaled features, with the
# test points drawn on top.

# Light colours fill the predicted regions; bold colours mark the test points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00'])

# Grid step
h = .02

# Padding around the data. X_test comes out of MinMaxScaler, so its columns
# span roughly [0, 1]; a padding of 1 would squeeze the data into the middle
# third of each axis.
margin = 0.1

# Plot range for the first two features
x_min, x_max = X_test[:, 0].min() - margin, X_test[:, 0].max() + margin
y_min, y_max = X_test[:, 1].min() - margin, X_test[:, 1].max() + margin
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Every grid row starts as the per-feature mean of X_test, so any feature
# beyond the first two is held at its mean; the first two columns are then
# overwritten with the grid coordinates.
X_grid = np.tile(np.mean(X_test, axis=0), (xx.size, 1))
X_grid[:, 0] = xx.ravel()
X_grid[:, 1] = yy.ravel()

# Predict a class for every grid point and restore the grid shape
Z = modelo_KNN.predict(X_grid)
Z = Z.reshape(xx.shape)

# Plot
plt.figure(figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test.to_numpy().flatten(), cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title(f"Clasificación KNN (k = {modelo_KNN.n_neighbors})")
plt.xlabel('Característica 1 (Escalada)')
plt.ylabel('Característica 2 (Escalada)')

# Legend (colours match cmap_bold)
patch_0 = mpatches.Patch(color='#FF0000', label='Clase 0')
patch_1 = mpatches.Patch(color='#00FF00', label='Clase 1')
plt.legend(handles=[patch_0, patch_1])

plt.show()


#### **Curva ROC**

In [None]:
# Predicted probability of the second class (column 1), taken as the positive class
y_pred_proba = modelo_KNN.predict_proba(X_test)[:, 1]

# ROC points and the area under the curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# ROC curve plus the diagonal of a random classifier as a reference
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='Curva ROC (AUC = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Tasa de Falsos Positivos')
ax.set_ylabel('Tasa de Verdaderos Positivos')
ax.set_title('Curva Característica de Operación del Receptor (ROC)')
ax.legend(loc="lower right")
plt.show()

#### **Curva PR**

In [None]:
# Predicted probability of the second class (column 1), taken as the positive class
y_pred_proba = modelo_KNN.predict_proba(X_test)[:, 1]

# Precision/recall pairs over all decision thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Average precision summarises the PR curve in a single number
ap_score = average_precision_score(y_test, y_pred_proba)

# Draw the curve on an explicit axes and put the AP score in the title
fig, ax = plt.subplots()
pr_display = PrecisionRecallDisplay(precision=precision, recall=recall)
pr_display.plot(ax=ax)
ax.set_title(f'Precision-Recall Curve for KNN Model (AP = {ap_score:.2f})')
ax.grid(True)
fig.tight_layout()
plt.show()

### **GridSearchCV**

In [None]:
# Hyperparameter search for KNN with 5-fold cross-validation.
#
# `p` is deliberately absent from the grid: it is the power parameter of the
# Minkowski metric and is ignored when `metric` is "euclidean" or "manhattan",
# so listing it only duplicated every candidate and doubled the number of fits.
#
# A wider alternative for the neighbour count: {'n_neighbors': np.arange(1, 40)}
param_grid = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
    "algorithm": ["auto"],
}

# Base estimator; its hyperparameters are set by the search
knn = KNeighborsClassifier()

# 5-fold cross-validation, candidates ranked by accuracy
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Fit on the training split. The target is flattened to 1-D so that a
# single-column DataFrame does not trigger a DataConversionWarning on every fold.
grid_search.fit(X_train, np.ravel(y_train))

# Best hyperparameters and their mean cross-validated score
print("Mejores parámetros:", grid_search.best_params_)
print("Mejor puntuación de cross-validation:", grid_search.best_score_)

# Best model found by the search (refit on the whole training split by
# GridSearchCV's default refit=True)
best_knn = grid_search.best_estimator_

# Accuracy of the best model on the held-out test split
test_accuracy = best_knn.score(X_test, y_test)
print("Accuracy en el conjunto de prueba con los mejores parámetros:", test_accuracy)

# Test-set predictions of the best model, used by the following cells
y_pred_gs = best_knn.predict(X_test)


In [None]:

# Classification metrics for the best model found by the grid search
print("\nClassification Report con los mejores parámetros:")
print(classification_report(y_test, y_pred_gs))

# Confusion matrix as an annotated heatmap of integer counts
print("\nConfusion Matrix con los mejores parámetros:")
fig, ax = plt.subplots(figsize=(8, 6))
sb.heatmap(
    confusion_matrix(y_test, y_pred_gs),
    annot=True,
    fmt="d",
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix for KNN Model")
plt.show()

# Overall accuracy on the test split
print("\nAccuracy Score (Grid Search):")
print(accuracy_score(y_test, y_pred_gs))

In [None]:
# Decision regions of the tuned model (best_knn) over the first two features,
# with the test points drawn on top.

# Plain NumPy views of the test data. np.asarray / np.ravel accept DataFrames,
# Series and ndarrays alike, and np.ravel always yields a 1-D label vector.
X_test_np = np.asarray(X_test)
y_test_np = np.ravel(y_test)

# Grid step
h = 0.02

# Padding around the data. The features are MinMax-scaled (roughly [0, 1]), so
# a padding of 1 would squeeze the data into the middle third of each axis.
margin = 0.1

# Build the mesh over the first two features
x_min, x_max = X_test_np[:, 0].min() - margin, X_test_np[:, 0].max() + margin
y_min, y_max = X_test_np[:, 1].min() - margin, X_test_np[:, 1].max() + margin
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Every grid row starts as the per-feature mean of the test set, so any feature
# beyond the first two is held at its mean; the first two columns are then
# overwritten with the grid coordinates.
n_samples = xx.size
X_grid = np.tile(np.mean(X_test_np, axis=0), (n_samples, 1))
X_grid[:, 0] = xx.ravel()
X_grid[:, 1] = yy.ravel()

# Predict a class for every grid point and restore the grid shape
Z = best_knn.predict(X_grid)
Z = Z.reshape(xx.shape)

# Light colours fill the predicted regions; bold colours mark the test points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00'])

plt.figure(figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
plt.scatter(X_test_np[:, 0], X_test_np[:, 1], c=y_test_np, cmap=cmap_bold, edgecolor='k', s=30)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title(f"Frontera de decisión KNN (k={best_knn.n_neighbors})")

# Legend (colours match cmap_bold)
patch_0 = mpatches.Patch(color='#FF0000', label='Clase 0')
patch_1 = mpatches.Patch(color='#00FF00', label='Clase 1')
plt.legend(handles=[patch_0, patch_1])

plt.show()

#### **Sensitivity y Specificity modelo con GridSearchCV**

In [None]:
# Sensitivity and specificity of the grid-search model, read off its confusion
# matrix (row 0: true negatives / false positives, row 1: false negatives /
# true positives).
cm_best = confusion_matrix(y_test, y_pred_gs)

TN_best, FP_best = cm_best[0, 0], cm_best[0, 1]
FN_best, TP_best = cm_best[1, 0], cm_best[1, 1]

# Sensitivity: share of actual positives predicted positive
sensitivity_best = TP_best / (FN_best + TP_best)
# Specificity: share of actual negatives predicted negative
specificity_best = TN_best / (FP_best + TN_best)

print(f"\nSensitivity (Best Model): {sensitivity_best:.4f}")
print(f"Specificity (Best Model): {specificity_best:.4f}")

### **Curva ROC y Curva PR de GridSearchCV**

In [None]:
# Predicted probability of the second class (column 1), taken as the positive class
y_pred_proba = best_knn.predict_proba(X_test)[:, 1]

# === ROC CURVE ===
fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# ROC curve plus the diagonal of a random classifier as a reference
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='Curva ROC (AUC = %0.2f)' % roc_auc)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('Tasa de Falsos Positivos')
ax_roc.set_ylabel('Tasa de Verdaderos Positivos')
ax_roc.set_title('Curva ROC del Mejor Modelo (KNN)')
ax_roc.legend(loc="lower right")
ax_roc.grid(True)
plt.show()

# Separator printed between the two figures
print("-")

# === PRECISION-RECALL CURVE ===
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_pred_proba)
average_precision = average_precision_score(y_test, y_pred_proba)

fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(recall, precision, color='blue', lw=2, label='Curva PR (AP = %0.2f)' % average_precision)
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Curva Precision-Recall del Mejor Modelo (KNN)')
ax_pr.legend(loc="lower left")
ax_pr.grid(True)
plt.show()