## **08.KNN**

## **1. Instalaciones e Importaciones**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import *
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
import shap
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Colab-only step: upload files into the runtime; the call's result is kept in
# `uploaded`.
# NOTE(review): presumably 'winequality-red.csv' (read in the next cell) is the
# file expected here — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Display every column when a frame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

# The CSV is read with ';' as the field separator.
df = pd.read_csv('winequality-red.csv', sep=';')

# Preview the first ten rows.
df.head(10)

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# For every column, print a table with the absolute count and the percentage
# of each distinct value (missing values included).
for column in df.columns:
    print(f"\nColumna: {column}")

    values = df[column]
    summary = pd.DataFrame({
        'Conteo': values.value_counts(dropna=False),
        'Porcentaje (%)': (
            values.value_counts(normalize=True, dropna=False) * 100
        ).round(2),
    })

    print(summary)

In [None]:
# Grid of bar charts: one chart of value frequencies per column of `df`.
n_cols = len(df.columns)

# Number of charts per grid row; any positive value works.
cols_per_row = 3
# Keep at least one row so plt.subplots always receives a valid grid.
rows = max(1, math.ceil(n_cols / cols_per_row))

# squeeze=False always returns a 2-D array of axes, so flattening is safe even
# for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
fig, axes = plt.subplots(
    rows,
    cols_per_row,
    figsize=(7 * cols_per_row, 5 * rows),
    squeeze=False,
)
axes = axes.flatten()

for i, col in enumerate(df.columns):
    # Frequencies of each distinct value, NaN included.
    conteos = df[col].value_counts(dropna=False)
    sns.barplot(x=conteos.index.astype(str), y=conteos.values, ax=axes[i])
    axes[i].set_title(f'{col}', fontsize=14)
    axes[i].set_xlabel('')
    axes[i].set_ylabel('Conteo', fontsize=12)
    axes[i].tick_params(axis='x', rotation=45, labelsize=10)
    axes[i].tick_params(axis='y', labelsize=10)

# Hide the grid cells left over after the last column. Slicing by n_cols does
# not depend on the loop variable, which is undefined for an empty frame.
for unused_ax in axes[n_cols:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Columns with a single distinct value carry no information; preview them.
# (The earlier un-displayed duplicate of this selection was dropped: it built
# a frame and discarded it.)
unicos = df.nunique()
df[unicos[unicos == 1].index].head(5)

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows; `df` is rebound to the de-duplicated frame.
df = df.drop_duplicates()

In [None]:
# Sanity check: list every row that still has a duplicate (keep=False marks
# all occurrences).
df[df.duplicated(keep=False)]


In [None]:
# Number of duplicated columns: transposing turns columns into rows so that
# duplicated() compares them.
df.T.duplicated().sum()

In [None]:
# Per-column flag telling whether the column duplicates another one
# (keep=False marks all occurrences).
df.T.duplicated(keep=False)

In [None]:
# Show every column, then preview the first ten rows of the current frame.
pd.set_option('display.max_columns', None)
df.head(10)

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Percentage of missing values per column; only columns with at least one
# missing value are displayed.
faltantes = df.isna().mean().mul(100)
faltantes.loc[faltantes.gt(0)]

In [None]:
# Number of rows containing at least one missing value.
df.isnull().any(axis=1).sum()

In [None]:
# Rows with exactly n missing values.
n = 2
nulos_por_fila = df.isnull().sum(axis=1)
df.loc[nulos_por_fila.eq(n)]

In [None]:
# Rows with exactly n missing values.
n = 3
nulos_por_fila = df.isnull().sum(axis=1)
df.loc[nulos_por_fila.eq(n)]

In [None]:
# Rows with exactly n missing values.
n = 4
nulos_por_fila = df.isnull().sum(axis=1)
df.loc[nulos_por_fila.eq(n)]

In [None]:
# Rows with exactly n missing values.
n = 5
nulos_por_fila = df.isnull().sum(axis=1)
df.loc[nulos_por_fila.eq(n)]

In [None]:
# Names of the columns stored as object or category dtype.
tipos_categoricos = ['object', 'category']
categoricas = df.select_dtypes(include=tipos_categoricos).columns
print(categoricas)

In [None]:
# Feature matrix (the eleven listed columns) and target (`quality`).
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
X = df[feature_columns]
y = df['quality']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# (rows, columns) of the full frame (not of the train/test partitions).
df.shape

In [None]:
# Percentage of missing values per column; only columns with at least one
# missing value are displayed.
faltantes = df.isna().mean().mul(100)
faltantes.loc[faltantes.gt(0)]

In [None]:
# Visual overview of missing values across the frame.
msno.matrix(df)

In [None]:
# Summary statistics of the training features, one row per feature.
X_train.describe().T

In [None]:
# Fit an IsolationForest on the training features and store its per-row label
# as `outlier_flag` on a copy of them (X_train itself is left untouched).
iso = IsolationForest(random_state=123)
df_num = X_train.assign(outlier_flag=iso.fit_predict(X_train))

In [None]:
# How many training rows received each IsolationForest label.
df_num['outlier_flag'].value_counts()

Método 1: Permutation importance del KNN (sobre el conjunto de entrenamiento)

In [None]:
# Train a distance-weighted KNN regressor (k=5) behind a StandardScaler.
knn_regressor = KNeighborsRegressor(
    weights="distance",
    n_neighbors=5,
    n_jobs=-1,
)
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", knn_regressor),
])

model.fit(X_train, y_train)

In [None]:
# Permutation importance of the fitted pipeline, measured on the training data
# (no `scoring` is given, so the estimator's default scorer is used).
r = permutation_importance(
    model, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1
)

# Mean score drop per feature over the 10 repeats, scaled by 100.
importances = r.importances_mean * 100

# One row per feature, most important first.
df_knn_imp = pd.DataFrame(
    {"feature": X_train.columns, "importance": importances}
)
df_knn_imp = df_knn_imp.sort_values(by="importance", ascending=False)

In [None]:
# Display the training-set permutation importances.
df_knn_imp

Método 2: Permutation/Shuffle importance (sobre el conjunto de validación)

In [None]:
# Carve a validation set (20%) out of the training data.
X_train1, X_val, y_train1, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.2
)

# KNN model with scaling (mandatory).
knn_steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor(weights="distance", n_neighbors=5, n_jobs=-1)),
]
model_knn = Pipeline(knn_steps)

model_knn.fit(X_train1, y_train1)

In [None]:
# Permutation importance on the validation set: each feature is shuffled 10
# times and scored with negative MAPE.
perm_settings = {
    "scoring": "neg_mean_absolute_percentage_error",
    "n_repeats": 10,
    "random_state": 42,
    "n_jobs": -1,
}
perm = permutation_importance(model_knn, X_val, y_val, **perm_settings)

In [None]:
# Validation-set permutation importances, one row per feature, most important
# first. Feature labels are taken from X_val, the frame that was actually
# permuted, so labels and values cannot drift apart if the two frames ever
# stop sharing the same columns.
df_perm_imp = pd.DataFrame({
    "feature": X_val.columns,
    "perm_imp": perm.importances_mean * 100
}).sort_values("perm_imp", ascending=False)

df_perm_imp

Método 3: SHAP

In [None]:
# Re-create and fit the scaled KNN regressor on the inner training split.
knn_steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor(weights="distance", n_neighbors=5, n_jobs=-1)),
]
model_knn = Pipeline(knn_steps)
model_knn.fit(X_train1, y_train1)

# Fixed-seed subsamples: 50 training rows as SHAP background and 100
# validation rows to explain.
X_bg = X_train1.sample(50, random_state=42)     # background
X_val_sample = X_val.sample(100, random_state=42)

# KernelExplainer only needs a prediction callable plus the background data.
explainer = shap.KernelExplainer(
    lambda rows: model_knn.predict(rows),
    X_bg,
)

shap_vals = explainer.shap_values(X_val_sample)

In [None]:
# Mean absolute SHAP value per feature, then rescaled so the column sums to
# 100.
imp_shap = np.mean(np.abs(shap_vals), axis=0)
imp_shap_total = imp_shap.sum()
imp_shap_pct = imp_shap / imp_shap_total * 100

# One row per feature, most important first.
df_shap_imp = pd.DataFrame(
    {"feature": X_val.columns, "shap_imp": imp_shap_pct}
)
df_shap_imp = df_shap_imp.sort_values("shap_imp", ascending=False)

df_shap_imp

In [None]:
# Bar-style SHAP summary plot for the explained validation sample.
shap.summary_plot(shap_vals, X_val_sample, plot_type="bar")

Unión de los 3 métodos

In [None]:
# Give each method's score column a distinct name before joining.
df_knn_imp = df_knn_imp.rename(columns={"importance": "perm_train"})
df_perm_imp = df_perm_imp.rename(columns={"perm_imp": "perm_val"})

# Outer-join the three tables on the feature name, one method at a time.
df_importances = df_knn_imp.merge(df_perm_imp, on="feature", how="outer")
df_importances = df_importances.merge(df_shap_imp, on="feature", how="outer")

In [None]:
# Combine the three importance measures into a single ranking.
#
# The raw columns are not on a common scale (perm_train and perm_val are score
# drops under different scorers, shap_imp is already a percentage share), so a
# plain mean would be dominated by whichever column has the largest magnitude.
# Each column is therefore converted to its share (in %) of the column total
# before averaging. Negative permutation importances are clipped to zero
# first, since a negative value means "no measurable contribution".
score_columns = ["perm_train", "perm_val", "shap_imp"]
clipped_scores = df_importances[score_columns].clip(lower=0)

# A column whose total is zero cannot be normalised; turning the total into
# NaN makes that column drop out of the row mean instead of dividing by zero.
column_totals = clipped_scores.sum(axis=0)
column_totals = column_totals.where(column_totals != 0)

score_shares = clipped_scores.div(column_totals, axis=1) * 100

df_importances["mean_importance"] = score_shares.mean(axis=1)
df_importances = df_importances.sort_values(
    "mean_importance", ascending=False
)

In [None]:
# Display the combined importance table.
df_importances

In [None]:
# Keep the ten top-ranked rows and extract their feature names.
df_filt = df_importances.iloc[:10]
features = df_filt.loc[:, 'feature']
features

In [None]:
from statsmodels.tools.tools import add_constant

# Design matrix for the VIF computation: selected features plus a constant
# column.
X_vif = add_constant(X_train[features])
design_matrix = X_vif.values

# One VIF per column of the design matrix, labelled by column name.
vif_values = [
    variance_inflation_factor(design_matrix, position)
    for position in range(design_matrix.shape[1])
]
vif = pd.Series(vif_values, index=X_vif.columns)

# The constant's own VIF is not of interest.
vif = vif.drop("const")
vif

In [None]:
# Correlation heatmap of the selected features together with the target.
X_copy = X_train[features].copy()
# The target used throughout this notebook is `quality`; the previous 'price'
# label mislabelled the target row of the heatmap.
X_copy['quality'] = y_train
corr = X_copy.corr()
# Mask the upper triangle (diagonal included) so only the lower half is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(20, 15))
sns.heatmap(
    corr, mask=mask, cmap='coolwarm',
    annot=True, fmt=".2f", linewidths=.5,
    cbar_kws={"shrink": .8},
    annot_kws={"size": 8}
)
plt.title("Matriz de correlaciones (mitad inferior)", fontsize=16, pad=15)
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.yticks(fontsize=9)
plt.tight_layout(pad=3.0)
plt.show()

In [None]:
# Version 1: cross-validate using all candidate features.
knn_steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor(weights="distance", n_neighbors=5, n_jobs=-1)),
]
model_knn = Pipeline(knn_steps)

# 3-fold CV scored with negative MAPE.
cv_results1 = cross_val_score(
    model_knn,
    X_train[features],
    y_train,
    scoring="neg_mean_absolute_percentage_error",
    cv=3,
    n_jobs=-1,
)

# Flip the sign to report the mean MAPE as a positive number.
-cv_results1.mean()

In [None]:
# Mean cross-validated MAPE (sign flipped because the scorer is negated).
cv_results1.mean()*-1

In [None]:
# Same 3-neighbour brute-force KNN classifier under three preprocessing
# choices; each variant is fitted on the training split and predicts X_test.

# 1) Raw features, no scaling.
knn_no_scaling = KNeighborsClassifier(algorithm="brute", n_neighbors=3)
knn_no_scaling.fit(X_train, y_train)
yhat_no_scaling = knn_no_scaling.predict(X_test)

# 2) Standardised features.
pipe_standard = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(algorithm="brute", n_neighbors=3)),
])
pipe_standard.fit(X_train, y_train)
yhat_standard = pipe_standard.predict(X_test)

# 3) Min-max normalised features.
pipe_norm = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("knn", KNeighborsClassifier(algorithm="brute", n_neighbors=3)),
])
pipe_norm.fit(X_train, y_train)
yhat_norm = pipe_norm.predict(X_test)

In [None]:
# Test accuracy of each preprocessing variant.
for etiqueta, predicciones in (
    ("No scaling:", yhat_no_scaling),
    ("StandardScaler:", yhat_standard),
    ("MinMaxScaler:", yhat_norm),
):
    print(etiqueta, accuracy_score(y_test, predicciones))

In [None]:
# Confusion matrix and per-class metrics for the StandardScaler variant.
cm = confusion_matrix(y_test, yhat_standard)
print("Confusion Matrix:\n", cm)

informe = classification_report(y_test, yhat_standard)
print("\nClassification Report:", informe, sep="\n")

In [None]:
# Confusion matrix and classification report for each preprocessing variant.
evaluaciones = [
    ("=== KNN sin escalado ===", yhat_no_scaling),
    ("=== KNN StandardScaler ===", yhat_standard),
    ("=== KNN MinMaxScaler ===", yhat_norm),
]

for titulo, predicciones in evaluaciones:
    print(titulo)
    print(confusion_matrix(y_test, predicciones))
    print(classification_report(y_test, predicciones))

In [None]:
# Candidate neighbourhood sizes.
k_values = range(1, 21)
# Mean cross-validated accuracy for each k, in the same order as k_values.
accuracies = []

for k in k_values:
    # The scaler is part of the pipeline, so it is re-fitted on the training
    # portion of every CV fold and never sees the held-out fold.
    pipe_norm = Pipeline([
        ("scaler", MinMaxScaler()),
        ("knn", KNeighborsClassifier(
            n_neighbors=k,
            algorithm="brute"
        ))
    ])

    # Score k by cross-validation on the training split only. Picking k by its
    # accuracy on X_test would tune the model on the test set and make the
    # reported "best" accuracy optimistically biased.
    acc = cross_val_score(
        pipe_norm, X_train, y_train, cv=5, scoring="accuracy"
    ).mean()
    accuracies.append(acc)

In [None]:
# Pick the k with the highest accuracy; on ties the first (smallest) k wins.
best_k, best_acc = max(
    zip(k_values, accuracies),
    key=lambda par: par[1],
)

print(f"Mejor k: {best_k}")
print(f"Mejor accuracy: {best_acc:.4f}")