### Algoritmo KNN

In [1]:
# Importaciones para manejo de datos y dataframes
import numpy as np
from numpy.random import seed
import pandas as pd

# Importaciones para manejo de archivos y llamadas al OS
import os as os
import warnings

# Importaciones para manejo de gráficos
import pylab as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor


from sklearn.model_selection import train_test_split


# No mostrar warnings de versiones anteriores
warnings.filterwarnings('ignore')

In [2]:
# Cargamos csv con los datos de train
df_train = pd.read_csv("../data_preprocess/train_preprocess.csv", sep=",", header=0, na_values=['?', '', 'NA'])
# Cargamos csv con los datos de test
df_test = pd.read_csv("../data_preprocess/test_preprocess.csv", sep=",", header=0, na_values=['?', '', 'NA'])

In [3]:
# Dimensión
print(f"Dimensión del dataset de train:\n {df_train.shape} \n")
print(f"Dimensión del dataset de test:\n {df_test.shape} \n")

# Descripción de las variables
print("Descripción de las variables:\n")
df_train.info()

Dimensión del dataset de train:
 (852, 41) 

Dimensión del dataset de test:
 (389, 40) 

Descripción de las variables:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852 entries, 0 to 851
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      852 non-null    int64  
 1   X1      852 non-null    float64
 2   X2      852 non-null    float64
 3   X3      852 non-null    float64
 4   X4      852 non-null    float64
 5   X5      852 non-null    float64
 6   X6      852 non-null    float64
 7   X7      852 non-null    float64
 8   X8      852 non-null    float64
 9   X9      852 non-null    float64
 10  X10     852 non-null    float64
 11  X11     852 non-null    float64
 12  X12     852 non-null    float64
 13  X13     852 non-null    float64
 14  X14     852 non-null    float64
 15  X15     852 non-null    float64
 16  X16     852 non-null    float64
 17  X17     852 non-null    float64
 18  X18     852 non-null    float6

In [4]:

print(df_train.groupby(['RATE'])['RATE'].count())
print("\n")
print(df_train.shape)

# Claramente la clase A desbalanceada. Haremos SMOTE seguramente.

RATE
A     68
B    361
C    224
D    199
Name: RATE, dtype: int64


(852, 41)


In [75]:
## HOLD OUT

#Por si acaso, vamos a cargar el conjunto de datos de cancer de mama de nuevo
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

X = df_train.drop(['ID','RATE'], inplace=False, axis=1)
y = df_train['RATE']

np.random.seed(1234)
#Vamos a dividir el conjunto de datos en un conjunto de entrenamiento y otro de test con un 20% de los datos para test y 80% para entrenamiento
#No queremos que los datos se desordenen respecto a su orden inicial, por lo que indicamos shuffle=false
#Ojo con el parámetro shuffle: https://scikit-learn.org/stable/modules/cross_validation.html#a-note-on-shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

knn = KNeighborsClassifier(n_neighbors=1)
#OJO: ahora entrenamos el modelo con los datos de entrenamiento
knn_model = knn.fit(X_train, y_train)

#Y ahora obtenemos las predicciones para los datos de test
knn_predictions = knn_model.predict(X_test)

#Vamos a mostrar las métricas de rendimiento sobre el conjuto de test Y_test
print("Informe completo\n",classification_report(y_test, knn_predictions))

Informe completo
               precision    recall  f1-score   support

           A       0.04      0.08      0.05        13
           B       0.45      0.35      0.40       114
           C       0.28      0.37      0.32        59
           D       0.33      0.30      0.32        70

    accuracy                           0.33       256
   macro avg       0.28      0.28      0.27       256
weighted avg       0.36      0.33      0.34       256



In [76]:
# Metemos un gridsearch
from sklearn.model_selection import GridSearchCV
param_grid_knn = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21],                                   
    'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],          
    'metric': ['minkowski', 'euclidean', 'manhattan', 'chebyshev']
}

X_1 = X[['X23','X36','X31','X14','X8','X26']].copy()

X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.25, random_state = 32, shuffle=False)

kNNModel_grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid_knn, verbose=1, cv=10, n_jobs=-1)
kNNModel_grid.fit(X_train, y_train)
print(kNNModel_grid.best_estimator_)

Fitting 10 folds for each of 176 candidates, totalling 1760 fits
KNeighborsClassifier(algorithm='ball_tree', metric='chebyshev', n_neighbors=21)


In [77]:
y_pred = kNNModel_grid.predict(X_test)
print(y_pred)

['C' 'B' 'B' 'C' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'C' 'D' 'D' 'D' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'C' 'B' 'C' 'B' 'B' 'B' 'D' 'B' 'B' 'D' 'C' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'D' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'D' 'C' 'A' 'D' 'C'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'D' 'B' 'B' 'B' 'B' 'B' 'D' 'B'
 'B' 'B' 'C' 'C' 'B' 'B' 'B' 'B' 'C' 'D' 'D' 'B' 'C' 'B' 'B' 'C' 'B' 'B'
 'C' 'B' 'B' 'B' 'D' 'D' 'C' 'C' 'D' 'B' 'D' 'D' 'C' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'D' 'C' 'C' 'B' 'B' 'D' 'B' 'C' 'D'
 'D' 'B' 'C' 'B' 'B' 'B' 'B' 'D' 'B' 'B' 'C' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'C' 'B' 'B' 'C' 'C' 'B' 'B' 'B' 'D' 'B' 'B' 'C' 'A' 'C' 'B' 'B' 'B'
 'B' 'D' 'B' 'D' 'B' 'B' 'C' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'C' 'C' 'C' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'D' 'B' 'D' 'B' 'B' 'D' 'B' 'B']


In [78]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred), ": is the confusion matrix")

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred), ": is the accuracy score")


[[ 0  8  2  1]
 [ 1 68 12 15]
 [ 1 33 10  7]
 [ 0 43  7  5]] : is the confusion matrix
0.38967136150234744 : is the accuracy score


In [55]:
# Selección de características con Random Forest seguro que va fetén

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X = X[['X23','X36','X31','X14','X8','X26']]

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

rf = RandomForestClassifier(random_state=0)

rf.fit(X_train,y_train)

In [56]:
feature_scores = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)

feature_scores

X23    0.033838
X36    0.033544
X31    0.033279
X14    0.032514
X8     0.031968
X26    0.031802
X21    0.029678
X38    0.029364
X22    0.029057
X20    0.028884
X39    0.028868
X29    0.028537
X19    0.028281
X2     0.028130
X9     0.027865
X28    0.027277
X15    0.027255
X37    0.027137
X4     0.026969
X3     0.026252
X13    0.026083
X35    0.026076
X18    0.025814
X17    0.025629
X16    0.025597
X32    0.025495
X6     0.025445
X7     0.024816
X27    0.024798
X34    0.024503
X33    0.023692
X5     0.023443
X12    0.023197
X1     0.023141
X10    0.022414
X11    0.021689
X24    0.013374
X25    0.003522
X30    0.000771
dtype: float64