## Ejercicio de clasificación #0503

### 1. Clasificación con KNN:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, preprocessing
warnings.filterwarnings(action='ignore')                  # Desactiva las advertencias.
%matplotlib inline

#### 1.1. Leer los datos:

Los datos y la explicación se pueden encontrar [aquí](https://www.kaggle.com/c/titanic/data) (requiere registrarse).

In [None]:
# Change into the directory that contains the data file.
# NOTE: os.chdir changes the working directory of the whole process, so every
# relative path used after this cell is resolved inside that directory.
os.chdir(r'Data')                # Replace with the appropriate path.

In [None]:
# Load the Titanic table; the header row is inferred (pandas' default behaviour).
df = pd.read_csv('data_titanic.csv')

In [None]:
# Display the (rows, columns) dimensions of the loaded table.
df.shape

In [None]:
# Preview the first three rows.
df.head(3)

#### 1.2. Procesamiento de valores faltantes: 

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Fill in the missing values of the Age column.
# A row whose Name contains 'Mr' (this also matches 'Mrs') is treated as an
# adult and gets 30; any other row with a missing age is probably a child and
# gets 10.  Ages that are already present are left untouched.
# The work is done column-wise, so it does not depend on the index labels
# being 0..n-1 as element-by-element access with df.Age[i] would.
has_adult_title = df['Name'].str.contains('Mr', regex=False, na=False)     # na=False: a missing name counts as "no title".
fallback_age = pd.Series(np.where(has_adult_title, 30, 10), index=df.index)
df['Age'] = df['Age'].fillna(fallback_age)                                 # fillna aligns on the index, row by row.

In [None]:
# Remove the columns that are not used in the rest of the notebook.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
df = df.drop(unused_columns, axis=1)
df.head(3)

In [None]:
# Discard every row that still contains a missing value, then show the new size.
df = df.dropna()
df.shape

In [None]:
# Display the table dimensions again (same check as at the end of the previous cell).
df.shape

#### 1.3. Análisis exploratorio de datos:

In [None]:
# Frequency chart of Survived (number of rows per class).
# The column is passed with the keyword `x`: since seaborn 0.12 the only
# positional parameter is `data`, so a positional column name would clash
# with the `data=df` keyword and raise a TypeError.
sns.countplot(x='Survived', data=df)
plt.show()

In [None]:
# Survival rate by age group.
# Age is cut into 4 quantile-based intervals; the result is stored as the new
# column AgeCategory, which stays in df after this cell.
age_bins = pd.qcut(df['Age'], q=4)
df['AgeCategory'] = age_bins
sns.barplot(data=df, x='AgeCategory', y='Survived', ci=None)
plt.show()

In [None]:
# Survival rate for each value of SibSp (bar height = mean of Survived, no error bars).
sns.barplot(data=df, x='SibSp', y='Survived', ci=None)
plt.show()

In [None]:
# Survival rate for each value of Parch (bar height = mean of Survived, no error bars).
sns.barplot(data=df, x='Parch', y='Survived', ci=None)
plt.show()

In [None]:
# Survival rate for each value of Pclass (bar height = mean of Survived, no error bars).
sns.barplot(data=df, x='Pclass', y='Survived', ci=None)
plt.show()

In [None]:
# Survival rate for each value of Embarked (bar height = mean of Survived, no error bars).
sns.barplot(data=df, x='Embarked', y='Survived', ci=None)
plt.show()

In [None]:
# Survival rate for each value of Sex (bar height = mean of Survived, no error bars).
sns.barplot(data=df, x='Sex', y='Survived', ci=None)
plt.show()

#### 1.4. Ingeniería de características:

In [None]:
# Convert the categorical variables into dummy variables and drop the originals.
# Each entry is (column to encode, prefix of the dummy columns, columns to drop).
# The entries are processed in order; at every step the new dummy columns are
# placed in front of the remaining columns.
encoding_plan = [
    ('AgeCategory', 'Age', ['Age', 'AgeCategory']),
    ('Pclass', 'Pclass', ['Pclass']),
    ('SibSp', 'SibSp', ['SibSp']),
    ('Parch', 'Parch', ['Parch']),
    ('Sex', 'Sex', ['Sex']),
    ('Embarked', 'Embarked', ['Embarked']),
]
for source_column, dummy_prefix, columns_to_drop in encoding_plan:
    dummies = pd.get_dummies(df[source_column], drop_first=True, prefix=dummy_prefix)
    df = dummies.join(df.drop(columns=columns_to_drop))
df.head(5)

In [None]:
# Save the processed table to an external CSV file, without the index column.
# The path is relative, so the file is written to the current working directory.
df.to_csv('data_titanic_2.csv',index=False)

#### 1.5. Entrenamiento y test con KNN:

In [None]:
# Separate the target (Survived) from the explanatory variables.
Y = df['Survived']
X = df.drop('Survived', axis=1)

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)

In [None]:
# Print the dimensions of each piece of the split, in the order
# X_train, X_test, Y_train, Y_test.
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

In [None]:
# KNN with n_neighbors = 5: fit on the training split, evaluate on the test split.
knn5 = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)    # fit() returns the fitted estimator.
Y_pred = knn5.predict(X_test)
conf_matrix = metrics.confusion_matrix(Y_test, Y_pred)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
print(conf_matrix)
print("------------------------")
print("Accuracy : " + str(np.round(accuracy, 3)))

In [None]:
# KNN with n_neighbors = 100: fit on the training split, evaluate on the test split.
knn100 = KNeighborsClassifier(n_neighbors=100).fit(X_train, Y_train)    # fit() returns the fitted estimator.
Y_pred = knn100.predict(X_test)
conf_matrix = metrics.confusion_matrix(Y_test, Y_pred)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
print(conf_matrix)
print("------------------------")
print("Accuracy : " + str(np.round(accuracy, 3)))

#### 1.6. Compensación sesgo-varianza de KNN en función de *k*:

In [None]:
# Test-set accuracy of KNN for every k from 1 to 99.
def _test_accuracy(k):
    """Fit KNN with k neighbours on the training split and return its test accuracy."""
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    return metrics.accuracy_score(Y_test, model.predict(X_test))


k_grid = range(1, 100)
accs = [_test_accuracy(k) for k in k_grid]

In [None]:
# Visualize the accuracy obtained for each k.
plt.scatter(k_grid, accs, s=10, c='red', marker='o', alpha=0.6)
plt.title('Accuracy vs k')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.show()

#### 1.7. Optimización de hiperparámetros de KNN:

In [None]:
# Parameter grid: candidate values of k from 1 to 50.
k_grid = np.arange(1, 51)
parameters = dict(n_neighbors=k_grid)

In [None]:
# Optimize k with a 10-fold cross-validated grid search on the training split.
gridCV = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    cv=10,
    n_jobs=-1,        # -1 means "use all the CPU cores".
)
gridCV.fit(X_train, Y_train)
best_k = gridCV.best_params_['n_neighbors']
print("Best k : " + str(best_k))

In [None]:
# Evaluate on the test split using the best k found by the grid search.
KNN_best = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, Y_train)    # fit() returns the fitted estimator.
Y_pred = KNN_best.predict(X_test)
best_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print("Best Accuracy : " + str(np.round(best_accuracy, 3)))