## Ejercicio de codificación #0505

### 1. Clasificación con árboles:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import metrics, preprocessing
from sklearn.datasets import load_boston
warnings.filterwarnings(action='ignore')                  # Desactiva las advertencias.
%matplotlib inline

#### 1.1. Leer los datos:

In [None]:
# Move into the directory that contains the data file.
DATA_DIR = r'Data'                # Replace with the appropriate path
# A relative path stops resolving once we are already inside that directory,
# so skip the chdir in that case; this keeps the cell safe to re-run.
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(DATA_DIR)):
    os.chdir(DATA_DIR)

In [None]:
# Load the preprocessed data file from the current working directory.
# header='infer' lets pandas take the column names from the file itself.
df = pd.read_csv(
    filepath_or_buffer='data_titanic_2.csv',
    header='infer',
)

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

In [None]:
# Preview the first three rows of the DataFrame.
df.head(3)

In [None]:
# Separate the response column from the explanatory columns.
target_column = 'Survived'
Y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=1234)
X_train, X_test, Y_train, Y_test = split_parts

#### 1.2. Optimización de hiperparámetros del árbol:

In [None]:
# Candidate values for each hyperparameter explored by the grid search.
parameters = {
    'max_depth': np.arange(1, 21),            # 1 .. 20
    'min_samples_leaf': np.arange(10, 31),    # 10 .. 30
    'max_leaf_nodes': np.arange(2, 21),       # 2 .. 20
}
# Keep the individual grids available under their own names as well.
depth_grid = parameters['max_depth']
min_samples_leaf_grid = parameters['min_samples_leaf']
max_leaf_nodes_grid = parameters['max_leaf_nodes']

In [None]:
# Exhaustive 10-fold cross-validated search over the `parameters` grid, using all CPU cores.
# The estimator is a DecisionTreeClassifier instance. Its random_state is fixed because
# scikit-learn permutes the features at every split, so equally good splits could
# otherwise be resolved differently from one run to the next.
tree_estimator = DecisionTreeClassifier(random_state=1234)    # same seed as the train/test split
gridCV = GridSearchCV(tree_estimator, parameters, cv=10, n_jobs=-1)
gridCV.fit(X_train, Y_train)

# Keep the winning hyperparameters under the names used by the following cells.
best_depth = gridCV.best_params_['max_depth']
best_min_samples_leaf = gridCV.best_params_['min_samples_leaf']
best_max_leaf_nodes = gridCV.best_params_['max_leaf_nodes']

In [None]:
# Report the hyperparameter values selected by the grid search.
print(f"Tree best depth : {best_depth}")
print(f"Tree best min_samples_leaf : {best_min_samples_leaf}")
print(f"Tree best max_leaf_nodes : {best_max_leaf_nodes}")

In [None]:
# Fit a tree with the selected hyperparameters on the training split and report
# its accuracy on the held-out test split.
# random_state is fixed so the reported accuracy is reproducible across runs.
DTC_best = DecisionTreeClassifier(
    max_depth=best_depth,
    min_samples_leaf=best_min_samples_leaf,
    max_leaf_nodes=best_max_leaf_nodes,
    random_state=1234,                       # same seed as the train/test split
)
DTC_best.fit(X_train, Y_train)
Y_pred = DTC_best.predict(X_test)
test_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print("Tree best accuracy : " + str(np.round(test_accuracy, 3)))

### 2. Regresión con árboles:

#### 2.1. Leer los datos: 

In [None]:
# Load the dataset returned by scikit-learn's load_boston helper.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this call (and its import) only works on older versions - confirm the
# installed version or switch to another data source.
data = load_boston()

In [None]:
# Show the description of the data.
print(data['DESCR'])

In [None]:
# Explanatory variables and their column names.
X, header = data['data'], data['feature_names']

In [None]:
# Response variable: a numeric variable representing the house price.
# It is reshaped into a single-column 2-D array.
Y = data['target'].reshape(-1, 1)

In [None]:
# 70/30 train/test partition with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

#### 2.2. Convertir los datos en un DataFrame y explorar:

In [None]:
# Join the explanatory columns and the response column side by side and label
# them, naming the response column 'PRICE'.
column_names = list(header) + ['PRICE']
df = pd.DataFrame(np.append(X, Y, axis=1), columns=column_names)

In [None]:
# Preview the first five rows of the DataFrame.
df.head(5)

In [None]:
# Pairwise correlation matrix, rounded to two decimals for display.
corr_matrix = df.corr()
np.round(corr_matrix, 2)

#### 2.3. Optimización de hiperparámetros del árbol:

In [None]:
# Search ranges for the tree hyperparameters (the upper bound of arange is exclusive).
depth_grid, min_samples_leaf_grid, max_leaf_nodes_grid = (
    np.arange(1, 21),
    np.arange(10, 31),
    np.arange(2, 21),
)
# Map each estimator argument name to its grid of candidate values.
parameters = dict(
    max_depth=depth_grid,
    min_samples_leaf=min_samples_leaf_grid,
    max_leaf_nodes=max_leaf_nodes_grid,
)

In [None]:
# Exhaustive 10-fold cross-validated search over the `parameters` grid, using all CPU cores.
# The estimator is a DecisionTreeRegressor instance. Its random_state is fixed because
# scikit-learn permutes the features at every split, so equally good splits could
# otherwise be resolved differently from one run to the next.
tree_estimator = DecisionTreeRegressor(random_state=1234)     # same seed as the train/test split
gridCV = GridSearchCV(tree_estimator, parameters, cv=10, n_jobs=-1)
gridCV.fit(X_train, Y_train)

# Keep the winning hyperparameters under the names used by the following cells.
best_depth = gridCV.best_params_['max_depth']
best_min_samples_leaf = gridCV.best_params_['min_samples_leaf']
best_max_leaf_nodes = gridCV.best_params_['max_leaf_nodes']

In [None]:
# Report the hyperparameter values selected by the grid search.
print("Tree best depth : {}".format(best_depth))
print("Tree best min_samples_leaf : {}".format(best_min_samples_leaf))
print("Tree best max_leaf_nodes : {}".format(best_max_leaf_nodes))

In [None]:
# Fit a regression tree with the selected hyperparameters on the training split
# and report its root-mean-squared error on the held-out test split.
# random_state is fixed so the reported RMSE is reproducible across runs.
DTR_best = DecisionTreeRegressor(
    max_depth=best_depth,
    min_samples_leaf=best_min_samples_leaf,
    max_leaf_nodes=best_max_leaf_nodes,
    random_state=1234,                       # same seed as the train/test split
)
DTR_best.fit(X_train, Y_train)
Y_pred = DTR_best.predict(X_test)
test_rmse = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))
print("Tree best RMSE : " + str(np.round(test_rmse, 3)))

NOTA: Podemos comparar el resultado anterior con el obtenido mediante regresión lineal, cuyo RMSE fue 5.33.