## Pair Programming - Regresión logística 6

### Random Forest

---

In [1]:
# Tratamiento de datos
import numpy as np
import pandas as pd
from tqdm import tqdm

# Gráficos
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score, roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

# Configuración warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the travel dataset (the file name suggests it is already balanced and de-duplicated);
# the first CSV column is used as the row index.
df = pd.read_csv(
    '../archivos/travel_balanceado_sin_dupl.csv',
    index_col=0,
)
df.head()

Unnamed: 0,product_name,net_sales,commision_(in_value),continent,cat_age,cat_duration,net_sales_stand,commision_(in_value)_stand,agency_ADM,agency_ART,...,agency_TST,agency_TTW,agency_type_Airlines,agency_type_Travel Agency,distribution_channel_Offline,distribution_channel_Online,gender_F,gender_M,gender_PNS,claim
0,12,108.9,65.34,4,5,8,1.310481,2.643544,0,0,...,0,0,0,1,0,1,0,0,1,0
1,7,56.5,14.13,6,4,8,0.27179,0.166822,0,0,...,0,0,1,0,0,1,1,0,0,0
2,14,24.0,0.0,6,5,4,-0.372436,-0.516562,0,0,...,0,0,0,1,0,1,0,0,1,0
3,14,25.0,0.0,6,6,6,-0.352614,-0.516562,0,0,...,0,0,0,1,0,1,0,0,1,0
4,12,0.0,23.76,6,6,4,-0.848173,0.632567,0,0,...,0,0,0,1,0,1,0,0,1,0


In [3]:
# Drop the unstandardised predictors; their "_stand" counterparts remain in the frame.
df_stand_bal = df.drop(columns=['net_sales', 'commision_(in_value)'])

**Objetivos**:
### 1. Ajustad un modelo de Random Forest a nuestros datos.

In [4]:
# Separate the response ("claim") from the predictors.
y = df_stand_bal["claim"]
X = df_stand_bal.drop(columns="claim")

In [5]:
# Hold out 20% of the rows as a test set so the fitted model can be evaluated
# on data it has not seen; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hyperparameter grid handed to the grid search: each key is an argument of the
# random forest estimator and each list holds the candidate values to try.
param = dict(
    max_depth=[4, 6, 8],
    max_features=[1, 2, 3, 4],
    min_samples_split=[30, 50, 100],
    min_samples_leaf=[30, 50, 100],
)

In [7]:
# Exhaustive grid search over `param` with 10-fold cross-validation on a seeded
# random forest (the seed makes the search reproducible).
# `verbose` must be a non-negative integer: scikit-learn >= 1.2 validates the
# argument and rejects negative values when fitting. 0 keeps the search silent,
# which is how older versions treated the previous value of -1.
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param,
    cv=10,
    verbose=0,
)
        

In [8]:
# Run the grid search on the training split only; the test split stays untouched
# until evaluation. NOTE(review): with the grid and cv defined above this is a
# large number of fits, so expect this cell to be slow.
gs_rf.fit(x_train, y_train)

In [9]:
# Retrieve the best forest found by the grid search and display it

bosque = gs_rf.best_estimator_
bosque

Prescindimos de sacar visualmente todos los árboles por rendimiento del ordenador.

In [10]:
# Predict on both splits: comparing train and test metrics afterwards is how
# over- or under-fitting is detected.
y_pred_train_rf = bosque.predict(x_train)
y_pred_test_rf = bosque.predict(x_test)

### 2. Calculad las métricas a nuestro nuevo modelo.

In [11]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    """Build a two-row summary of classification metrics for a model.

    Parameters
    ----------
    clases_reales_test, clases_predichas_test
        True and predicted labels for the test split.
    clases_reales_train, clases_predichas_train
        True and predicted labels for the train split.
    modelo
        Value stored in the "modelo" column of every row (a model name).

    Returns
    -------
    pandas.DataFrame
        Row 0 holds the test metrics and row 1 the train metrics. Columns, in
        order: "accuracy", "precision", "recall", "f1", "kapppa", "set", "modelo".
    """
    # Column name -> scoring function. The "kapppa" spelling is kept on purpose:
    # the previously saved results shown in this notebook use the same column name.
    puntuadores = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "kapppa": cohen_kappa_score,
    }

    # Test first, then train, so the row order is test -> train.
    particiones = (
        ("test", clases_reales_test, clases_predichas_test),
        ("train", clases_reales_train, clases_predichas_train),
    )

    filas = []
    for etiqueta, reales, predichas in particiones:
        fila = {nombre: puntuar(reales, predichas) for nombre, puntuar in puntuadores.items()}
        fila["set"] = etiqueta
        filas.append(fila)

    resumen = pd.DataFrame(filas)
    resumen["modelo"] = modelo
    return resumen

In [12]:
# Compute the metrics on both splits to check for overfitting or underfitting,
# so the tree depth can be adjusted according to these results.
dt_results = metricas(
    clases_reales_test=y_test,
    clases_predichas_test=y_pred_test_rf,
    clases_reales_train=y_train,
    clases_predichas_train=y_pred_train_rf,
    modelo="Random Forest",
)
dt_results

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.774086,0.759477,0.692491,0.724439,0.533703,test,Random Forest
1,0.779631,0.7708,0.700809,0.73414,0.546694,train,Random Forest


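Vemos en las métricas que el kappa es moderado (0,53 en test y 0,55 en train) y que los valores de train y test son muy similares en todas las métricas, por lo que no apreciamos overfitting.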

### 3. Comparad las métricas con los modelos hechos hasta ahora. ¿Cuál es mejor?

In [13]:
# Load the metrics saved from the earlier models so they can be compared with
# the random forest; the first CSV column is used as the row index.
resultados_anteriores = pd.read_csv("../archivos/resultados_travel_log+DC.csv", index_col=0)

In [14]:
# Stack the random forest metrics under the earlier results (row-wise concat).
# Each model keeps its own 0/1 row labels, so index values repeat in the result.
resultados_todo = pd.concat([resultados_anteriores, dt_results])
resultados_todo

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.730386,0.752636,0.553039,0.637582,0.43081,test,Regresión logistica Stan_Bal
1,0.718612,0.740689,0.541428,0.625574,0.408786,train,Regresión logistica Stan_Bal
0,0.985801,0.0,0.0,0.0,-0.00018,test,Regresión logistica Sin Stan-Bal
1,0.982635,0.0,0.0,0.0,-0.000135,train,Regresión logistica Sin Stan-Bal
0,0.73013,0.753257,0.551251,0.636614,0.430052,test,Regresión logistica Sin Stan Con Bal
1,0.718165,0.740032,0.540839,0.624947,0.407836,train,Regresión logistica Sin Stan Con Bal
0,0.985801,0.0,0.0,0.0,-0.00018,test,Regresión logistica Con Stan Sin Bal
1,0.982635,0.0,0.0,0.0,-0.000135,train,Regresión logistica Con Stan Sin Bal
0,0.890621,0.879248,0.863528,0.871317,0.776219,test,Decission Tree Stand_Bal I
1,0.977126,1.0,0.947314,0.972944,0.953158,train,Decission Tree Stand_Bal I
