# Modelos de ML

En esta sección se cargaran todos los datos que han sido transformados y limpiados para realizar feature engineering, optimización de hiperparametros y reducción de dimensionalidad.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Carga de los datos

In [2]:
df = pd.read_csv("dataClean_new.csv", sep = ",")

In [3]:
df.head()

Unnamed: 0,fecha_nacimiento,genero,ult_actual,ind_mora_vigente,cupo_total_tc,tenencia_tc,tiene_ctas_activas,ingreso_final,saldo_no_rot_mdo,cant_oblig_tot_sf,...,ocupacion_jubilado,ocupacion_otro,ocupacion_pensionado,ocupacion_profesional independiente,ocupacion_rentista de capital,ocupacion_socio empleado - socio,tipo_vivienda_familiar,tipo_vivienda_no informa,tipo_vivienda_propia,cuotas_a_pagar
0,19840630,1,20180526,0,0.0,0,1,1391032.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
1,19860727,1,20181120,0,0.0,0,1,2327500.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0.0
2,19910108,1,20190802,0,0.0,1,1,6519750.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0.0
3,19900903,1,20190906,0,0.0,0,1,1484205.0,2555000.0,0.0,...,0,0,0,0,0,0,0,1,0,0.0
4,19790623,0,20191211,0,0.0,0,1,4353334.0,211000.0,4.0,...,0,0,0,0,0,0,0,1,0,0.0


## Selección de modelos

Para este analisis se realizaran las comparaciones de performance utilizando 3 modelos:

1. Gradient Boosting tree
2. Random Forest
3. Support vector machine
4. Neural networks

In [4]:
X = df.drop(["gasto_familiar"], axis = 1)
Y = df["gasto_familiar"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=7)

## Modelo sencillo para analisis de features importances

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

params = {'n_estimators': 500, 'max_depth': 6, 'verbose': 2,
          'loss': 'huber','alpha':0.95}

GBReg = GradientBoostingRegressor(**params).fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1 155370963185.2104           13.50m
         2 150781478028.7314           13.37m
         3 147005871287.8783           13.25m
         4 143930203500.4450           13.26m
         5 141381866636.9082           13.19m
         6 139279041226.4739           13.14m


KeyboardInterrupt: 

In [None]:
GBReg.score(X_test, y_test)

In [None]:
y_pred = GBReg.predict(X_test)

In [11]:
def mean_absolute_percentage_error2(y_pred, y_true):
    y_true = np.where(y_true == 0, 0.0000000001, y_true)
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [10]:
mape = mean_absolute_percentage_error2(y_pred, y_test)*100
mape

10924.249763421274

In [7]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(hidden_layer_sizes = (240,120), 
                    random_state=1, 
                    max_iter=100,
                    verbose = True).fit(X_train, y_train)

Iteration 1, loss = 181593466017.66754150
Iteration 2, loss = 144378285835.92422485
Iteration 3, loss = 141545698067.29760742
Iteration 4, loss = 139272198748.98709106
Iteration 5, loss = 138062746848.79284668
Iteration 6, loss = 137669955182.90853882
Iteration 7, loss = 137078047557.33184814
Iteration 8, loss = 137036776110.90684509
Iteration 9, loss = 136849667262.85656738
Iteration 10, loss = 136870261328.87008667
Iteration 11, loss = 136741494803.17842102
Iteration 12, loss = 136688861308.05090332
Iteration 13, loss = 136648078964.46640015
Iteration 14, loss = 136608601943.10671997
Iteration 15, loss = 136589059458.59761047
Iteration 16, loss = 136571648607.83276367
Iteration 17, loss = 136532857585.88777161
Iteration 18, loss = 136531019664.60484314
Iteration 19, loss = 136480549918.61094666
Iteration 20, loss = 136500479249.21253967
Iteration 21, loss = 136482318568.03854370
Iteration 22, loss = 136491724624.72280884
Iteration 23, loss = 136501027502.71658325
Iteration 24, loss =

In [8]:
y_pred4 = regr.predict(X_test)

In [9]:
regr.score(X_test, y_test)

0.09670213228528213

In [12]:
mape_nn = mean_absolute_percentage_error2(y_pred4, y_test)
mape_nn

117.25899198226054