# Modelos de ML

En esta sección se cargaran todos los datos que han sido transformados y limpiados para realizar feature engineering, optimización de hiperparametros y reducción de dimensionalidad.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Carga de los datos

In [2]:
df = pd.read_csv("clean_data.csv", sep = ",")

In [3]:
df.shape

(422890, 75)

## Selección de modelos

Para este analisis se realizaran las comparaciones de performance utilizando 3 modelos:

1. Gradient Boosting tree
2. Random Forest
3. Support vector machine
4. Neural networks

In [4]:
X = df.drop(["gasto_familiar"], axis = 1)
Y = df["gasto_familiar"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

## Modelo sencillo para analisis de features importances

In [6]:
from sklearn.linear_model import ElasticNetCV

ENReg = ElasticNetCV(random_state = 0)
ENReg.fit(X_train, y_train)

ElasticNetCV(random_state=0)

In [10]:
def mean_absolute_percentage_error2(y_pred, y_true):
    y_true = np.where(y_true == 0, 0.0000000001, y_true)
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [8]:
y_pred = ENReg.predict(X_test)
y_train_pred = ENReg.predict(X_train)

In [9]:
ENReg.score(X_test, y_test)

0.05025316848875472

In [10]:
ENReg.score(X_train, y_train)

0.049176480140556755

In [11]:
mape = mean_absolute_percentage_error2(y_pred, y_test)
mape

98.142220058781

In [9]:
mape = mean_absolute_percentage_error2(y_train_pred, y_train)
mape

NameError: name 'mean_absolute_percentage_error2' is not defined

In [13]:
dft = df.sample(frac=0.20)

In [14]:
dft.shape

(84578, 75)

In [15]:
Xt = dft.drop(["gasto_familiar"], axis = 1)
yt = dft["gasto_familiar"]

In [16]:
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt, yt, test_size=0.3)

In [17]:
ENRegt = ElasticNetCV(random_state = 0)
ENRegt.fit(Xt_train, yt_train)

ElasticNetCV(random_state=0)

In [18]:
ENRegt.score(Xt_train, yt_train)

0.05462604868592369

In [19]:
ENRegt.score(Xt_test, yt_test)

0.04640330702250495

In [20]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [21]:
reg = make_pipeline(StandardScaler(),
                    SGDRegressor(max_iter=1000, tol=1e-3))
reg.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

In [22]:
reg.score(X_test, y_test)

-3615057140.789904

In [None]:
X_train, X_test, y_train, y_test 

In [12]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=10)
neigh.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=10)

In [13]:
y_pre = neigh.predict(X_test)

  "X does not have valid feature names, but"


In [14]:
mape = mean_absolute_percentage_error2(y_pre, y_test)
mape

86.03363405034301

In [15]:
new_df = df.sample(frac = 0.05)
Xn = new_df.drop(["gasto_familiar"], axis = 1)
yn = new_df["gasto_familiar"]

Xn_train, Xn_test, yn_train, yn_test = train_test_split(Xn, yn, test_size = 0.3)

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import ElasticNetCV

ENRegr = ElasticNetCV(random_state = 12)
scores = cross_validate(ENRegr, Xn, yn,return_train_score = True, cv=5)

In [23]:
scores

{'fit_time': array([0.50767851, 0.5270021 , 0.52464652, 0.51926494, 0.4890933 ]),
 'score_time': array([0.00557613, 0.00551629, 0.00563979, 0.00584531, 0.00527501]),
 'test_score': array([-0.44488124,  0.03290953,  0.04232167,  0.04415034,  0.04205877]),
 'train_score': array([0.05763469, 0.0414832 , 0.03717012, 0.03704063, 0.03970907])}

In [24]:
print(f"Entrenamiento: {np.mean(scores['train_score'])}")
print(f"Test: {np.mean(scores['test_score'])}")

Entrenamiento: 0.04260754132214981
Test: -0.05668818671829219


In [25]:
knn_scores = cross_validate(KNeighborsRegressor(n_neighbors=2), Xn, yn,return_train_score = True, cv=5)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [26]:
print(f"Entrenamiento: {np.mean(knn_scores['train_score'])}")
print(f"Test: {np.mean(knn_scores['test_score'])}")

Entrenamiento: 0.5622645734741077
Test: -0.31884324454531254


In [27]:
knn2_scores = cross_validate(KNeighborsRegressor(n_neighbors=2), Xn, yn,return_train_score = True, cv=5)
print(f"Entrenamiento: {np.mean(knn2_scores['train_score'])}")
print(f"Test: {np.mean(knn2_scores['test_score'])}")

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


Entrenamiento: 0.5622645734741077
Test: -0.31884324454531254


In [28]:
knn20_scores = cross_validate(KNeighborsRegressor(n_neighbors=20), Xn, yn,return_train_score = True, cv=5)
print(f"Entrenamiento: {np.mean(knn20_scores['train_score'])}")
print(f"Test: {np.mean(knn20_scores['test_score'])}")

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


Entrenamiento: 0.1492600061578848
Test: 0.06098105173951176


In [31]:
from sklearn.ensemble import GradientBoostingRegressor

params = {'n_estimators': 800, 'max_depth': 16,
          'loss': 'huber','alpha':0.95}

GBReg = GradientBoostingRegressor(**params)

In [32]:
gbt_scores = cross_validate(GBReg, Xn, yn,return_train_score = True, cv=5)
print(f"Entrenamiento: {np.mean(gbt_scores['train_score'])}")
print(f"Test: {np.mean(gbt_scores['test_score'])}")

Entrenamiento: 0.9997914594835071
Test: -0.011250145260617895


In [34]:
from sklearn.svm import SVR

In [35]:
svm_scores = cross_validate(SVR(C=1.0, epsilon=0.2, verbose=True), Xn, yn,return_train_score = True, cv=5)
print(f"Entrenamiento: {np.mean(svm_scores['train_score'])}")
print(f"Test: {np.mean(svm_scores['test_score'])}")

[LibSVM].........
*
optimization finished, #iter = 9506
obj = -6823207855.603618, rho = -428924.074863
nSV = 16914, nBSV = 16914
[LibSVM].........
*
optimization finished, #iter = 9547
obj = -6854443426.901639, rho = -430720.438374
nSV = 16914, nBSV = 16914
[LibSVM].........
*
optimization finished, #iter = 9539
obj = -6857333962.521228, rho = -429160.926645
nSV = 16914, nBSV = 16914
[LibSVM].........
*
optimization finished, #iter = 9578
obj = -6845934783.080253, rho = -433726.887896
nSV = 16914, nBSV = 16914
[LibSVM].........
*
optimization finished, #iter = 9525
obj = -6787644724.178686, rho = -428094.925806
nSV = 16916, nBSV = 16916
Entrenamiento: -0.10972346820284593
Test: -0.1099292147149041
