In [21]:
# Importar librerías

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
import pickle

In [22]:
df = pd.read_csv("../Desafio/datos-prediccion.csv", index_col="Unnamed: 0")

In [23]:
df

Unnamed: 0,PerfScoreID,EmpSatisfaction,Absences,Terminated
0,4,5,1,0
1,3,3,17,1
2,3,3,3,1
3,3,5,15,0
4,3,4,2,1
...,...,...,...,...
306,3,4,13,0
307,1,2,4,1
308,4,5,16,0
309,3,3,11,0


In [24]:
X = df[["EmpSatisfaction"]]
y = df["Terminated"]

In [25]:
X.shape

(311, 1)

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size= 0.20, random_state= 51)

In [27]:
# Construir pipeline y probar modelos

pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Primer modelo del pipeline
random_forest_params = {
    'regressor': [RandomForestRegressor()],
    'regressor__n_estimators': [30, 40, 45, 50, 100, 150, 200, 250,300,350,400],
    'regressor__max_depth': [4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20]
}     
                                                                          

#Segundo modelo del pipeline
linear_regressor_params = {
    'regressor': [LinearRegression()]
}

#Tercer modelo del pipeline
lasso_params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'regressor': [Lasso()],
    'regressor__alpha': np.logspace(-4, 3, 100).tolist(),
    'regressor__max_iter': [50000, 100000, 200000]
}

#Cuarto modelo del pipeline
ridge_params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'regressor': [Ridge()],
    'regressor__alpha': np.logspace(-4, 3, 100).tolist(),
    'regressor__max_iter': [50000, 100000, 200000]
} 

#Quinto modelo del pipeline
xgb_params = {
    'regressor': [XGBRegressor()],
    'regressor__n_estimators': [10,15,20, 25, 30, 35, 40, 45, 50, 60, 70, 75, 80, 90, 100, 110, 125],
    'regressor__max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    'regressor__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,] 
} 

#Sexto modelo del pipeline - regrasión polinomial
polinomial_params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'regressor': [PolynomialFeatures(),],
    'regressor__degree': [2, 4, 6, 8]
}

#Lista de todos los clasificadores con sus parámetros
search_space = [linear_regressor_params, xgb_params, polinomial_params, random_forest_params, lasso_params, ridge_params
]

clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  n_jobs= 6,
                  cv = 5, 
                  scoring= 'neg_mean_absolute_error')

#Se entrena el gridsearch
clf.fit(X_train, Y_train)

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_

In [28]:
clf.best_params_

{'regressor': RandomForestRegressor(),
 'regressor__max_depth': 8,
 'regressor__n_estimators': 40}

In [29]:
print(mean_absolute_error(Y_train, clf.predict(X_train)))

0.43848079066272777


In [30]:
print(mean_absolute_error(Y_test, clf.predict(X_test)))

0.43280555282568195
