In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Read the four 'entrenar2_st_*' training CSVs. The numeric suffix of each
# file name presumably identifies a time window -- TODO confirm.
df1, df2, df3, df4 = (
    pd.read_csv(ruta)
    for ruta in (
        'entrenar2_st_1820.csv',
        'entrenar2_st_1921.csv',
        'entrenar2_st_2022.csv',
        'entrenar2_st_2123.csv',
    )
)
# Gathered in one list so later cells can loop over every file.
archivos_st = [df1, df2, df3, df4]

In [3]:
def buscar_parametros_grd_st(grd, archivos=None):
    """Fit ``grd`` on every dataset and report the mean hold-out RMSE.

    For each DataFrame the features are all columns except ``ref_hash`` and
    ``segundos``, and the target is ``segundos``. The rows are split 80/20
    with a fixed ``random_state`` so every candidate estimator is scored on
    the same partition; the estimator is fitted on the 80% part and its RMSE
    is measured on the 20% part.

    Parameters
    ----------
    grd : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        each dataset in turn, so after the call it holds the fit obtained on
        the last dataset.
    archivos : sequence of pandas.DataFrame, optional
        Datasets to evaluate on. When omitted, the notebook-level
        ``archivos_st`` list is used (the original behaviour).

    Returns
    -------
    float
        RMSE averaged over the datasets. The value is also printed.

    Raises
    ------
    ValueError
        If ``archivos`` is empty (there would be nothing to average).
    """
    if archivos is None:
        # Looked up at call time, exactly as the original body did.
        archivos = archivos_st
    if len(archivos) == 0:
        raise ValueError('No hay archivos para evaluar')

    suma_errores = 0
    for df in archivos:
        X, y = df.drop(['ref_hash', 'segundos'], axis=1), df['segundos']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=150)

        grd.fit(X_train, y_train)
        preds = grd.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        suma_errores += rmse

    error_promedio = suma_errores / len(archivos)
    print('RMSE promedio: %f' % (error_promedio))
    return (error_promedio)

In [None]:
# Reference only (not executed): a constructor signature with its default
# values, apparently pasted from the scikit-learn documentation. It is kept
# as a comment because the pasted text used typographic quotes (a SyntaxError
# in Python) and names GradientBoostingClassifier, which this notebook never
# imports -- the parameter searches in this notebook use
# GradientBoostingRegressor.
# NOTE(review): the parameter list and defaults depend on the installed
# scikit-learn version -- confirm before relying on them.
#
# GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
#                            n_estimators=100, subsample=1.0, criterion='friedman_mse',
#                            min_samples_split=2, min_samples_leaf=1,
#                            min_weight_fraction_leaf=0.0, max_depth=3,
#                            min_impurity_decrease=0.0, min_impurity_split=None,
#                            init=None, random_state=None, max_features=None,
#                            verbose=0, max_leaf_nodes=None, warm_start=False,
#                            presort='auto', validation_fraction=0.1,
#                            n_iter_no_change=None, tol=0.0001)

In [6]:
# Sweep n_estimators (every other hyperparameter left at its default) and
# keep the candidate with the lowest mean RMSE.
errores = []
for v in (5, 10, 15):
    print('n_estimators: %f'% (v))
    grd = GradientBoostingRegressor(n_estimators=v)
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores += [[error, v]]
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda par: par[0])
print('Error minimo %f con valor %f' %(error,v))

n_estimators: 5.000000
RMSE promedio: 92416.992743
n_estimators: 10.000000
RMSE promedio: 89674.462222
n_estimators: 15.000000
RMSE promedio: 88481.134365
Error minimo 88481.134365 con valor 15.000000


In [7]:
# Sweep max_features with n_estimators fixed at 15 and keep the candidate
# with the lowest mean RMSE.
errores = []
for v in (5, 10, 12):
    print('max_features: %f'% (v))
    grd = GradientBoostingRegressor(n_estimators=15, max_features=v)
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores += [[error, v]]
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda par: par[0])
print('Error minimo %f con valor %f' %(error,v))

max_features: 5.000000
RMSE promedio: 89139.458420
max_features: 10.000000
RMSE promedio: 88530.730908
max_features: 12.000000
RMSE promedio: 88477.671536
Error minimo 88477.671536 con valor 12.000000


In [8]:
# Sweep max_depth with n_estimators fixed at 15 and keep the candidate with
# the lowest mean RMSE.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt
# while evaluating max_depth=10, so the final summary line was never printed.
errores = []
for v in (2, 5, 8, 10, 12):
    print('max_depth: %f'% (v))
    grd = GradientBoostingRegressor(n_estimators=15, max_depth=v)
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores += [[error, v]]
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda par: par[0])
print('Error minimo %f con valor %f' %(error,v))

max_depth: 2.000000
RMSE promedio: 89354.150981
max_depth: 5.000000
RMSE promedio: 87676.750845
max_depth: 8.000000
RMSE promedio: 87326.362654
max_depth: 10.000000


KeyboardInterrupt: 

In [9]:
# Sweep learning_rate with n_estimators=15 and max_depth=5 and keep the
# candidate with the lowest mean RMSE.
errores = []
for v in [0.1, 0.2, 0.3, 0.4, 0.5]:
    grd = GradientBoostingRegressor(n_estimators=15, max_depth=5, learning_rate=v)
    # Label fixed: it was misspelled 'larning_rate'.
    print('learning_rate: %f'% (v))
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores.append([error, v])
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda x: x[0])
print('Error minimo %f con valor %f' %(error,v))

larning_rate: 0.100000
RMSE promedio: 87676.805376
larning_rate: 0.200000
RMSE promedio: 86913.208565
larning_rate: 0.300000
RMSE promedio: 86828.726305
larning_rate: 0.400000
RMSE promedio: 86848.368848
larning_rate: 0.500000
RMSE promedio: 86913.825871
Error minimo 86828.726305 con valor 0.300000


In [10]:
# Sweep min_samples_split with n_estimators=15, max_depth=3 and
# learning_rate=0.3, keeping the candidate with the lowest mean RMSE.
# NOTE(review): max_depth is 3 here, whereas the learning_rate sweep in this
# notebook used max_depth=5 -- confirm which depth is intended.
errores = []
for v in [2, 4, 6, 8]:
    grd = GradientBoostingRegressor(n_estimators=15, max_depth=3, learning_rate=0.3,
                                    min_samples_split=v)
    # Label fixed to match the parameter name (it read 'min_sample_split').
    print('min_samples_split: %f'% (v))
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores.append([error, v])
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda x: x[0])
print('Error minimo %f con valor %f' %(error,v))

min_sample_split: 2.000000
RMSE promedio: 87236.358448
min_sample_split: 4.000000
RMSE promedio: 87236.358448
min_sample_split: 6.000000
RMSE promedio: 87236.358448
min_sample_split: 8.000000
RMSE promedio: 87236.358448
Error minimo 87236.358448 con valor 2.000000


In [12]:
# Sweep min_samples_leaf with n_estimators=15, max_depth=3, learning_rate=0.3
# and min_samples_split=2, keeping the candidate with the lowest mean RMSE.
errores = []
for v in [1, 2, 3, 4]:
    grd = GradientBoostingRegressor(n_estimators=15, max_depth=3, learning_rate=0.3,
                                    min_samples_split=2, min_samples_leaf=v)
    # Label fixed to match the parameter name (it read 'min_sample_leaf').
    print('min_samples_leaf: %f'% (v))
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores.append([error, v])
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda x: x[0])
print('Error minimo %f con valor %f' %(error,v))

min_sample_leaf: 1.000000
RMSE promedio: 87236.358448
min_sample_leaf: 2.000000
RMSE promedio: 87236.358448
min_sample_leaf: 3.000000
RMSE promedio: 87236.358448
min_sample_leaf: 4.000000
RMSE promedio: 87236.358448
Error minimo 87236.358448 con valor 2.000000


In [13]:
# Try several random_state seeds on the configuration selected so far
# (n_estimators=15, max_depth=3, learning_rate=0.3, min_samples_split=2,
# min_samples_leaf=2) and report the seed with the lowest mean RMSE.
# NOTE(review): the recorded output shows the same RMSE (to six decimals)
# for every seed tried, so this sweep does not separate the candidates.
errores = []
for v in (10, 20, 50, 100, 150):
    print('random_state: %f'% (v))
    grd = GradientBoostingRegressor(
        n_estimators=15,
        max_depth=3,
        learning_rate=0.3,
        min_samples_split=2,
        min_samples_leaf=2,
        random_state=v,
    )
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores += [[error, v]]
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda par: par[0])
print('Error minimo %f con valor %f' %(error,v))

random_state: 10.000000
RMSE promedio: 87236.358448
random_state: 20.000000
RMSE promedio: 87236.358448
random_state: 50.000000
RMSE promedio: 87236.358448
random_state: 100.000000
RMSE promedio: 87236.358448
random_state: 150.000000
RMSE promedio: 87236.358448
Error minimo 87236.358448 con valor 20.000000


In [15]:
# Sweep max_leaf_nodes on top of n_estimators=15, max_depth=3,
# learning_rate=0.3, min_samples_split=2 and min_samples_leaf=2, keeping the
# candidate with the lowest mean RMSE.
errores = []
for v in (2, 3, 5, 7, 9):
    print('max_leaf_nodes: %f'% (v))
    grd = GradientBoostingRegressor(
        n_estimators=15,
        max_depth=3,
        learning_rate=0.3,
        min_samples_split=2,
        min_samples_leaf=2,
        max_leaf_nodes=v,
    )
    error = buscar_parametros_grd_st(grd)
    # Each entry is [mean RMSE, candidate value].
    errores += [[error, v]]
# On ties min() returns the first entry it encounters.
error, v = min(errores, key=lambda par: par[0])
print('Error minimo %f con valor %f' %(error,v))

max_leaf_nodes: 2.000000
RMSE promedio: 88913.242013
max_leaf_nodes: 3.000000
RMSE promedio: 87845.553493
max_leaf_nodes: 5.000000
RMSE promedio: 87457.166491
max_leaf_nodes: 7.000000
RMSE promedio: 87233.162206
max_leaf_nodes: 9.000000
RMSE promedio: 87122.759632
Error minimo 87122.759632 con valor 9.000000


In [18]:
# Fit the selected configuration on the full contents of every 'st' file and
# summarise the feature importances.
grd = GradientBoostingRegressor(n_estimators=20, max_depth=5, learning_rate=0.3,
                                min_samples_split=2, min_samples_leaf=2,
                                max_leaf_nodes=10)
# warm_start is not enabled, so each fit() call rebuilds the ensemble from
# scratch. Reading feature_importances_ only after the loop (as the previous
# version did) therefore reflected the last file alone. The importances are
# now captured after every fit and averaged across files.
# Assumes every file yields the same feature columns in the same order --
# TODO confirm.
importancias_por_archivo = []
for df in archivos_st:
    X, y = df.drop(['ref_hash', 'segundos'], axis=1), df['segundos']
    grd.fit(X, y)
    importancias_por_archivo.append(grd.feature_importances_)
# One importance per feature, averaged over the files; grd itself still holds
# the fit from the last file, as before.
f = np.mean(importancias_por_archivo, axis=0)

In [28]:
# Read the four 'entrenar_sc_*' training CSVs. The numeric suffix of each
# file name presumably identifies a time window -- TODO confirm.
df5, df6, df7, df8 = (
    pd.read_csv(ruta)
    for ruta in (
        'entrenar_sc_1820.csv',
        'entrenar_sc_1921.csv',
        'entrenar_sc_2022.csv',
        'entrenar_sc_2123.csv',
    )
)
# Gathered in one list so later cells can loop over every file.
archivos_sc = [df5, df6, df7, df8]

In [29]:
# Score the selected configuration on the 'sc' files: for each file, hold out
# 20% of the rows (fixed random_state), fit on the rest, and measure the RMSE
# on the held-out rows; then average the RMSE over the files.
grd_sc = GradientBoostingRegressor(n_estimators=20, max_depth=5, learning_rate=0.3,
                                   min_samples_split=2, min_samples_leaf=2,
                                   max_leaf_nodes=10)
suma_errores = 0
for df in archivos_sc:
    # Features: every column except the 'ref_hash' and 'segundos' columns;
    # target: 'segundos'.
    X, y = df.drop(['ref_hash', 'segundos'], axis=1), df['segundos']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=150)

    grd_sc.fit(X_train, y_train)
    preds = grd_sc.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    suma_errores += rmse

# Divide by the number of files actually evaluated. The previous version
# divided by len(archivos_st), which only gave the right mean because both
# lists happen to hold four files.
error_promedio = suma_errores / len(archivos_sc)
print('RMSE promedio: %f' % (error_promedio))

RMSE promedio: 54172.062058
