In [2]:
import pandas as pd
from pycaret.classification import *
import pycaret.regression as rc
from pandas.errors import DataError
from pycaret.regression import *

In [3]:
from pydantic import AnyUrl

# Input URL -> string form that AnyUrl is expected to render.
expected_by_input = {
    'https://google.com': 'https://google.com/',
    'https://google.com/': 'https://google.com/',
    'https://google.com/api': 'https://google.com/api',
    'https://google.com/api/': 'https://google.com/api/',
}

for raw_url, rendered in expected_by_input.items():
    assert str(AnyUrl(url=raw_url)) == rendered

In [4]:
# Import pydantic's factory under an alias. A plain `from pydantic import create_model`
# would rebind the bare name `create_model`, which the modelling cells further down
# call expecting PyCaret's function from the star import.
from pydantic import PydanticUserError
from pydantic import create_model as pydantic_create_model

# A 3-tuple field definition is expected to be rejected with this error code.
try:
    pydantic_create_model('FooModel', foo=(str, 'default value', 'more'))
except PydanticUserError as exc_info:
    assert exc_info.code == 'create-model-field-definitions'


In [5]:
# Load the video-game sales spreadsheet (path is relative to the working directory).
# NOTE(review): the name `data_traffic` does not match the content (video-game sales);
# consider renaming it notebook-wide.
data_traffic = pd.read_excel('videojuegos.xlsx')

In [6]:
# Confirm that read_excel returned a DataFrame.
type(data_traffic)

pandas.core.frame.DataFrame

In [7]:
# Preview the first rows to eyeball the columns and values.
data_traffic.head()

Unnamed: 0,nombre,plataforma,anio,genero,editorial,ventasNA,ventasEU,ventasJP,ventasOtros,ventas_global
0,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [8]:
# Persist a CSV snapshot of the loaded sheet under ../data/raw, without the index column.
data_traffic.to_csv('../data/raw/data_traffic_v1.csv',index=False)

In [9]:
# 70/30 split: draw the training rows at random with a fixed seed,
# then keep every row that was not drawn as the test set.
num_filas_train = int(0.7 * len(data_traffic))

data_train = data_traffic.sample(n=num_filas_train, random_state=2023)
data_test = data_traffic.drop(index=data_train.index)

In [10]:
# Report how many rows ended up in each partition.
print(f"Filas en Train:{len(data_train)}")
print(f"Filas en Test:{len(data_test)}")

Filas en Train:11428
Filas en Test:4898


In [11]:
# Initialise the PyCaret experiment on the training partition.
# The target is the global sales column; min-max normalisation and feature
# transformation are switched on, and session_id reuses the split's seed value.
dataset = setup(
    data=data_train,
    target='ventas_global',
    session_id=2023,
    normalize=True,
    normalize_method='minmax',
    transformation=True,
)

Unnamed: 0,Description,Value
0,Session id,2023
1,Target,ventas_global
2,Target type,Regression
3,Original data shape,"(11428, 10)"
4,Transformed data shape,"(11428, 21)"
5,Transformed train set shape,"(7999, 21)"
6,Transformed test set shape,"(3429, 21)"
7,Numeric features,5
8,Categorical features,4
9,Rows with missing values,0.2%


### Entrenamiento de modelos ganadores

In [12]:
# Compare the candidate regressors, ranked by RMSE, and keep the top one.
best = compare_models(sort='RMSE')


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.081,0.1053,0.3082,0.9423,0.0655,0.2591,0.496
gbr,Gradient Boosting Regressor,0.1112,0.1206,0.3221,0.937,0.0761,1.1746,0.966
rf,Random Forest Regressor,0.1228,0.2267,0.4597,0.8721,0.1014,0.4115,2.292
dt,Decision Tree Regressor,0.1357,0.3493,0.5511,0.8138,0.1192,0.344,0.285
et,Extra Trees Regressor,0.1327,0.4172,0.6147,0.777,0.1036,0.4602,1.702
knn,K Neighbors Regressor,0.199,0.7805,0.8542,0.5637,0.153,0.4788,0.357
ada,AdaBoost Regressor,0.7992,0.8691,0.9281,0.4329,0.5226,13.8146,0.703
lar,Least Angle Regression,0.4126,1.0356,0.9872,0.4226,0.2519,4.0057,0.305
br,Bayesian Ridge,0.4084,1.0362,0.9872,0.4228,0.25,3.811,0.296
ridge,Ridge Regression,0.4118,1.0357,0.9872,0.4226,0.2515,3.9682,0.281


In [13]:
# Show the selected estimator and its settings.
print(best)

LGBMRegressor(n_jobs=-1, random_state=2023)


In [14]:
# Tune the selected model against RMSE, the same metric compare_models ranked by,
# so that selection and tuning do not optimise different objectives.
turned_best = tune_model(estimator=best, optimize='RMSE')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1026,0.1161,0.3408,0.9348,0.0789,0.5755
1,0.0842,0.0389,0.1973,0.9515,0.0691,0.6525
2,0.1113,0.3183,0.5641,0.889,0.0757,0.6821
3,0.0879,0.072,0.2682,0.9597,0.0662,0.59
4,0.0937,0.1188,0.3447,0.9477,0.0627,0.5626
5,0.0909,0.1801,0.4244,0.9154,0.0591,0.4503
6,0.0841,0.1676,0.4094,0.871,0.0611,0.5805
7,0.083,0.0541,0.2326,0.9208,0.071,0.849
8,0.1024,0.0952,0.3086,0.9394,0.0706,0.7897
9,0.0828,0.066,0.2569,0.9673,0.0582,0.5617


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [28]:
# Finalize the selected model and persist it as model_v1.
# finalize_model has to be *called* on an estimator: passing the bare function
# would store the function object in the saved pipeline instead of a trained model.
save_model(model=finalize_model(turned_best), model_name='../models/model_v1')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['anio', 'ventasNA', 'ventasEU',
                                              'ventasJP', 'ventasOtros'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['nombre', 'plataforma', 'genero',
                                              'editorial'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('onehot_encoding',
                  Tra...
                  TransformerWrapper(include=['nombre', 'plataforma',
                                              'editorial'],
                                     transformer=TargetEncoder(cols=['nombre',
                                                                     'plataforma',
                                                                     'edit

### Crear segundo modelo utilizando GBR

In [36]:
# Create the model. Called through the `rc` alias so the name cannot resolve to
# pydantic's same-named `create_model` factory.
gbra = rc.create_model('gbr')

# Tune against RMSE, the metric used to rank models in compare_models.
modelo2 = tune_model(estimator=gbra, optimize='RMSE')

# Save the model
save_model(model=modelo2, model_name='../models/model_v2')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1247,0.1016,0.3188,0.9429,0.0849,1.3775
1,0.093,0.0344,0.1854,0.9572,0.0667,0.9812
2,0.1437,0.4279,0.6541,0.8508,0.0895,1.1587
3,0.1059,0.0592,0.2434,0.9668,0.0742,1.1005
4,0.1092,0.1507,0.3882,0.9337,0.0665,0.8702
5,0.1086,0.1169,0.3419,0.9451,0.0733,1.1315
6,0.1046,0.1236,0.3516,0.9049,0.0758,1.2905
7,0.0995,0.0266,0.1632,0.961,0.0792,1.4009
8,0.1084,0.0783,0.2798,0.9502,0.073,1.0973
9,0.1142,0.087,0.295,0.9569,0.0776,1.3379


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1193,0.1131,0.3363,0.9365,0.1008,0.7552
1,0.1038,0.0865,0.2941,0.8923,0.0955,0.5788
2,0.1299,0.3349,0.5787,0.8832,0.0966,0.4429
3,0.1109,0.0967,0.311,0.9458,0.0899,0.6769
4,0.1114,0.1025,0.3201,0.9549,0.0926,0.6064
5,0.0961,0.161,0.4012,0.9244,0.0715,0.5377
6,0.0974,0.1423,0.3772,0.8905,0.0793,0.65
7,0.0904,0.0548,0.2341,0.9198,0.0877,0.6218
8,0.091,0.0741,0.2723,0.9529,0.0728,0.5739
9,0.12,0.099,0.3147,0.9509,0.1005,0.9658


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['anio', 'ventasNA', 'ventasEU',
                                              'ventasJP', 'ventasOtros'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['nombre', 'plataforma', 'genero',
                                              'editorial'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('onehot_encoding',
                  Tra...
                  TransformerWrapper(include=['nombre', 'plataforma',
                                              'editorial'],
                                     transformer=TargetEncoder(cols=['nombre',
                                                                     'plataforma',
                                                                     'edit

### Generar el tercer modelo utilizando DT

In [None]:
# Create the model. Called through the `rc` alias so the name cannot resolve to
# pydantic's same-named `create_model` factory.
dt = rc.create_model('dt')

# Tune against RMSE, the metric used to rank models in compare_models.
modelo3 = tune_model(estimator=dt, optimize='RMSE')

# Save the model
save_model(model=modelo3, model_name='../models/model_v3')

### Predicciones

In [17]:
# Reload the pipeline that was persisted as model_v1.
model = load_model('../models/model_v1')

Transformation Pipeline and Model Successfully Loaded


In [42]:
# Display the loaded pipeline.
model

In [None]:
# Run the loaded pipeline on the held-out rows (data_test) and display the result.
predicciones = predict_model(model, data = data_test)
predicciones

In [44]:
# Baseline gradient boosting regressor. Called through the `rc` alias so the name
# cannot resolve to pydantic's same-named `create_model` factory.
gbr = rc.create_model('gbr')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1247,0.1016,0.3188,0.9429,0.0849,1.3775
1,0.093,0.0344,0.1854,0.9572,0.0667,0.9812
2,0.1437,0.4279,0.6541,0.8508,0.0895,1.1587
3,0.1059,0.0592,0.2434,0.9668,0.0742,1.1005
4,0.1092,0.1507,0.3882,0.9337,0.0665,0.8702
5,0.1086,0.1169,0.3419,0.9451,0.0733,1.1315
6,0.1046,0.1236,0.3516,0.9049,0.0758,1.2905
7,0.0995,0.0266,0.1632,0.961,0.0792,1.4009
8,0.1084,0.0783,0.2798,0.9502,0.073,1.0973
9,0.1142,0.087,0.295,0.9569,0.0776,1.3379


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [45]:
# Show the baseline GBR estimator and its settings.
print(gbr)

GradientBoostingRegressor(random_state=2023)


In [46]:
# Hand-picked search space for the GBR: two values per hyperparameter.
params = dict(
    n_estimators=[50, 100],
    learning_rate=[0.01, 0.1],
)

# Tune the baseline GBR over that grid, scoring by RMSE with 10-fold CV.
tuned_gbr_model = tune_model(
    estimator=gbr,
    custom_grid=params,
    n_iter=10,
    fold=10,
    optimize='RMSE',
)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1247,0.1016,0.3188,0.9429,0.0849,1.3775
1,0.093,0.0344,0.1854,0.9572,0.0667,0.9812
2,0.1437,0.4279,0.6541,0.8508,0.0895,1.1587
3,0.1059,0.0592,0.2434,0.9668,0.0742,1.1005
4,0.1092,0.1507,0.3882,0.9337,0.0665,0.8702
5,0.1086,0.1169,0.3419,0.9451,0.0733,1.1315
6,0.1046,0.1236,0.3516,0.9049,0.0758,1.2905
7,0.0995,0.0266,0.1632,0.961,0.0792,1.4009
8,0.1084,0.0783,0.2798,0.9502,0.073,1.0973
9,0.1142,0.087,0.295,0.9569,0.0776,1.3379


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
