#### Data import

In [1]:
import pandas as pd

# Load the cleaned train and test CSV files from ../data.
train_path = '../data/cleaned_train.csv'
test_path = '../data/cleaned_test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

#### PyCaret setup

- Remove outliers
- Ignore low variance features
- Apply a power transformation to make the features more Gaussian-like
- Normalize the data
- Select the best features by importance for the model
- Log the experiment run (`log_experiment=True`)

In [2]:
# Star import kept on purpose: the following cells call compare_models,
# evaluate_model, save_model and tune_model unqualified.
from pycaret.regression import *

# Configure the PyCaret regression experiment on the cleaned data.
# `session_id` pins the experiment's random seed; 1970 is the value shown in
# the setup summary recorded with this notebook, so a fresh run reuses that
# seed instead of drawing a new pseudo-random one every time.
s = setup(
    train_df,
    test_data=test_df,         # provided test split is used as the hold-out set
    target='count',
    session_id=1970,
    remove_outliers=True,
    ignore_low_variance=True,
    transformation=True,       # power transform of the features
    normalize=True,
    feature_selection=True,
    log_experiment=True,
)

Unnamed: 0,Description,Value
0,session_id,1970
1,Target,count
2,Original Data,"(8607, 14)"
3,Missing Values,0
4,Numeric Features,5
5,Categorical Features,8
6,Ordinal Features,0
7,High Cardinality Features,0
8,High Cardinality Method,
9,Transformed Train Set,"(8176, 46)"


#### Model comparison

In [3]:
# Cross-validate the available regressors and keep the top-ranked one.
# The leaderboard is ranked by R2 (PyCaret's default for regression), spelled
# out here so the selection criterion is visible to the reader.
best = compare_models(sort='R2')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,30.8267,2429.7518,46.0916,0.8943,0.439,0.4755,0.157
xgboost,Extreme Gradient Boosting,35.0417,3117.4985,51.9022,0.8615,0.5171,0.5471,0.595
rf,Random Forest Regressor,33.2972,2978.9198,51.4218,0.855,0.3906,0.4013,0.96
et,Extra Trees Regressor,34.1742,3181.7927,53.3152,0.8362,0.4162,0.446,0.969
gbr,Gradient Boosting Regressor,45.9657,4540.9911,63.6019,0.791,0.6616,0.9503,0.32
dt,Decision Tree Regressor,42.4719,4874.3324,66.3246,0.758,0.4927,0.4481,0.041
knn,K Neighbors Regressor,77.5839,13084.7536,109.4788,0.4013,0.9772,2.19,0.097
lasso,Lasso Regression,89.8915,15028.9826,118.4486,0.2339,1.1338,3.1043,0.022
en,Elastic Net,95.9315,16728.9775,123.6451,0.2283,1.2179,3.7217,0.02
omp,Orthogonal Matching Pursuit,91.4817,15654.3738,121.003,0.2006,1.1595,3.1561,0.022


#### Model evaluation

In [4]:
# Open PyCaret's interactive plot selector for the selected model
# (renders as a widget with a "Plot Type" toggle).
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

#### Model saving

In [5]:
# Persist the transformation pipeline together with the selected model.
# NOTE(review): the file name assumes `best` is the LightGBM model, which matches
# the recorded leaderboard — confirm after any re-run.
save_model(best, 'lgbm_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='count',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy='...
                  LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                colsample_bytree=1.0, importance_type='split',
                                learning_rate=0.1, max_depth=-1,
                                min_child_samples=20, min_child_weight=0.001,
                  

#### Quick test of a model tuned by PyCaret (not used afterwards)

In [6]:
# Run PyCaret's hyperparameter tuning on the selected model with default
# settings; the cell output lists the metrics for each cross-validation fold.
tuned_model = tune_model(best)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,19.8276,691.6503,26.2992,0.7806,0.6045,1.0684
1,26.4013,1288.1834,35.8913,0.8341,0.612,0.989
2,32.1288,2004.7607,44.7746,0.905,0.5194,0.56
3,30.6126,1788.6304,42.2922,0.923,0.4374,0.4294
4,31.2661,1865.0016,43.1857,0.9169,0.4576,0.4408
5,29.735,1722.2407,41.4999,0.9021,0.5718,0.7652
6,26.9223,1546.9043,39.3307,0.8936,0.5451,0.6821
7,49.2284,5790.7151,76.0967,0.8317,0.5378,0.6986
8,53.5653,6009.5002,77.521,0.8716,0.5923,0.7373
9,69.2522,8065.4575,89.8079,0.8296,0.6684,0.8883


In [7]:
# Persist the tuned pipeline under its own name, separate from 'lgbm_pipeline'.
save_model(tuned_model, 'tuned_lgbm_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='count',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy='...
                                colsample_bytree=1.0, feature_fraction=0.4,
                                importance_type='split', learning_rate=0.05,
                                max_depth=-1, min_child_samples=16,
                                min_child_weight=0.001, min_split_gain=0.6,
                

##### The best models seem to be XGBoost and LightGBM (gradient boosting methods in general), so I'll try those two on my own