In [1]:
from idstools._config import load_config
from idstools._objects import Target
from idstools.preparation import DataPreparation
from idstools.optimization import ModelOptimization

In [2]:
# Load configuration
# Parse the TestModel YAML file once; later cells read their pipeline
# definitions from attributes under `config.default`.
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm before running from another location.
config = load_config("../config/idstools/TestModel.yaml")

In [3]:
# Train Data
id = 1
input_path = "data/BikeRentalDaily_train.csv"
input_delimiter = ";"
label = "cnt"
index = "instant"

In [4]:
# Initialize Train Target
train_target = Target(id, input_path, input_delimiter, label, index)

2024-03-02 13:45:40,065 [_objects] [INFO] - Initializing TargetData object.
2024-03-02 13:45:40,080 [_helpers] [INFO] - Reading data from:
/home/davidrmn/Studies/introduction-data-science/data/BikeRentalDaily_train.csv
2024-03-02 13:45:40,098 [_objects] [INFO] - Using id: 1
2024-03-02 13:45:40,101 [_objects] [INFO] - Using label: cnt
2024-03-02 13:45:40,107 [_objects] [INFO] - Using index: instant
2024-03-02 13:45:40,108 [_objects] [INFO] - No features provided.
2024-03-02 13:45:40,109 [_objects] [INFO] - No environment name provided.
Using default environment name: SELF_EXECUTED
2024-03-02 13:45:40,110 [_objects] [INFO] - No step name provided.
2024-03-02 13:45:40,111 [_objects] [INFO] - Output path not provided.
Using default output path: /home/davidrmn/Studies/introduction-data-science/results


In [5]:
# Test Data
id = 2
input_path = "data/BikeRentalDaily_test.csv"
input_delimiter = ";"
label = "cnt"
index = "instant"

In [6]:
# Initialize Test Target
test_target = Target(id, input_path, input_delimiter, label, index)

2024-03-02 13:45:40,155 [_objects] [INFO] - Initializing TargetData object.
2024-03-02 13:45:40,164 [_helpers] [INFO] - Reading data from:
/home/davidrmn/Studies/introduction-data-science/data/BikeRentalDaily_test.csv
2024-03-02 13:45:40,173 [_objects] [INFO] - Using id: 2
2024-03-02 13:45:40,176 [_objects] [INFO] - Using label: cnt
2024-03-02 13:45:40,181 [_objects] [INFO] - Using index: instant
2024-03-02 13:45:40,182 [_objects] [INFO] - No features provided.
2024-03-02 13:45:40,183 [_objects] [INFO] - No environment name provided.
Using default environment name: SELF_EXECUTED
2024-03-02 13:45:40,185 [_objects] [INFO] - No step name provided.
2024-03-02 13:45:40,188 [_objects] [INFO] - Output path not provided.
Using default output path: /home/davidrmn/Studies/introduction-data-science/results


In [7]:
# Preprocessing Pipeline
pipeline = config.default.preprocessing.preparation.DataPreparation.pipeline

In [8]:
# Initialize DataPreparation 
preprcessing_pipeline = DataPreparation([train_target, test_target], pipeline=pipeline)

2024-03-02 13:45:40,518 [preparation] [INFO] - Initializing DataPreparation for multiple targets.


In [9]:
# Summary statistics of the training data before the preparation pipeline has
# run, transposed so that each column of the data becomes one row.
train_target.data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,538.0,2.436803,1.10724,1.0,1.0,2.0,3.0,4.0
yr,600.0,0.496667,0.500406,0.0,0.0,0.0,1.0,1.0
mnth,600.0,6.475,3.442741,1.0,4.0,6.0,9.0,12.0
holiday,600.0,0.03,0.17073,0.0,0.0,0.0,0.0,1.0
weekday,600.0,2.928333,2.07936,-1.0,1.0,3.0,5.0,6.0
workingday,600.0,0.68,0.466865,0.0,0.0,1.0,1.0,1.0
weathersit,600.0,1.401667,0.54236,1.0,1.0,1.0,2.0,3.0
temp,600.0,19.808284,7.206843,2.3652,13.575,20.1,26.0583,34.4667
atemp,600.0,0.47362,0.160439,0.07907,0.338256,0.48969,0.605127,0.826371
hum,566.0,93.8452,21.380635,28.18755,77.32815,93.2568,109.921913,145.875


In [10]:
# Run DataPreparation
# Executes the preparation pipeline that was built from the train and test
# targets; the next cell inspects `train_target.processed_data`.
# NOTE(review): `preprcessing_pipeline` is a misspelling of "preprocessing" —
# rename it here and where it is created in one go.
preprcessing_pipeline.run()

2024-03-02 13:45:40,727 [preparation] [INFO] - Pipeline created:
_CustomTransformer:
- func: target_to_datetime
  module: idstools._transformer
  config:
    target: dteday
    format: '%d.%m.%Y'
- func: impute_season
  module: idstools._transformer
  config:
    target: season
    date: dteday
- func: negative_to_nan
  module: idstools._transformer
  config:
    target: windspeed
- func: process_weekday
  module: idstools._transformer
  config:
    target: weekday
    date: dteday
- func: remove_outliers
  module: idstools._transformer
  config:
    target: cnt
_OneHotEncoder:
- target: mnth
  config:
    prefix: mnth
    dtype: int
    drop_first: true
- target: weekday
  config:
    prefix: weekday
    dtype: int
    drop_first: true
- target: weathersit
  config:
    prefix: weathersit
    dtype: int
    drop_first: true
_SimpleImputer:
- target:
  - hum
  - windspeed
  config:
    strategy: mean
_FeatureDropper:
- target:
  - casual
  - registered
  - temp
  - dteday
  - workingda

In [11]:
# Summary statistics of the training data after the preparation pipeline,
# transposed so that each column of the data becomes one row.
train_target.processed_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,596.0,2.459732,1.112219,1.0,1.0,2.0,3.0,4.0
yr,596.0,0.493289,0.500375,0.0,0.0,0.0,1.0,1.0
holiday,596.0,0.030201,0.171285,0.0,0.0,0.0,0.0,1.0
atemp,596.0,0.472884,0.160718,0.07907,0.337891,0.487364,0.605127,0.826371
hum,596.0,93.899592,20.789893,28.18755,78.468712,93.899592,108.43755,145.875
windspeed,596.0,0.1917,0.079144,0.022392,0.134954,0.182217,0.233376,0.507463
leaflets,596.0,739.41443,164.049903,459.0,600.75,736.0,875.0,1032.0
price reduction,596.0,0.149329,0.356712,0.0,0.0,0.0,0.0,1.0
cnt,596.0,4461.520134,1905.340166,22.0,3111.5,4510.0,5879.25,8227.0
mnth_2,596.0,0.082215,0.274922,0.0,0.0,0.0,0.0,1.0


In [12]:
# Model Optimization
pipeline = config.default.Evaluation.optimization.ModelOptimization.pipeline

In [13]:
# Initialize ModelOptimization
optimization_pipeline = ModelOptimization([train_target, test_target], pipeline=pipeline)

2024-03-02 13:45:41,232 [optimization] [INFO] - Initializing ModelOptimization


In [14]:
# Run the model optimization pipeline.
# This is the expensive cell: the recorded run logged roughly two minutes
# between its first and last message.
# In that run QuantileRegressor failed with "Solver interior-point is not
# anymore available in SciPy >= 1.11.0"; the remaining models still ran.
optimization_pipeline.run()

2024-03-02 13:45:41,262 [optimization] [INFO] - Running model optimization pipeline.
2024-03-02 13:45:41,267 [optimization] [INFO] - Retrieving model RandomForestRegressor with param grid {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'random_state': [42]}.
2024-03-02 13:45:41,390 [optimization] [INFO] - Retrieving model XGBRegressor with param grid {'n_estimators': [100, 200, 300], 'max_depth': [3, 4, 5], 'learning_rate': [0.1, 0.01, 0.001], 'random_state': [42]}.
2024-03-02 13:45:41,461 [optimization] [INFO] - Retrieving model ExtraTreesRegressor with param grid {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'random_state': [42]}.
2024-03-02 13:45:41,463 [optimization] [INFO] - Retrieving model LinearRegression with param grid {'fit_intercept': [True, False], 'copy_X': [True,

QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 42/42 [00:53<00:00,  1.27s/it]
2024-03-02 13:46:36,025 [optimization] [INFO] - Prepare validation data BikeRentalDaily_test for LazyRegressor.
2024-03-02 13:46:36,028 [optimization] [INFO] - Predicting target BikeRentalDaily_test for LazyRegressor.
2024-03-02 13:46:36,030 [optimization] [INFO] - No predict method found for model LazyRegressor.
2024-03-02 13:46:36,047 [optimization] [INFO] - Retrieving model LinearRegression.
2024-03-02 13:46:36,048 [optimization] [INFO] - Preparing target for LinearRegression.
2024-03-02 13:46:36,054 [optimization] [INFO] - Performed splitting for BikeRentalDaily_train
Training set: 476 samples
Testing set: 120 samples
2024-03-02 13:46:36,055 [optimization] [INFO] - Fitting BikeRentalDaily_train for LinearRegression.
2024-03-02 13:46:36,063 [optimization] [INFO] - Saving model to models/LinearRegression.pkl.
2024-03-02 13:46:36,068 [optimization] [INFO] - Saving features to models/LinearRegression_features.pkl.
2024-03-02 13:46:36,081 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 689
[LightGBM] [Info] Number of data points in the train set: 476, number of used features: 25
[LightGBM] [Info] Start training from score 4459.361345


2024-03-02 13:47:52,305 [optimization] [INFO] - Prepare validation data BikeRentalDaily_test for GridSearchCV.
2024-03-02 13:47:52,307 [optimization] [INFO] - Predicting target BikeRentalDaily_test for GridSearchCV.


In [15]:
# Show the metrics recorded on the test target (r2_score and mae under
# 'FinalModel' in the saved output) — presumably populated by the optimization
# run above; verify against ModelOptimization.
test_target.analysis_results

{'FinalModel': {'r2_score': 0.8261436181418973, 'mae': 620.4309888844803}}