In [1]:
from idstools._config import load_config
from idstools._objects import Target
from idstools.preparation import DataPreparation
from idstools.optimization import ModelOptimization

In [2]:
# Load the configuration stored in the TestModel YAML file
config_file = "../config/idstools/TestModel.yaml"
config = load_config(config_file)

In [3]:
# Train Data
id = 1
input_path = "data/BikeRentalDaily_train.csv"
input_delimiter = ";"
label = "cnt"
index = "instant"

In [4]:
# Test Data
id = 2
input_path = "data/BikeRentalDaily_test.csv"
input_delimiter = ";"
label = "cnt"
index = "instant"

In [5]:
# Initialize Targets
train_target = Target(id, input_path, input_delimiter, label, index)
test_target = Target(id, input_path, input_delimiter, label, index)

2024-03-02 12:49:54,069 [_objects] [INFO] - Initializing TargetData object.
2024-03-02 12:49:54,071 [_helpers] [INFO] - Reading data from:
/home/davidrmn/Studies/introduction-data-science/data/BikeRentalDaily_test.csv
2024-03-02 12:49:54,074 [_objects] [INFO] - Using id: 2
2024-03-02 12:49:54,075 [_objects] [INFO] - Using label: cnt
2024-03-02 12:49:54,075 [_objects] [INFO] - Using index: instant
2024-03-02 12:49:54,075 [_objects] [INFO] - No features provided.
2024-03-02 12:49:54,076 [_objects] [INFO] - No environment name provided.
Using default environment name: SELF_EXECUTED
2024-03-02 12:49:54,076 [_objects] [INFO] - No step name provided.
2024-03-02 12:49:54,076 [_objects] [INFO] - Output path not provided.
Using default output path: /home/davidrmn/Studies/introduction-data-science/results
2024-03-02 12:49:54,077 [_objects] [INFO] - Initializing TargetData object.
2024-03-02 12:49:54,077 [_helpers] [INFO] - Reading data from:
/home/davidrmn/Studies/introduction-data-science/data/

In [6]:
# Preprocessing pipeline definition, taken from the loaded configuration
preparation_settings = config.default.preprocessing.preparation.DataPreparation
pipeline = preparation_settings.pipeline

In [7]:
# Build a single DataPreparation over both targets with the configured pipeline.
# NOTE(review): the variable is spelled `preprcessing_pipeline` (sic); the cell
# that calls `.run()` uses the same spelling, so rename both together.
preprcessing_pipeline = DataPreparation(
    [train_target, test_target],
    pipeline=pipeline,
)

2024-03-02 12:49:54,169 [preparation] [INFO] - Initializing DataPreparation for multiple targets.


In [8]:
# Summary statistics of the unprocessed train target data, one row per column
train_target.data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,121.0,2.652893,1.123319,1.0,2.0,3.0,4.0,4.0
yr,132.0,0.515152,0.501674,0.0,0.0,1.0,1.0,1.0
mnth,132.0,6.75,3.497818,1.0,3.75,7.0,10.0,12.0
holiday,132.0,0.022727,0.1496,0.0,0.0,0.0,0.0,1.0
weekday,132.0,2.590909,2.154188,-1.0,1.0,2.0,5.0,6.0
workingday,132.0,0.704545,0.457985,0.0,0.0,1.0,1.0,1.0
weathersit,132.0,1.371212,0.55815,1.0,1.0,1.0,2.0,3.0
temp,132.0,19.844301,7.827241,3.8957,13.3,19.2,27.366675,33.9333
atemp,132.0,0.477679,0.173988,0.11793,0.325754,0.470002,0.635919,0.840896
hum,126.0,93.645418,21.467624,0.0,80.692913,93.46875,107.484413,139.5


In [9]:
# Run the DataPreparation instance that was built for the train and test targets
preprcessing_pipeline.run()

2024-03-02 12:49:54,208 [preparation] [INFO] - Pipeline created:
_StandardScaler:
- target:
  - leaflets
  - hum
_CustomTransformer:
- func: target_to_datetime
  module: idstools._transformer
  config:
    target: dteday
    format: '%d.%m.%Y'
- func: impute_season
  module: idstools._transformer
  config:
    target: season
    date: dteday
- func: negative_to_nan
  module: idstools._transformer
  config:
    target: windspeed
- func: process_weekday
  module: idstools._transformer
  config:
    target: weekday
    date: dteday
- func: remove_outliers
  module: idstools._transformer
  config:
    target: cnt
_OneHotEncoder:
- target: mnth
  config:
    prefix: mnth
    dtype: int
    drop_first: true
- target: weekday
  config:
    prefix: weekday
    dtype: int
    drop_first: true
- target: weathersit
  config:
    prefix: weathersit
    dtype: int
    drop_first: true
_SimpleImputer:
- target:
  - hum
  - windspeed
  config:
    strategy: mean
_FeatureDropper:
- target:
  - casual


In [10]:
# Summary statistics of the train target after preparation, one row per column
train_target.processed_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,131.0,2.671756,1.112466,1.0,2.0,3.0,4.0,4.0
yr,131.0,0.51145,0.501788,0.0,0.0,1.0,1.0,1.0
holiday,131.0,0.022901,0.150161,0.0,0.0,0.0,0.0,1.0
atemp,131.0,0.477278,0.174595,0.11793,0.325125,0.469054,0.636282,0.840896
hum,131.0,0.004567,0.983211,-4.379583,-0.600166,0.004567,0.617255,2.144515
windspeed,131.0,0.185027,0.06857,0.042304,0.135583,0.178479,0.226681,0.385571
leaflets,131.0,-0.010794,0.999943,-1.639572,-0.942248,-0.042475,0.919157,1.565869
price reduction,131.0,0.167939,0.375247,0.0,0.0,0.0,0.0,1.0
cnt,131.0,4543.412214,1961.288159,431.0,3269.0,4570.0,6027.0,7852.0
mnth_2,131.0,0.061069,0.240376,0.0,0.0,0.0,0.0,1.0


In [None]:
# Set up ModelOptimization over the train and test targets
optimization_pipeline = ModelOptimization([train_target, test_target])