In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformation_functions import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
import joblib

In [6]:
# --- Load and prepare the raw frame ------------------------------------------
# The helpers come from transformation_functions (star-imported in the first cell).
X = (
    import_data()
    .drop(columns=['atemp', 'hum', 'weekday'])  # dropped due to multicollinearity
    .pipe(transform_feature_weathersit)          # recode 'weathersit' value 4 as 3
    .pipe(create_lagged_values)                  # add lag features for 'temp', 'cnt', 'windspeed', 'weathersit'
)

# --- Split predictors from the target ----------------------------------------
Target = X['cnt']
Features = X.drop(columns=['cnt'])

# --- Standardize the target --------------------------------------------------
# The scaler is fitted on cnt only; reshape(-1, 1) gives it the 2-D column it expects.
scaler = StandardScaler()
Target = scaler.fit_transform(Target.values.reshape(-1, 1))
# Reuse the already-fitted scaler (.transform) on the lagged cnt features.
Features, Target = normalize_lag_values(scaler, Features, Target)

# --- One-hot encode the categorical columns ----------------------------------
categorical_cols = [
    'lagged_weathersit_24h',
    'lagged_weathersit_25h',
    'season',
    'mnth',
    'hr',
]

# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when the environment is upgraded.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)

# Only the categorical columns are encoded; every other column passes through unchanged.
column_transformer = ColumnTransformer(
    transformers=[('onehot', one_hot_encoder, categorical_cols)],
    remainder='passthrough',
)

# Fit the transformer, then rebuild a DataFrame labelled with the generated column names.
encoded_data = column_transformer.fit_transform(Features)
encoded_data_df = pd.DataFrame(
    encoded_data,
    columns=column_transformer.get_feature_names_out(),
)

# Remove columns that are not relevant to prediction (e.g. dteday).
Features = drop_columns(encoded_data_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cnt'] = bike_sharing_dataset.data.targets


In [7]:
# Hyperparameters of the final CatBoost regressor, collected in one place.
cb_params = {
    'depth': 8,
    'iterations': 100,
    'l2_leaf_reg': 7,
    'learning_rate': 0.4,
}
cb_model = CatBoostRegressor(**cb_params)

# Train on the full encoded feature matrix (no hold-out split in this notebook).
# As the last expression of the cell, the fitted model is echoed as the output.
cb_model.fit(Features, Target)

0:	learn: 0.7462756	total: 145ms	remaining: 14.3s
1:	learn: 0.5894945	total: 154ms	remaining: 7.52s
2:	learn: 0.5134099	total: 160ms	remaining: 5.18s
3:	learn: 0.4775457	total: 165ms	remaining: 3.97s
4:	learn: 0.4491382	total: 171ms	remaining: 3.24s
5:	learn: 0.4396198	total: 176ms	remaining: 2.75s
6:	learn: 0.4261477	total: 181ms	remaining: 2.41s
7:	learn: 0.4184545	total: 186ms	remaining: 2.14s
8:	learn: 0.4150159	total: 192ms	remaining: 1.94s
9:	learn: 0.4115481	total: 197ms	remaining: 1.77s
10:	learn: 0.4057745	total: 202ms	remaining: 1.64s
11:	learn: 0.4029409	total: 207ms	remaining: 1.52s
12:	learn: 0.3988351	total: 211ms	remaining: 1.42s
13:	learn: 0.3957390	total: 216ms	remaining: 1.33s
14:	learn: 0.3938884	total: 221ms	remaining: 1.25s
15:	learn: 0.3907635	total: 225ms	remaining: 1.18s
16:	learn: 0.3888567	total: 230ms	remaining: 1.12s
17:	learn: 0.3857046	total: 234ms	remaining: 1.07s
18:	learn: 0.3813889	total: 239ms	remaining: 1.02s
19:	learn: 0.3783686	total: 243ms	remaini

<catboost.core.CatBoostRegressor at 0x262094b26d0>

In [9]:
# Training-set score: the model is evaluated on the same rows it was fitted on,
# so this is an optimistic figure, not an estimate of generalization error.
# Target is on the standardized scale produced by `scaler`, so the error is
# expressed in that scale rather than in raw `cnt` units.
cb_mae = mean_absolute_error(Target, cb_model.predict(Features))
# The metric computed above is the mean absolute error, so label it as MAE.
print("CatBoost MAE on full set:", cb_mae)

CatBoost MSE on full set: 0.17244074662252962


In [10]:

# Persist the fitted preprocessing objects and the trained model so that other
# notebooks can reload them. Keys are the output file names; the entries are
# written in the order listed.
artifacts = {
    'column_transformer_one_hot_encoder.pkl': column_transformer,  # fitted ColumnTransformer (one-hot encoding step)
    'saved_standardScaler.pkl': scaler,                            # StandardScaler fitted on the target
    "trained_regression_model.pkl": cb_model,                      # trained CatBoost model
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# Closing note, kept as a bare string so it is echoed as the cell's output.
"""based on the saved model and steps shown above a pipeline can be created which automizes the prediction process
Since we use the lagged values of cnt, it means a new function should be created in order to get these values.
"""

'based on the saved model and steps shown above a pipeline can be created which automizes the prediction process\nSince we use the lagged values of cnt, it means a new function should be created in order to get these values.\n'