In [41]:
# --- Requirements --- 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

In [42]:
# --- loading data ---
file_path = "../Datasets/hour.csv"
df = pd.read_csv(file_path)

# --- defining features ---
# Columns that are deliberately kept out of the model inputs.
# NOTE(review): 'casual' and 'registered' are presumably sub-counts of the
# target 'cnt' and would leak it -- confirm against the dataset description.
drop_cols = ['instant', 'dteday', 'casual', 'registered', 'temp', 'yr', 'windspeed']
numeric_features = ['atemp', 'hum']
categorical_features = ['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']

# --- validating the feature definition ---
# Fail early with a readable message instead of relying on a later selection
# step to silently hide a misconfiguration: no excluded column and never the
# target itself may be used as a model input.
feature_cols = numeric_features + categorical_features
leaking = sorted(set(feature_cols) & (set(drop_cols) | {'cnt'}))
assert not leaking, f"Excluded/target columns used as features: {leaking}"
missing = sorted(set(feature_cols + ['cnt']) - set(df.columns))
assert not missing, f"Columns missing from {file_path}: {missing}"

# --- selecting model inputs and target ---
# Selecting the features explicitly makes a separate drop of unwanted columns
# unnecessary; column order is numeric features first, then categorical ones.
X = df[feature_cols]
y = df['cnt']

# --- splitting data ---
# 80/20 random split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [43]:
# --- preprocessing and model pipeline ---

# --- safety check: removing constant columns ---
# A column with at most one distinct non-null value in the training split
# cannot help the model, so it is removed from both splits.
const_cols = X_train.columns[X_train.nunique(dropna=True) <= 1].tolist()
if const_cols:
    print("Remove const columns:", const_cols)
    X_train = X_train.drop(columns=const_cols)
    X_test = X_test.drop(columns=const_cols)

# Keep only the configured features that survived the constant-column check.
num_cols = [c for c in numeric_features if c in X_train.columns]
cat_cols = [c for c in categorical_features if c in X_train.columns]

# --- numerical features: imputation + standard scaling ---
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# --- categorical features: imputation + one hot encoding ---
# handle_unknown='ignore' keeps prediction from failing on a category that was
# not present in the training split.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- column transformer: num_pipeline on num_cols, cat_pipeline on cat_cols ---
preproc = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# --- actual pipeline, combining preprocessing and the XGBoost regressor ---
# `use_label_encoder` is intentionally not passed: XGBoost reports it as an
# unused parameter and emits a warning on every fit.
XGB_model = Pipeline([
    ('preproc', preproc),
    ('xgb', XGBRegressor(
        n_estimators=100,
        random_state=42,
        objective='reg:squarederror',
        n_jobs=-1,
        tree_method='hist'))
])

# --- fitting the pipeline ---
XGB_model.fit(X_train, y_train)

print("\033[94m\n\n----- Pipeline fitted -----\033[0m")

[94m

----- Pipeline fitted -----[0m


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [44]:
# --- Evaluation on test set ---
# Score the fitted pipeline on the held-out split.
y_pred = XGB_model.predict(X_test)

# The three metrics are independent of each other; each compares the same
# predictions against the true test targets.
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# NOTE(review): MAPE is relative to the actual values, so rows with very small
# actual counts can dominate it -- read it together with RMSE and R2.
print(
    f"R2 on test:     {r2:.3f}",
    f"RMSE on test:   {rmse:.3f}",
    f"MAPE on test:   {mape:.3f} ({mape*100:.1f}%)",
    sep="\n",
)

R2 on test:     0.853
RMSE on test:   68.152
MAPE on test:   0.753 (75.3%)
