In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# Location of the hourly dataset, relative to the notebook's working directory.
file_path = "../Datasets/hour.csv"
df = pd.read_csv(file_path)

# Columns that are excluded from the feature matrix.
drop_cols = [
    'instant',
    'dteday',
    'casual',
    'registered',
    'atemp',
]

# Features handled by the numeric preprocessing branch.
numeric_features = [
    'temp',
    'hum',
    'windspeed',
]

# Features handled by the categorical (one-hot) preprocessing branch.
catigorical_features = [
    'season',
    'yr',
    'mnth',
    'hr',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]

In [2]:
# Work out which of the configured drop columns actually exist, report each
# one, then remove them from a fresh frame in a single call.
dropped = [col for col in drop_cols if col in df.columns]
for col in dropped:
    print(f"Dropped column: {col}")
X = df.drop(columns=dropped)

# Regression target: the 'cnt' column of the raw frame.
y = df['cnt']

# Keep only the declared numeric / categorical features, in frame order.
selected = set(numeric_features) | set(catigorical_features)
keep_cols = [col for col in X.columns if col in selected]
X = X[keep_cols]
X.shape

Dropped column: instant
Dropped column: dteday
Dropped column: casual
Dropped column: registered
Dropped column: atemp


(17379, 11)

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")

X_train: (13903, 11), X_test: (3476, 11)


In [4]:
# Columns with at most one distinct non-null value in the training split
# carry no signal; drop them from both splits so the frames stay aligned.
const_cols = X_train.columns[X_train.nunique(dropna=True) <= 1].tolist()
if const_cols:
    print("Remove const columns:", const_cols)
    X_train = X_train.drop(columns=const_cols)
    X_test = X_test.drop(columns=const_cols)

# Re-derive the feature lists so they only name columns that survived above.
num_cols = [c for c in numeric_features if c in X_train.columns]
cat_cols = [c for c in catigorical_features if c in X_train.columns]

print(f"Numerical columns: {num_cols}")
print(f"Categorical columns: {cat_cols}")

# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch; columns not listed here are
# dropped by ColumnTransformer's default remainder setting.
preproc = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

# Preprocessing and model in one estimator, so the transformers are fitted on
# the training split only and re-applied unchanged at predict time.
# NOTE: `use_label_encoder` is intentionally not passed -- XGBoost reports it
# as an unused parameter and emits a warning when it is supplied.
final_pipeline = Pipeline([
    ('preproc', preproc),
    ('xgb', XGBRegressor(
        n_estimators=100,
        random_state=42,
        objective='reg:squarederror',
        n_jobs=-1,
        tree_method='hist',
    )),
])

final_pipeline.fit(X_train, y_train)

print("--- Model trained ---")

Numerical columns: ['temp', 'hum', 'windspeed']
Categorical columns: ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
--- Model trained ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [5]:
# Score the fitted pipeline on the held-out split.
y_pred = final_pipeline.predict(X_test)

# R^2, root-mean-squared error and mean absolute percentage error.
# NOTE(review): MAPE is relative to the true value, so rows with very small
# targets can presumably inflate it -- confirm before comparing it with R^2.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

report_lines = (
    f"R2 on test:     {r2:.3f}",
    f"RMSE on test:   {rmse:.3f}",
    f"MAPE on test:   {mape:.3f} ({mape*100:.1f}%)",
)
print("\n".join(report_lines))

R2 on test:     0.936
RMSE on test:   45.192
MAPE on test:   0.657 (65.7%)
