In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

In [20]:
# from ucimlrepo import fetch_ucirepo 
  
# # fetch dataset 
# bike_sharing = fetch_ucirepo(id=275) 
  
# # data (as pandas dataframes) 
# X = bike_sharing.data.features 
# y = bike_sharing.data.targets 
  
# df = X.copy()
# df['cnt'] = y

# # metadata 
# # print(bike_sharing.metadata) 
  
# # # variable information 
# # print(bike_sharing.variables) 

# # bike_sharing.data.features.head()
# df.head()

In [21]:

# Read the hourly data set from the local CSV copy and preview the first rows.
filepath = "Datasets/hour.csv"
df = pd.read_csv(filepath_or_buffer=filepath)
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [22]:
# Columns that are removed from the raw table before the features are selected
# (this list includes the target column `cnt`).
drop_cols = [
    'temp',
    'instant',
    'dteday',
    'casual',
    'registered',
    'cnt',
]

# Inputs that go through the numeric preprocessing branch.
numeric_features = ['atemp', 'hum', 'windspeed']

# Integer-coded inputs that go through the categorical preprocessing branch.
# NOTE(review): `yr` is present in the loaded table but appears in neither feature
# list, so it never reaches the model — confirm this is intentional.
categorical_features = [
    'season',
    'mnth',
    'hr',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]

In [23]:
# Build the feature matrix X and the target vector y.
#
# The explicit column selection is what defines X. The drop in front of it only
# acts as a guard: if a name from `drop_cols` were ever listed as a feature, the
# selection fails with a KeyError instead of feeding that column to the model.
# `errors='ignore'` keeps the guard from raising when the frame does not carry
# every column named in `drop_cols`.
# NOTE(review): presumably the commented-out ucimlrepo loader produces such a
# frame (features + `cnt` only) — confirm against that loader's output.
X = df.drop(columns=drop_cols, errors='ignore')[numeric_features + categorical_features]
y = df['cnt']

X.head()

Unnamed: 0,atemp,hum,windspeed,season,mnth,hr,holiday,weekday,workingday,weathersit
0,0.2879,0.81,0.0,1,1,0,0,6,0,1
1,0.2727,0.8,0.0,1,1,1,0,6,0,1
2,0.2727,0.8,0.0,1,1,2,0,6,0,1
3,0.2879,0.75,0.0,1,1,3,0,6,0,1
4,0.2879,0.75,0.0,1,1,4,0,6,0,1


In [24]:
# Hold out 20 % of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (13903, 10), Test shape: (3476, 10)


In [25]:
# Optional clean-up (currently disabled): remove constant columns.
# const_cols = X_train.columns[X_train.nunique() <= 1].tolist()
# if const_cols:
#     print(f"Remove const columns:", const_cols)
#     X_train = X_train.drop(columns=const_cols)
#     X_test = X_test.drop(columns=const_cols)


# Keep only those configured feature names that are actually present in X_train.
available = set(X_train.columns)
num_cols = [name for name in numeric_features if name in available]
cat_cols = [name for name in categorical_features if name in available]

print("Numerical columns:\n\n", num_cols)
print("Categorical columns:\n\n", cat_cols)

# Numeric branch: mean imputation followed by standard scaling.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scalar', StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation followed by dense one-hot encoding.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Column transformer: num_pipeline is applied to num_cols, cat_pipeline to cat_cols.
preproc = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='drop',  # discard all other columns
)

# Actual model pipeline: preprocessing followed by a random forest regressor
# (used here in place of LinearRegression).
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
)
final_pipeline = Pipeline(
    steps=[
        ('preproc', preproc),
        ('rf', rf_model),
    ]
)

# Fit the whole pipeline on the training split.
final_pipeline.fit(X_train, y_train)
print("\n\n----- Pipeline fitted -----\n")

Numerical columns:

 ['atemp', 'hum', 'windspeed']
Categorical columns:

 ['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']


----- Pipeline fitted -----



In [26]:
# Score the fitted pipeline on the held-out test split.
y_pred = final_pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
mape = mean_absolute_percentage_error(y_test, y_pred)  # printed raw and multiplied by 100

# One report line per metric, printed in the order R2, RMSE, MAPE.
report = (
    f"R2 on test: {r2:.3f}",
    f"RMSE on test: {rmse:.3f}",
    f"MAPE on test: {mape:.3f} ({mape*100:.1f}%)",
)
print(*report, sep="\n")

R2 on test: 0.842
RMSE on test: 70.761
MAPE on test: 0.555 (55.5%)


In [27]:
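# Recorded output when reading the dataset from the local file
# R2 on test: 0.927
# RMSE on test: 48.192
# MAPE on test: 0.459 (45.9%)

# Recorded output when loading the dataset from the library
# R2 on test: 0.927
# RMSE on test: 48.192
# MAPE on test: 0.459 (45.9%)

# NOTE(review): these recorded figures do not match the evaluation output currently
# saved in this notebook (R2 0.842, RMSE 70.761, MAPE 55.5%) — presumably they come
# from an earlier configuration; re-run and update.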

In [28]:
# # TODO: Use GridSearchCV to find ideal hyperparameters
# # change pipeline name
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'rf__n_estimators': [50, 100, 200],
#     'rf__max_depth': [None, 10, 20],
#     'rf__min_samples_split': [2, 5],
#     'rf__min_samples_leaf': [1, 2]
# }

# grid = GridSearchCV(final_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)
# grid.fit(X_train, y_train)

# print("Best parameters:", grid.best_params_)
# best_model = grid.best_estimator_

# y_pred = best_model.predict(X_test)
# print(f"R2: {r2_score(y_test, y_pred):.3f}")
# print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
# print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred):.3f} ({mean_absolute_percentage_error(y_test, y_pred)*100:.1f}%)")

In [29]:
# Best parameters: {'rf__max_depth': None, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}