In [102]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

In [103]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id=275 from the UCI ML Repository through ucimlrepo.
# `bike_sharing.metadata` and `bike_sharing.variables` can be printed to
# inspect the dataset description and the per-variable information.
bike_sharing = fetch_ucirepo(id=275)

# Features and targets are exposed as pandas DataFrames.
X = bike_sharing.data.features
y = bike_sharing.data.targets

# Single working frame: a copy of the feature columns plus the target stored
# as column `cnt`. The fetched `X` itself is left unmodified.
df = X.assign(cnt=y)

df.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


In [104]:

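# Alternative data source: read the dataset from a local CSV file instead of fetching it from the library.
# filepath = "Datasets/hour.csv"
# df = pd.read_csv(filepath)
# df.head()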

In [105]:
# Columns removed from the feature matrix when they are present in the frame.
drop_cols = [
    'atemp',
    'instant',
    'dteday',
    'casual',
    'registered',
]

# Features handled by the numeric branch of the preprocessing
# (mean imputation followed by standard scaling).
numeric_features = [
    'temp',
    'hum',
    'windspeed',
]

# Features handled by the categorical branch of the preprocessing
# (most-frequent imputation followed by one-hot encoding).
categorical_features = [
    'season',
    'yr',
    'mnth',
    'hr',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]

In [106]:
# Build the feature matrix X and the target y from the working frame `df`.
#
# Not every name in `drop_cols` has to exist in `df`, so only the columns that
# are actually present are dropped (and reported). `dict.fromkeys` removes
# duplicate names while keeping the configured order, so each column is
# reported at most once.
present_drop_cols = [c for c in dict.fromkeys(drop_cols) if c in df.columns]
for c in present_drop_cols:
    print(f"Dropped column: {c}")

# A single drop call returns a new frame; `df` itself stays untouched.
X = df.drop(columns=present_drop_cols)
y = df['cnt']

# Restrict X to the declared model inputs, keeping the frame's column order.
# This also removes the target column `cnt` from the features.
keep_cols = [c for c in X.columns if c in numeric_features or c in categorical_features]
X = X[keep_cols]

# Last expression of the cell: rendered as the cell output.
X.head()

Dropped column: atemp
Dropped column: dteday


Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed
0,1,0,1,0,0,6,0,1,0.24,0.81,0.0
1,1,0,1,1,0,6,0,1,0.22,0.8,0.0
2,1,0,1,2,0,6,0,1,0.22,0.8,0.0
3,1,0,1,3,0,6,0,1,0.24,0.75,0.0
4,1,0,1,4,0,6,0,1,0.24,0.75,0.0


In [107]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (13903, 11), Test shape: (3476, 11)


In [108]:
# --- Remove constant columns -------------------------------------------------
# A column with at most one distinct non-null value in the training split is
# treated as constant. Detection uses the training data only; the same columns
# are then removed from both splits.
is_constant = X_train.nunique(dropna=True).le(1)
const_cols = X_train.columns[is_constant].tolist()
if const_cols:
    print("Remove const columns:", const_cols)
    X_train = X_train.drop(columns=const_cols)
    X_test = X_test.drop(columns=const_cols)

# --- Resolve the feature lists against the remaining columns ------------------
num_cols = [name for name in numeric_features if name in X_train.columns]
cat_cols = [name for name in categorical_features if name in X_train.columns]

print("Numerical columns:\n\n", num_cols)
print("Categorical columns:\n\n", cat_cols)

# --- Preprocessing ------------------------------------------------------------
# Numeric features: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scalar', StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode into a dense array. With handle_unknown='ignore', categories
# not seen during fit do not raise at transform time.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Route num_cols through num_pipeline and cat_cols through cat_pipeline;
# every other column is discarded.
preproc = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='drop',
)

# --- Model --------------------------------------------------------------------
# Full pipeline: preprocessing followed by a random-forest regressor
# (100 trees, fixed seed for reproducibility).
final_pipeline = Pipeline(
    steps=[
        ('preproc', preproc),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# Fit preprocessing and model together on the training split.
final_pipeline.fit(X_train, y_train)
print("\n\n----- Pipeline fitted -----\n")

Numerical columns:

 ['temp', 'hum', 'windspeed']
Categorical columns:

 ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']


----- Pipeline fitted -----



In [109]:
# Score the fitted pipeline on the held-out test split.
y_pred = final_pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# MAPE is returned as a fraction; it is also shown as a percentage below.
mape = mean_absolute_percentage_error(y_test, y_pred)

print(
    f"R2 on test: {r2:.3f}",
    f"RMSE on test: {rmse:.3f}",
    f"MAPE on test: {mape:.3f} ({mape*100:.1f}%)",
    sep="\n",
)

R2 on test: 0.927
RMSE on test: 48.192
MAPE on test: 0.459 (45.9%)


In [None]:
# Output when reading the dataset from the local file:
# R2 on test: 0.927
# RMSE on test: 48.192
# MAPE on test: 0.459 (45.9%)

# Output when reading the dataset from the library: