In [24]:
# Requirements for the project

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

In [None]:
# Load the raw dataset from a CSV file; the path is relative to the
# notebook's working directory.
file_path = "../Datasets/hour.csv"
df = pd.read_csv(file_path)

# Columns excluded from the feature matrix.
# Author's note: the current selection works best so far
# (feature selection, R2 = 0.6).
drop_cols = [
    'temp',
    'casual',
    'registered',
    'instant',
    'dteday',
    'workingday',
    'weekday',
    'windspeed',
]

# Features routed through the numeric preprocessing branch.
numeric_features = [
    'atemp',
    'hum',
]

# Features routed through the categorical (one-hot) preprocessing branch.
categorical_features = [
    'season',
    'mnth',
    'hr',
    'holiday',
    'weathersit',
]

In [26]:
# Target variable: the `cnt` column of the loaded frame.
y = df['cnt']

# Feature matrix: start from a fresh frame without the excluded columns and
# without the target. Names that are not present in `df` are skipped silently
# (errors='ignore'), so the cell also works if a column is already missing.
X = df.drop(columns=[*drop_cols, 'cnt'], errors='ignore')

# Safety net: keep only columns that are explicitly declared as numeric or
# categorical features, preserving the frame's own column order.
allowed_features = set(numeric_features) | set(categorical_features)
keep_cols = [col for col in X.columns if col in allowed_features]
X = X[keep_cols]

# 80/20 train/test split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Build the preprocessing + linear-regression pipeline and fit it.

# Safety check for constant columns: any column with at most one distinct
# non-null value in the training split is removed from both splits.
const_cols = [
    col
    for col, n_unique in X_train.nunique(dropna=True).items()
    if n_unique <= 1
]
if len(const_cols) > 0:
    print("Remove const columns:", const_cols)
    X_train = X_train.drop(columns=const_cols)
    X_test = X_test.drop(columns=const_cols)

# Restrict the declared feature lists to columns that are still present.
num_cols = [name for name in numeric_features if name in X_train.columns]
cat_cols = [name for name in categorical_features if name in X_train.columns]

# Numeric branch: mean imputation followed by standard scaling.
num_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scalar', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Column transformer: num_pipeline on num_cols, cat_pipeline on cat_cols;
# every other column is discarded.
preproc = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by an ordinary linear regression.
final_pipeline = Pipeline(
    steps=[
        ('preproc', preproc),
        ('lr', LinearRegression()),
    ]
)

# The model is fitted against log1p(y_train), so its predictions are on the
# log1p scale as well.
y_transform = np.log1p(y_train)
final_pipeline.fit(X_train, y_transform)

print("\n\n----- Pipeline fitted -----")



----- Pipeline fitted -----


In [28]:
# Evaluate the fitted pipeline on the held-out test split.
# The pipeline was fitted on log1p(y), so predictions are mapped back to the
# original scale with expm1 (the inverse of log1p) before scoring.
y_pred = final_pipeline.predict(X_test)
test = np.expm1(y_pred)

# Coefficient of determination and root-mean-squared error on the original scale.
r2 = r2_score(y_true=y_test, y_pred=test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test))

print(
    f"R2 on test: {r2:.3f}",
    f"RMSE on test: {rmse:.3f}",
    sep="\n",
)

R2 on test: 0.636
RMSE on test: 107.425
