In [6]:
# Requirements for the project

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

In [7]:
file_path = "Datasets/hour.csv"
df = pd.read_csv(file_path)

# Viewing the data
# df.info()
# df.head()
# print('\n\ndescribe\n\n')
# df.describe()

drop_cols = ['casual', 'registered', 'instant', 'dteday']
numeric_features = ['temp', 'hum', 'atemp']
categorical_features = ['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']

In [8]:
X = df.copy()
for c in drop_cols:
    if c in X.columns:
        X = X.drop(columns=[c])
        # print(f"Dropped column: {c}")
if 'cnt' in X.columns: #falls cnt versehentlich in X ist, remove it
    X = X.drop(columns=['cnt'])
y = df['cnt'] # Zielvariable
#only numerical values for simple linear regression model

keep_cols = [c for c in X.columns if c in numeric_features or c in categorical_features]
X = X[keep_cols]


In [9]:
# train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# random_state sorgt für Reproduzierbarkeit der Aufteilung
print("\n\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)



Train shape: (13903, 10) Test shape: (3476, 10)


In [12]:
#Preprocessing +  Linear Regresssion Pipeline erstellen und fitten

# delete cosnt cols
const_cols = X_train.columns[X_train.nunique(dropna=True) <= 1].tolist()
if const_cols:
    print(f"Remove const columns:", const_cols)
    X_train = X_train.drop(columns=const_cols)
    X_test = X_test.drop(columns=const_cols)


num_cols = [c for c in numeric_features if c in X_train.columns]
cat_cols = [c for c in categorical_features if c in X_train.columns]

print("Numerical columns:\n\n", num_cols)
print("Categorical columns:\n\n", cat_cols)

# numerical features: imputation + standard scaling
num_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scalar', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

# Column Transformer: num_pipeline auf num_cols, cat_pipeline auf cat_cols
preproc = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ],
    remainder='drop' # alle anderen Spalten verwerfen
)

# acutual pipeline, combining preprocessing and linear Regression model
final_pipeline = Pipeline(
    [
        ('preproc', preproc),
        ('lr', LinearRegression())
    ]
)

# Pipeline fitten
y_transform = np.log1p(y_train)
final_pipeline.fit(X_train, y_transform)

ttr = TransformedTargetRegressor(regressor=final_pipeline, func=np.log1p, inverse_func=np.expm1) 
ttr.fit(X_train, y_train) # fittet preproc + lr auf X_train und log1p(y_train) intern
y_pred_ttr = ttr.predict(X_test) # gibt direkte Vorhersage auf Originalskala zurück
r2_ttr = r2_score(y_test, y_pred_ttr)
rmse_ttr = np.sqrt(mean_sqaured_error(y_test, y_pred_ttr))
mape_ttr = mean_absolute_percentage_error(y_test, y_pred_ttr)
print("TransformedTargetRegressor (log1p) -> R2: {:.3f}, RMSE: {:.3f}, MAPE: {:.3f} ({:.1f}%))"
    .format(r2_ttr, rmse_ttr, mape_ttr, mape_ttr*100))

print("\n\n----- Pipeline fitted -----\n")

Numerical columns:

 ['temp', 'hum', 'atemp']
Categorical columns:

 ['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']


NameError: name 'mean_sqaured_error' is not defined

In [11]:

y_pred = final_pipeline.predict(X_test)
test = np.expm1(y_pred)  # inverse of log1p
r2 = r2_score(y_test, test)
rmse = np.sqrt(mean_squared_error(y_test, test))

print(f"R2 on test: {r2:.3f}")
print(f"RMSE on test: {rmse:.3f}")

R2 on test: 0.626
RMSE on test: 108.815
