In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the two passenger-count tables: the 2022 file becomes the training set,
# the 2023 file the held-out test set (both presumably pre-cleaned, going by
# the file name).
trainset, testset = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'data/fahrgastzahlen_2022/cleaned.csv',
        'data/fahrgastzahlen_2023/cleaned.csv',
    )
)

In [7]:
# Summarise the training data: number of rows, column dtypes and non-null counts.
trainset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960951 entries, 0 to 960950
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Time           960951 non-null  float64
 1   Nachtnetz      960951 non-null  int64  
 2   Capacity       960951 non-null  float64
 3   Occupancy      960951 non-null  float64
 4   GPS_Latitude   960951 non-null  float64
 5   GPS_Longitude  960951 non-null  float64
 6   Weekday        960951 non-null  int64  
dtypes: float64(5), int64(2)
memory usage: 51.3 MB


In [8]:
# Same summary for the test data. Per the recorded outputs, 'Capacity' is int64
# here but float64 in the training set; it is dropped before modelling, so the
# mismatch does not reach the estimators.
testset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1127063 entries, 0 to 1127062
Data columns (total 7 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   Time           1127063 non-null  float64
 1   Nachtnetz      1127063 non-null  int64  
 2   Capacity       1127063 non-null  int64  
 3   Occupancy      1127063 non-null  float64
 4   GPS_Latitude   1127063 non-null  float64
 5   GPS_Longitude  1127063 non-null  float64
 6   Weekday        1127063 non-null  int64  
dtypes: float64(4), int64(3)
memory usage: 60.2 MB


In [9]:
# Standardise the continuous features. The scaler learns its statistics from the
# training set only and is then applied unchanged to both sets.
columns_to_normalize = [
    'Time',
    'GPS_Latitude',
    'GPS_Longitude',
    # 'Occupancy': not necessary to normalise?
    # 'Capacity': only used to restore the number of empty seats
]
scaler = StandardScaler()
scaler.fit(trainset[columns_to_normalize])

# Overwrite the selected columns in place with their scaled values.
for frame in (trainset, testset):
    frame[columns_to_normalize] = scaler.transform(frame[columns_to_normalize])

In [10]:
# Split each set into predictors and target. 'Occupancy' is the value to predict;
# 'Capacity' is removed from the predictors along with it.
non_feature_columns = ['Capacity', 'Occupancy']
X_train, y_train = trainset.drop(columns=non_feature_columns), trainset['Occupancy']
X_test, y_test = testset.drop(columns=non_feature_columns), testset['Occupancy']

In [11]:
# Preview the first rows of the training features after scaling and column removal.
X_train.head()

Unnamed: 0,Time,Nachtnetz,GPS_Latitude,GPS_Longitude,Weekday
0,1.329027,0,0.491249,-1.005551,1
1,1.329027,0,0.491249,-1.005551,1
2,1.417528,0,0.491249,-1.005551,1
3,1.417528,0,0.491249,-1.005551,1
4,1.50603,0,0.491249,-1.005551,1


In [12]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

def calc_metrics(y_true, y_pred):
    """Compute and print three regression error metrics.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(mape, mae, mse)`` tuple as returned by the corresponding
        scikit-learn metric functions.

    NOTE(review): the recorded runs report MAPE values on the order of 1e12,
    which suggests ``y_true`` contains values at or near zero. Confirm whether
    MAPE is a meaningful score for this target.
    """
    scorers = (
        mean_absolute_percentage_error,
        mean_absolute_error,
        mean_squared_error,
    )
    # All three scores are computed before anything is printed.
    mape, mae, mse = [scorer(y_true, y_pred) for scorer in scorers]

    print(f"\tMean Absolute Percentage Error (MAPE):\t{mape}")
    print(f"\tMean Absolute Error (MAE):\t{mae}")
    print(f"\tMean Squared Error (MSE):\t{mse}")

    return mape, mae, mse

In [13]:
def run_models(models, X_train, y_train, X_test, y_test):
    """Fit every model, score it on the test data and tabulate the results.

    Args:
        models: Mapping of display name to an estimator exposing ``fit`` and
            ``predict``. The estimators are fitted in place.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target.

    Returns:
        A DataFrame with one row per model and the columns
        ``Model``, ``MAPE``, ``MAE`` and ``MSE``.
    """
    results = []

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f"Metrics for {model_name}:")
        mape, mae, mse = calc_metrics(y_test, y_pred)
        results.append((model_name, mape, mae, mse))

    # Transpose the collected rows into one list per output column.
    column_names = ("Model", "MAPE", "MAE", "MSE")
    metrics = {
        column: [row[position] for row in results]
        for position, column in enumerate(column_names)
    }
    return pd.DataFrame.from_dict(metrics)

In [14]:
import seaborn as sns

def boxplot_metrics(metrics_df):
    """Show one box plot per error metric, grouped by model.

    Args:
        metrics_df: DataFrame with a ``Model`` column naming each model and
            the numeric columns ``MAPE``, ``MAE`` and ``MSE``.

    Each metric is drawn as its own figure and displayed immediately.
    """
    for metric in ("MAPE", "MAE", "MSE"):
        # Group by the "Model" column: the metrics frame has no "Metric"
        # column, so using that name for the x axis cannot be resolved.
        ax = sns.boxplot(x="Model", y=metric, data=metrics_df)
        ax.set_title(f"{metric} for Different Models")
        plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Please read the documentation and try out different hyperparameters (C, gamma, ...)
models = {
    "Linear Regression": LinearRegression(),
    # n_jobs=-1 builds the 1000 trees on all available cores; with the fixed
    # random_state this is meant to change only the training time.
    "Random Forest": RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1),
    # NOTE(review): the recorded output of this cell ends at
    # "Training Support Vector Machine..." without any metrics, so the kernel SVR
    # fit on the full training set apparently did not finish. Consider fitting it
    # on a subsample or using a linear alternative -- confirm before re-running.
    "Support Vector Machine": SVR()
}

# Fit and score every model, then visualise the resulting metrics table.
metrics_df = run_models(models, X_train, y_train, X_test, y_test)
boxplot_metrics(metrics_df)

Training Linear Regression...
Metrics for Linear Regression:
	Mean Absolute Percentage Error (MAPE):	6391532306742.053
	Mean Absolute Error (MAE):	0.17998406745708823
	Mean Squared Error (MSE):	0.054279825619462305
Training Random Forest...
Metrics for Random Forest:
	Mean Absolute Percentage Error (MAPE):	2065665587694.439
	Mean Absolute Error (MAE):	0.11606559853543401
	Mean Squared Error (MSE):	0.026427907182957636
Training Support Vector Machine...


In [None]:
# Then select the best model and do the following:
from sklearn.feature_selection import SequentialFeatureSelector

# For each subset size, let the selector pick the features, then train and
# evaluate a fresh model on exactly those columns.
# SequentialFeatureSelector requires n_features_to_select to be smaller than the
# number of available features, so the upper bound comes from the data rather
# than a hard-coded constant.
for n_features in range(1, X_train.shape[1]):
    model = ... # create a new instance
    model_name = ... # set the model name
    sfs = SequentialFeatureSelector(model, n_features_to_select=n_features)
    sfs.fit(X_train, y_train)
    print(f"Selected features for {n_features} features: {sfs.get_support()}")

    # The selector only fits clones of `model`, leaving this instance unfitted.
    # Train it on the reduced training set and predict on the test set reduced
    # to the same columns.
    model.fit(sfs.transform(X_train), y_train)
    y_pred = model.predict(sfs.transform(X_test))

    print(f"Metrics for {model_name}:")
    mape, mae, mse = calc_metrics(y_test, y_pred)