#  <ins>Verkehrsuntersuchung - Machine Learning</ins>

## Initialisierung

In [1]:
import gzip
import os
from urllib.parse import quote_plus

import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopy.geocoders import Nominatim
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR
from sqlalchemy import create_engine#, text, MetaData, Table, Column, String

In [2]:
# The MySQL password is read from the MYSQL_PASSWORD environment variable so
# that it never has to be written into (and committed with) the notebook.
# When the variable is unset, the empty password of the original setup is used.
sql_password = os.environ.get('MYSQL_PASSWORD', '')

# quote_plus escapes characters such as '@', ':' or '/', which would otherwise
# be parsed as URL structure instead of being treated as part of the password.
engine = create_engine(
    'mysql+mysqlconnector://root:' + quote_plus(sql_password) + '@localhost:3306/verkehrsprojekt'
)

# Explicit connection handle. The queries visible in this notebook section go
# through `engine` directly; the handle is kept because later cells may use it.
connection = engine.connect()

## Machine Learning

#### Dataframe erstellen

In [3]:
# Car column: read `timestamp` and `Durchschnitt` (average) from the table
# `pkw_daten`, label the value column 'Anzahl PKW' and index by timestamp.

query = """
SELECT  
    timestamp, Durchschnitt
FROM 
    pkw_daten
"""
pkw_spalte = (
    pd.read_sql(query, engine)
    .rename(columns = {'Durchschnitt': 'Anzahl PKW'})
    .set_index('timestamp')
)
df = pkw_spalte  # `df` stays bound to the same frame, as before

In [4]:
# Bicycle column: read `timestamp` and `Durchschnitt` (average) from the table
# `fahrraddaten`, label the value column 'Anzahl Fahrräder' and index by timestamp.

query = """
SELECT  
    timestamp, Durchschnitt
FROM 
    fahrraddaten
"""
fahrrad_spalte = (
    pd.read_sql(query, engine)
    .rename(columns = {'Durchschnitt': 'Anzahl Fahrräder'})
    .set_index('timestamp')
)
df = fahrrad_spalte  # `df` stays bound to the same frame, as before

In [5]:
# Build the dataframe used for machine learning:
# weather columns + calendar features + car and bicycle counts.

berlin_holidays = holidays.Germany(state = 'BE')

df = (
    pd.read_csv('Wetterdaten_Bezirke_Durchschnitt.csv', decimal = '.')
    .assign(time = lambda d: pd.to_datetime(d['time']))
    # Rain and snowfall are merged into one column; the factor 10 turns cm into mm.
    # NOTE(review): this yields mm of snow, not a liquid-water equivalent —
    # confirm that this is the intended definition of precipitation.
    .assign(**{'precipitation (mm)': lambda d: d['rain (mm)'] + 10*d['snowfall (cm)']})
    .drop(columns = ['rain (mm)', 'snowfall (cm)'])
    .assign(
        dayofweek = lambda d: d['time'].dt.dayofweek,
        month = lambda d: d['time'].dt.month,
        hour = lambda d: d['time'].dt.hour,
        # The holiday calendar is queried once per timestamp; a plain
        # membership test yields the 0/1 flag directly.
        is_holiday = lambda d: d['time'].map(lambda t: int(t in berlin_holidays)),
    )
    .rename(columns = {'time': 'timestamp'})
    .set_index('timestamp')
)

# Attach the traffic counts by timestamp and add their sum as column `n`.
df = pd.concat([df, pkw_spalte, fahrrad_spalte], axis = 1)
df['n'] = df['Anzahl PKW'] + df['Anzahl Fahrräder']

# Normalisation: the scaler is created, but applying it is currently disabled.
scaler = MinMaxScaler()
#df[['relative_humidity_2m (%)','cloud_cover (%)', 'temperature_2m (°C)']] = scaler.fit_transform(df[['relative_humidity_2m (%)','cloud_cover (%)', 'temperature_2m (°C)']])

In [6]:
# Weather features, used as raw numeric columns.
# (Disabled variant that additionally lists squared terms:)
#wetter_features = df[['temperature_2m (°C)','temperature_2m (°C) ^2', 'relative_humidity_2m (%)', 'relative_humidity_2m (%) ^2', 'cloud_cover (%)', 'cloud_cover (%) ^2', 'precipitation (mm)', 'precipitation (mm) ^2']]
wetter_features = df[[
    'temperature_2m (°C)',
    'relative_humidity_2m (%)',
    'cloud_cover (%)',
    'precipitation (mm)',
]]

# Time features: weekday, month and hour are one-hot encoded in a single call.
# The resulting columns are dow_*, month_*, hour_* (in that order).
zeit_features = pd.get_dummies(
    df[['dayofweek', 'month', 'hour']],
    columns = ['dayofweek', 'month', 'hour'],
    prefix = ['dow', 'month', 'hour'],
    dtype = int,
)

In [7]:
# Drop NaN: keep only rows that have both a car count and a bicycle count.
df = df.dropna(subset = ['Anzahl PKW', 'Anzahl Fahrräder'])

In [8]:
# Exclude outliers: remove every row whose timestamp falls on one of the
# days listed below.
problem_tage = pd.to_datetime([
    '2018-04-25',
    '2019-07-28',
    '2019-10-20',
    '2021-12-13',
    '2023-01-30',
    '2023-05-15',
])
# normalize() sets the time of day to midnight so whole days can be matched.
filt = df.index.normalize().isin(problem_tage)
df = df.loc[~filt]

In [9]:
# Independent snapshot of the cleaned frame, taken while the columns still
# carry their original names.
df_unscaled = df.copy(deep = True)

#### Betrachte den Dataframe

In [10]:
# Preview the first rows of the assembled dataframe.
df.head()

Unnamed: 0_level_0,temperature_2m (°C),relative_humidity_2m (%),cloud_cover (%),precipitation (mm),dayofweek,month,hour,is_holiday,Anzahl PKW,Anzahl Fahrräder,n
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 00:00:00,11.066667,71.666667,61.333333,0.0,0,1,0,1,183.208,6.538462,189.746462
2018-01-01 01:00:00,11.141667,70.0,77.25,0.016667,0,1,1,1,357.316,10.730769,368.046769
2018-01-01 02:00:00,11.591667,64.5,92.416667,0.0,0,1,2,1,359.928,15.153846,375.081846
2018-01-01 03:00:00,11.825,62.083333,95.916667,0.0,0,1,3,1,284.856,13.269231,298.125231
2018-01-01 04:00:00,11.641667,63.416667,93.916667,0.0,0,1,4,1,225.944,8.115385,234.059385


In [11]:
#sns.pairplot(df, plot_kws={"s": 0.05})

In [12]:
# Sanity check: list rows whose temperature is missing (the stored output shows none).
df[df['temperature_2m (°C)'].isna()]

Unnamed: 0_level_0,temperature_2m (°C),relative_humidity_2m (%),cloud_cover (%),precipitation (mm),dayofweek,month,hour,is_holiday,Anzahl PKW,Anzahl Fahrräder,n
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [13]:
# Mapping from the internal column names to English display labels.
# The column `n` has no entry and therefore keeps its name.
renaming_dict = {
    # weather
    'temperature_2m (°C)': 'Temperature (°C)',
    'relative_humidity_2m (%)': 'rel. Humidity (%)',
    'cloud_cover (%)': 'Clouding Cover (%)',
    'precipitation (mm)': 'Precipitation (mm)',
    # calendar
    'dayofweek': 'Day of Week',
    'month': 'Month',
    'hour': 'Hour',
    'is_holiday': 'is Holiday',
    # traffic counts
    'Anzahl PKW': 'Car count',
    'Anzahl Fahrräder': 'Bike count',
}
df = df.rename(renaming_dict, axis = 'columns')

In [14]:
# Summary statistics of the car and bike count columns.
df[['Car count', 'Bike count']].describe()

Unnamed: 0,Car count,Bike count
count,51886.0,51886.0
mean,440.466693,90.083151
std,247.726517,79.933258
min,20.661597,0.115385
25%,187.627605,19.213942
50%,475.621792,70.423077
75%,657.805121,141.133333
max,923.629482,423.5


#### Automatisierte Analyse

In [15]:
"""wetter_features = df_unscaled[['temperature_2m (°C)', 'relative_humidity_2m (%)', 'cloud_cover (%)', 'precipitation (mm)']]
zeit_features = df_unscaled[['dayofweek', 'month', 'hour']]
zeit_features = pd.concat( [ zeit_features.drop('dayofweek', axis = 1), pd.get_dummies(zeit_features['dayofweek'], prefix = 'dow', dtype = int) ], axis = 1)
zeit_features = pd.concat( [ zeit_features.drop('month', axis = 1), pd.get_dummies(zeit_features['month'], prefix = 'month', dtype = int) ], axis = 1)
zeit_features = pd.concat( [ zeit_features.drop('hour', axis = 1), pd.get_dummies(zeit_features['hour'], prefix = 'hour', dtype = int) ], axis = 1)"""

"wetter_features = df_unscaled[['temperature_2m (°C)', 'relative_humidity_2m (%)', 'cloud_cover (%)', 'precipitation (mm)']]\nzeit_features = df_unscaled[['dayofweek', 'month', 'hour']]\nzeit_features = pd.concat( [ zeit_features.drop('dayofweek', axis = 1), pd.get_dummies(zeit_features['dayofweek'], prefix = 'dow', dtype = int) ], axis = 1)\nzeit_features = pd.concat( [ zeit_features.drop('month', axis = 1), pd.get_dummies(zeit_features['month'], prefix = 'month', dtype = int) ], axis = 1)\nzeit_features = pd.concat( [ zeit_features.drop('hour', axis = 1), pd.get_dummies(zeit_features['hour'], prefix = 'hour', dtype = int) ], axis = 1)"

In [16]:
# Automated evaluation: for every combination of splitting method, feature
# selection, vehicle type and algorithm, one regressor is fitted and its R^2 on
# the held-out data is printed. The active lists are narrowed to a single
# combination; the full option sets are kept in the trailing comments.

# Shared seed so that the random split and the tree ensembles give the same
# result on every run.
RANDOM_STATE = 42

# One factory per algorithm name, so each run gets a fresh, unfitted estimator.
model_factories = {
    'LinReg': LinearRegression,
    'GradientBoost': lambda: GradientBoostingRegressor(random_state = RANDOM_STATE),
    'SVR': lambda: SVR(kernel='rbf', C=100),
    'RandomForest': lambda: RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE, max_depth = 10, min_samples_leaf = 1, min_samples_split = 10),
}

for splitting_method in ['train_test_split']: #['train_test_split', 'vor/nach 1. Jan 2023']:
    for feature_auswahl in ['Wetter']:#['Wetter','Zeit','Wetter + Zeit']:
        for fahrzeug in ['Fahrräder']:#['PKW', 'Fahrräder']:
            for ML_Alg in ['LinReg']:#['LinReg', 'GradientBoost','SVR','RandomForest']:
                # Fail loudly on an unknown name instead of silently reusing
                # the model left over from a previous iteration.
                if ML_Alg not in model_factories:
                    raise ValueError(f"Unbekannter Algorithmus: {ML_Alg!r}")

                ziel_spalte = f'Anzahl {fahrzeug}'

                # Start from a column-less frame that only carries the index of
                # the cleaned data. The feature frames were built before the
                # NaN/outlier filtering; assigning them here aligns on the
                # index, so only the rows that survived the filtering are kept.
                # NOTE(review): this rebinds the global `df`, replacing the
                # frame with the English column labels — kept for
                # compatibility with cells that may follow.
                df = df_unscaled.drop(columns = df_unscaled.columns)
                # Substring test on purpose: 'Wetter + Zeit' enables both groups.
                use_weather_features = ('Wetter' in feature_auswahl)
                use_time_features = ('Zeit' in feature_auswahl)
                if use_weather_features:
                    df[wetter_features.columns] = wetter_features
                if use_time_features:
                    df[zeit_features.columns] = zeit_features
                df[[ziel_spalte]] = df_unscaled[[ziel_spalte]]

                X = df.drop(columns = [ziel_spalte])
                y = df[[ziel_spalte]]

                if splitting_method == 'train_test_split':
                    # Random 80/20 split, seeded for reproducibility.
                    X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state = RANDOM_STATE)
                else:
                    # Chronological split: train before 2023, test from 2023 on.
                    split_filt = ( y.index < '2023-01-01 00:00:00' )
                    X_train, y_train = X[split_filt], y[split_filt]
                    X_test, y_test = X[~split_filt], y[~split_filt]

                model = model_factories[ML_Alg]()

                # The target is passed as a 1-d Series, as the estimators expect.
                model.fit(X_train, y_train[ziel_spalte])
                y_pred = model.predict(X_test)
                r2 = r2_score(y_test, y_pred)

                print(f"Betrachtetes Fahrzeug: {fahrzeug}")
                print(f"Verwendeter Algorithmus: {ML_Alg}")
                print(f'Splitting- Methode: {splitting_method}')
                print(f"Verwendete Features: {feature_auswahl}")
                print(f"bester R^2: {r2}")
                print("")
                print("")

Betrachtetes Fahrzeug: Fahrräder
Verwendeter Algorithmus: LinReg
Splitting- Methode: train_test_split
Verwendete Features: Wetter
bester R^2: 0.3478314272354167


