## Decision Tree V1.1
- specific datetime features ('hour', 'year', 'day_of_week', 'is_weekend')
- random cross validation

**Hyperparameter for tweeking:**
- maximale Baumtiefe
- Mindestanzahl an Daten pro Blatt
- Mindestanzahl an Daten pro Knoten
- Features

**Regularisierung:**
- Pruning

#### **_PREPARATION_**

In [1]:
# GET ALL THE JSONS INTO ONE DATAFRAME
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

In [16]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("..", "..","fine_dust_complete", ".csv")
df = pd.read_csv('../../data/df_fine_dust_wind_merged.csv', parse_dates=['datetime'])

df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 309715 entries, 172992 to 542554
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        309715 non-null  datetime64[ns]
 1   station         309715 non-null  object        
 2   core            309715 non-null  object        
 3   pm10_value      309715 non-null  float64       
 4   hour            309715 non-null  float64       
 5   day             309715 non-null  float64       
 6   month           309715 non-null  float64       
 7   year            309715 non-null  float64       
 8   day_of_week     309715 non-null  float64       
 9   is_weekend      309715 non-null  float64       
 10  wind_speed      309715 non-null  float64       
 11  wind_direction  309715 non-null  float64       
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 30.7+ MB


In [17]:
df_pm10 = df[(df['core'] == 'pm10')]
df_pm10.tail()

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction
542530,2024-05-01 04:00:00,mc124,pm10,30.0,4.0,1.0,5.0,2024.0,2.0,0.0,16.1,130.0
542535,2024-05-01 03:00:00,mc124,pm10,29.0,3.0,1.0,5.0,2024.0,2.0,0.0,13.2,140.0
542540,2024-05-01 02:00:00,mc124,pm10,28.0,2.0,1.0,5.0,2024.0,2.0,0.0,13.2,140.0
542545,2024-05-01 01:00:00,mc124,pm10,28.0,1.0,1.0,5.0,2024.0,2.0,0.0,12.7,150.0
542550,2024-05-01 00:00:00,mc124,pm10,29.0,0.0,1.0,5.0,2024.0,2.0,0.0,14.1,160.0


In [19]:
# Create new columns for wind speed and wind direction of the previous hour
df_pm10 = df_pm10.copy()

df_pm10.loc[:, 'wind_speed_h-1'] = df_pm10.groupby(['station', 'core'])['wind_speed'].shift(-1)
df_pm10.loc[:, 'wind_direction_h-1'] = df_pm10.groupby(['station', 'core'])['wind_direction'].shift(-1)

# Verify the changes
df_pm10.tail()

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction,wind_speed_h-1,wind_direction_h-1
542530,2024-05-01 04:00:00,mc124,pm10,30.0,4.0,1.0,5.0,2024.0,2.0,0.0,16.1,130.0,13.2,140.0
542535,2024-05-01 03:00:00,mc124,pm10,29.0,3.0,1.0,5.0,2024.0,2.0,0.0,13.2,140.0,13.2,140.0
542540,2024-05-01 02:00:00,mc124,pm10,28.0,2.0,1.0,5.0,2024.0,2.0,0.0,13.2,140.0,12.7,150.0
542545,2024-05-01 01:00:00,mc124,pm10,28.0,1.0,1.0,5.0,2024.0,2.0,0.0,12.7,150.0,14.1,160.0
542550,2024-05-01 00:00:00,mc124,pm10,29.0,0.0,1.0,5.0,2024.0,2.0,0.0,14.1,160.0,,


In [23]:
df_pm10.dropna()
df_pm10.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60195 entries, 188328 to 542550
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetime            60195 non-null  datetime64[ns]
 1   station             60195 non-null  object        
 2   core                60195 non-null  object        
 3   pm10_value          60195 non-null  float64       
 4   hour                60195 non-null  float64       
 5   day                 60195 non-null  float64       
 6   month               60195 non-null  float64       
 7   year                60195 non-null  float64       
 8   day_of_week         60195 non-null  float64       
 9   is_weekend          60195 non-null  float64       
 10  wind_speed          60195 non-null  float64       
 11  wind_direction      60195 non-null  float64       
 12  wind_speed_h-1      60194 non-null  float64       
 13  wind_direction_h-1  60194 non-null  float64  

In [39]:
# export to new csv for following model training to avoid repetitive data manipulation
df_pm10.to_csv("df_fine_dust_wd_h-1.csv", index=False)

#### **_ACTUAL MODEL TRAINING_**


#### Part 1: create decision tree on everything

In [26]:
# create features and to be predicted value
y = df_pm10['pm10_value']
X = df_pm10[['hour', 'day', 'month', 'year', 'day_of_week', 'is_weekend', 'wind_speed_h-1', 'wind_direction_h-1']]

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

In [28]:
dt = DecisionTreeRegressor(max_depth=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

0.09415877256238603

In [29]:
dt.score(X_test, y_test)

0.09873570133150356

In [30]:
dt = DecisionTreeRegressor(max_depth=32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

0.9999973143682909

In [31]:
dt.score(X_test, y_test)
# R2 score: vergleichen Modellvorhersagen mot der Abweichung vom Mittelwert, im Vergleich mit absoluten banalsten Mittelwert vorhersage
# d.h. Vergleicht die Vorhersage des Modells vs einfach den absoluten Mittelwert für die X vorherzusagen

0.08084162817193197

In [32]:
dt.tree_.max_depth

32

#### CROSS-VALIDATION HYPERPARAMETER TUNING

In [33]:
# cross validate
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, root_mean_squared_error

# Create a DecisionTreeRegressor
decision_tree = DecisionTreeRegressor(max_depth=30)

# Define your scoring functions
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}
# Perform cross-validation
# scores = cross_validate(decision_tree, X, y, cv=2, scoring=scorers, return_train_score=True)
scores = cross_validate(decision_tree, X, y, cv=5, scoring=scorers)

# Print the results
print("Cross-validation scores:")
for scorer, score in scores.items():
    print(f"{scorer}: {-1*np.mean(score):.2f} (+/- {np.std(score):.2f})")

Cross-validation scores:
fit_time: -0.61 (+/- 0.03)
score_time: -0.01 (+/- 0.01)
test_mean_absolute_error: 11.34 (+/- 0.41)
test_median_absolute_error: 7.80 (+/- 0.75)
test_root_mean_squared_error: 17.40 (+/- 0.71)
test_mean_squared_error: 303.36 (+/- 24.85)


In [35]:
# average pm10 value for comparison to the error rates
average_pm10 = df_pm10['pm10_value'].mean()
print(f"Average PM10 value: {average_pm10:.2f}")

Average PM10 value: 23.22


In [36]:
median_pm10= df_pm10['pm10_value'].median()
print(f"Median PM10 value: {median_pm10:.2f}")

Median PM10 value: 20.00


In [37]:
#----------------------------------------------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.datasets import make_regression
import numpy as np

param_grid = {
    'max_depth': range(5, 35, 5),
    'min_samples_leaf': [5, 10, 15, 20, 25, 30, 40, 50],
    'max_features': ["sqrt", "log2"],  # "auto" is deprecated and not used for decision trees
    'random_state': [123]
}

# Define your scoring functions
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Initialize the DecisionTreeRegressor
regressor = DecisionTreeRegressor()

# Initialize GridSearchCV with the regressor and the parameter grid
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_median_absolute_error')

# Fit GridSearchCV to the data
grid_search.fit(X, y)

# Get the best estimator from the grid search
best_regressor = grid_search.best_estimator_

# Perform cross-validation with the best estimator and the scoring functions
scores = cross_validate(best_regressor, X, y, cv=5, scoring=scorers, return_train_score=True)

print("Cross-validation scores:")
for scorer, score in scores.items():
    print(f"{scorer}: {-1*np.mean(score):.2f} (+/- {np.std(score):.2f})")

Cross-validation scores:
fit_time: -0.03 (+/- 0.01)
score_time: -0.00 (+/- 0.01)
test_mean_absolute_error: 8.72 (+/- 0.24)
train_mean_absolute_error: 6.95 (+/- 0.09)
test_median_absolute_error: 6.34 (+/- 0.17)
train_median_absolute_error: 5.12 (+/- 0.09)
test_root_mean_squared_error: 13.70 (+/- 0.80)
train_root_mean_squared_error: 10.95 (+/- 0.21)
test_mean_squared_error: 188.47 (+/- 22.48)
train_mean_squared_error: 119.98 (+/- 4.64)


In [15]:
# kein Overfitting, da train und validate nah aneinander dran sind

In [38]:
best_regressor