## Decision Tree V1.1
- specific datetime features ('hour', 'year', 'day_of_week', 'is_weekend')
- random cross validation

**Hyperparameters for tweaking:**
- maximale Baumtiefe
- Mindestanzahl an Daten pro Blatt
- Mindestanzahl an Daten pro Knoten
- Features

**Regularisierung:**
- Pruning

#### **_PREPARATION_**

In [1]:
# Shared imports for the notebook (data handling, plotting, file access)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

In [2]:
# Locate the fine-dust CSV one directory above the notebook's working directory
file_path_mc124 = os.path.join(os.pardir, "fine_dust_complete.csv")

# Read the file and discard every row that has at least one missing value
df = pd.read_csv(file_path_mc124).dropna()

# Column dtypes / non-null counts as a quick sanity check of the load
df.info()

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\fine_dust_complete.csv'

In [3]:
# Restrict the frame to PM10 measurements and preview the first rows
is_pm10 = df['core'] == 'pm10'
df_pm10 = df[is_pm10]
df_pm10.head()

Unnamed: 0,datetime,station,core,value,hour,day,month,year,day_of_week,is_weekend
188328,2016-03-31 23:00:00,mc124,pm10,16.0,23,31,3,2016,3,0
188332,2016-03-31 22:00:00,mc124,pm10,22.0,22,31,3,2016,3,0
188336,2016-03-31 21:00:00,mc124,pm10,22.0,21,31,3,2016,3,0
188340,2016-03-31 20:00:00,mc124,pm10,24.0,20,31,3,2016,3,0
188344,2016-03-31 19:00:00,mc124,pm10,21.0,19,31,3,2016,3,0


#### **_ACTUAL MODEL TRAINING_**


#### Part 1: create decision tree on everything

In [4]:
# Feature matrix X (calendar-derived columns) and regression target y (measured value)
feature_columns = ['hour', 'year', 'day_of_week', 'is_weekend']
X = df_pm10[feature_columns]
y = df_pm10['value']

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

In [6]:
# Hold out 20 % of the rows (fixed seed), fit a depth-2 tree on the rest,
# and show its R² on the training split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
dt = DecisionTreeRegressor(max_depth=2).fit(X_train, y_train)
dt.score(X_train, y_train)

0.02627210844581085

In [7]:
# R² of the currently fitted tree on the held-out test split
dt.score(X_test, y_test)

0.01678084792372192

In [8]:
# Repeat the experiment with the depth limit raised to 32: same seeded 80/20 split,
# fit on the training part and show the training R²
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
dt = DecisionTreeRegressor(max_depth=32).fit(X_train, y_train)
dt.score(X_train, y_train)

0.07351349770706728

In [9]:
dt.score(X_test, y_test)
# R² score: compares the model's prediction error with the error of the most trivial
# baseline, which always predicts the mean of the target.
# I.e. it measures how much better the model is than simply predicting the overall mean for every X.

0.008188842585578615

In [10]:
# Depth the fitted tree actually reached (can be smaller than the max_depth limit passed in)
dt.tree_.max_depth

17

#### CROSS-VALIDATION HYPERPARAMETER TUNING

In [11]:
# Cross-validate a deep decision tree with several error metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, root_mean_squared_error

# Create a DecisionTreeRegressor
decision_tree = DecisionTreeRegressor(max_depth=30)

# Error metrics to report. greater_is_better=False makes make_scorer return the
# NEGATED error, so the values in the result dict are <= 0 and must be flipped
# back before printing.
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# 5-fold cross-validation on the full feature matrix / target
scores = cross_validate(decision_tree, X, y, cv=5, scoring=scorers)

# Print mean and spread per entry. Only the metric entries (test_* / train_*)
# were negated by the scorers; fit_time and score_time are plain seconds and
# must NOT be multiplied by -1 (doing so printed negative durations before).
print("Cross-validation scores:")
for entry_name, fold_values in scores.items():
    sign = -1 if entry_name.startswith(("test_", "train_")) else 1
    print(f"{entry_name}: {sign*np.mean(fold_values):.2f} (+/- {np.std(fold_values):.2f})")

Cross-validation scores:
fit_time: -0.03 (+/- 0.01)
score_time: -0.01 (+/- 0.01)
test_mean_absolute_error: 9.31 (+/- 0.63)
test_median_absolute_error: 7.52 (+/- 0.61)
test_root_mean_squared_error: 13.70 (+/- 0.55)
test_mean_squared_error: 188.00 (+/- 15.35)


In [12]:
# Mean PM10 value, as a yardstick for judging the size of the error metrics
pm10_values = df_pm10['value']
average_pm10 = pm10_values.mean()
print(f"Average PM10 value: {average_pm10:.2f}")

Average PM10 value: 22.70


In [13]:
# Median PM10 value, the counterpart to the median absolute error
pm10_values = df_pm10['value']
median_pm10 = pm10_values.median()
print(f"Median PM10 value: {median_pm10:.2f}")

Median PM10 value: 20.00


In [14]:
# Hyperparameter grid search for the decision tree, followed by a
# cross-validation of the selected tree with several error metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    median_absolute_error,
    mean_squared_error,
    root_mean_squared_error,  # used in `scorers` below; imported here so the cell is self-contained
)
from sklearn.datasets import make_regression
import numpy as np

# Candidate hyperparameters. random_state is a single-value entry: it is not
# tuned, it only fixes the seed of every candidate tree.
param_grid = {
    'max_depth': range(5, 35, 5),
    'min_samples_leaf': [5, 10, 15, 20, 25, 30, 40, 50],
    'max_features': ["sqrt", "log2"],  # "auto" is deprecated and not used for decision trees
    'random_state': [123]
}

# Error metrics to report. greater_is_better=False makes make_scorer return the
# NEGATED error, so the values in the result dict are <= 0 and must be flipped
# back before printing.
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Initialize the DecisionTreeRegressor
regressor = DecisionTreeRegressor()

# 5-fold grid search; candidates are ranked by (negated) median absolute error
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_median_absolute_error')

# Fit GridSearchCV to the data
grid_search.fit(X, y)

# Get the best estimator from the grid search
best_regressor = grid_search.best_estimator_

# Cross-validate the selected configuration with all metrics, train scores included.
# NOTE(review): the hyperparameters were chosen on these same rows, so the test_*
# numbers are likely somewhat optimistic; a separate hold-out set or nested CV
# would give an unbiased estimate.
scores = cross_validate(best_regressor, X, y, cv=5, scoring=scorers, return_train_score=True)

# Print mean and spread per entry. Only the metric entries (test_* / train_*)
# were negated by the scorers; fit_time and score_time are plain seconds and
# must NOT be multiplied by -1 (doing so printed negative durations before).
print("Cross-validation scores:")
for entry_name, fold_values in scores.items():
    sign = -1 if entry_name.startswith(("test_", "train_")) else 1
    print(f"{entry_name}: {sign*np.mean(fold_values):.2f} (+/- {np.std(fold_values):.2f})")

Cross-validation scores:
fit_time: -0.06 (+/- 0.00)
score_time: -0.01 (+/- 0.00)
test_mean_absolute_error: 9.16 (+/- 0.48)
train_mean_absolute_error: 8.52 (+/- 0.17)
test_median_absolute_error: 7.41 (+/- 0.43)
train_median_absolute_error: 6.73 (+/- 0.14)
test_root_mean_squared_error: 13.49 (+/- 0.73)
train_root_mean_squared_error: 12.92 (+/- 0.25)
test_mean_squared_error: 182.57 (+/- 19.87)
train_mean_squared_error: 166.89 (+/- 6.38)


In [15]:
# No overfitting, since the training and validation scores are close to each other

In [16]:
# Show the estimator (with its hyperparameters) that the grid search selected
best_regressor