## Random Forest V1.1
- specific datetime features ('hour', 'year', 'day_of_week', 'is_weekend')
- random cross validation

**Hyperparameter for tweeking:**
- maximale Baumtiefe
- Mindestanzahl an Daten pro Blatt
- Mindestanzahl an Daten pro Knoten
- Features

**Regularisierung:**
- Pruning

#### **_PREPARATION_**

In [1]:
# GET ALL THE JSONS INTO ONE DATAFRAME
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

In [2]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("..", "fine_dust_complete", ".csv")
df = pd.read_csv('../../data/fine_dust_complete.csv')

df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539422 entries, 0 to 542554
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   datetime     539422 non-null  object 
 1   station      539422 non-null  object 
 2   core         539422 non-null  object 
 3   value        539422 non-null  float64
 4   hour         539422 non-null  int64  
 5   day          539422 non-null  int64  
 6   month        539422 non-null  int64  
 7   year         539422 non-null  int64  
 8   day_of_week  539422 non-null  int64  
 9   is_weekend   539422 non-null  int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 45.3+ MB


In [3]:
df_pm10 = df[(df['core'] == 'pm10')]
df_pm10.head()

Unnamed: 0,datetime,station,core,value,hour,day,month,year,day_of_week,is_weekend
188328,2016-03-31 23:00:00,mc124,pm10,16.0,23,31,3,2016,3,0
188332,2016-03-31 22:00:00,mc124,pm10,22.0,22,31,3,2016,3,0
188336,2016-03-31 21:00:00,mc124,pm10,22.0,21,31,3,2016,3,0
188340,2016-03-31 20:00:00,mc124,pm10,24.0,20,31,3,2016,3,0
188344,2016-03-31 19:00:00,mc124,pm10,21.0,19,31,3,2016,3,0


#### **_ACTUAL MODEL TRAINING_**


#### Part 1: create random forest ensemble on everything

In [4]:
# create features and to be predicted value
y = df_pm10['value']
X = df_pm10[['hour', 'year', 'day_of_week', 'is_weekend']]

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

In [6]:
rf = RandomForestRegressor(max_depth=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.028647832981405164

In [7]:
rf.score(X_test, y_test)

0.01779184355488106

In [8]:
rf = RandomForestRegressor(max_depth=32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.073208021265873

In [9]:
rf.score(X_test, y_test)
# R2 score: vergleichen Modellvorhersagen mit der Abweichung vom Mittelwert, im Vergleich mit absoluten banalsten Mittelwert vorhersage
# d.h. Vergleicht die Vorhersage des Modells vs einfach den absoluten Mittelwert für die X vorherzusagen
# je höher desto besser

0.007129499347508728

In [10]:
#rf.estimator.tree_.max_depth

#### CROSS-VALIDATION & HYPERPARAMETER TUNING

In [11]:
# cross validate
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, root_mean_squared_error

# Create a RandomForestRegressor
random_forest = RandomForestRegressor(max_depth=30)

# Define your scoring functions
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}
# Perform cross-validation
# scores = cross_validate(decision_tree, X, y, cv=2, scoring=scorers, return_train_score=True)
scores = cross_validate(random_forest, X, y, cv=5, scoring=scorers, return_train_score=True)

# Print the results
# TODO use math-lib to turn numbers posivitve instead of "-1*" cuz it affects time oppositely 
print("Cross-validation scores:")
for scorer, score in scores.items():
    print(f"{scorer}: {-1*np.mean(score):.2f} (+/- {np.std(score):.2f})")

Cross-validation scores:
fit_time: -7.10 (+/- 0.19)
score_time: -0.30 (+/- 0.01)
test_mean_absolute_error: 9.33 (+/- 0.64)
train_mean_absolute_error: 8.51 (+/- 0.17)
test_median_absolute_error: 7.53 (+/- 0.63)
train_median_absolute_error: 6.73 (+/- 0.15)
test_root_mean_squared_error: 13.72 (+/- 0.53)
train_root_mean_squared_error: 12.88 (+/- 0.24)
test_mean_squared_error: 188.46 (+/- 14.91)
train_mean_squared_error: 166.04 (+/- 6.09)


- overfitting extreme: perfoms well on train data but terrible on test data

In [12]:
# average pm10 value for comparison to the error rates
average_pm10 = df_pm10['value'].mean()
print(f"Average PM10 value: {average_pm10:.2f}")

Average PM10 value: 22.70


In [13]:
median_pm10= df_pm10['value'].median()
print(f"Median PM10 value: {median_pm10:.2f}")

Median PM10 value: 20.00


In [14]:
#----------------------------------------------------
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, median_absolute_error, mean_squared_error
from sklearn.datasets import make_regression
import numpy as np

param_grid = {
    'max_depth': range(5, 50, 5),
    'min_samples_leaf': [5, 10, 15, 20, 25, 30, 40, 50],
    'max_features': ["sqrt", "log2", "auto"], 
    'random_state': [123]
}

# Define your scoring functions
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Initialize the RandomForestRegressor
regressor = RandomForestRegressor()

# Initialize GridSearchCV with the regressor and the parameter grid
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_median_absolute_error')

# Fit GridSearchCV to the data
grid_search.fit(X, y)

# Get the best estimator from the grid search
best_regressor = grid_search.best_estimator_

# Perform cross-validation with the best estimator and the scoring functions
scores = cross_validate(best_regressor, X, y, cv=5, scoring=scorers, return_train_score=True)

print("Cross-validation scores:")
for scorer, score in scores.items():
    print(f"{scorer}: {-1*np.mean(score):.2f} (+/- {np.std(score):.2f})")

KeyboardInterrupt: 

decent error values, no strong overfitting

In [None]:
best_regressor

In [None]:
# TODO run again with higher min_sample_leaf?