## Random Forest V1
- all datetime features
- fine dust data (h)
- fine dust data (h-1)
- random cross validation

#### **_PREPARATION_**

In [1]:
# Imports used throughout the notebook
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json

In [2]:
# Path of the prepared hourly dataset, relative to this notebook.
# (The previous value of this variable pointed at "../fine_dust_complete/.csv"
# and was never used; the real path was hard-coded in the read call.)
file_path_mc124 = os.path.join("..", "..", "data", "df_h-1_complete_mc124.csv")

# Parse `datetime` while loading and drop every row that has a missing value.
# Chaining .dropna() instead of inplace=True keeps the cell idempotent on re-run.
df = pd.read_csv(file_path_mc124, parse_dates=['datetime']).dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53312 entries, 11946 to 65475
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetime            53312 non-null  datetime64[ns]
 1   station             53312 non-null  object        
 2   hour                53312 non-null  float64       
 3   day                 53312 non-null  float64       
 4   month               53312 non-null  float64       
 5   year                53312 non-null  float64       
 6   day_of_week         53312 non-null  float64       
 7   is_weekend          53312 non-null  float64       
 8   no2                 53312 non-null  float64       
 9   no                  53312 non-null  float64       
 10  nox                 53312 non-null  float64       
 11  pm10                53312 non-null  float64       
 12  pm2.5               53312 non-null  float64       
 13  wind_speed          53312 non-null  float64    

#### **_MODEL TRAINING_**

#### Part 1: add training features

In [3]:
# Predictors: calendar fields, the current-hour nitrogen-oxide readings,
# and the previous-hour (h-1) pollutant readings.
X = df[[
    'hour', 'day', 'month', 'year', 'day_of_week', 'is_weekend',
    'no2', 'no', 'nox',
    'no2_h-1', 'no_h-1', 'nox_h-1', 'pm10_h-1', 'pm2.5_h-1',
]]

# Value to be predicted: the PM10 reading of the same row.
y = df['pm10']

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

In [5]:
# Baseline: a deliberately shallow forest (max_depth=2).

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Seed the forest too: its bootstrap sampling is random, so without a seed
# the scores below would change on every run.
rf = RandomForestRegressor(max_depth=2, random_state=123)
rf.fit(X_train, y_train)

# R^2 on the training split
rf.score(X_train, y_train)

0.680017675849135

In [6]:
# R^2 of the fitted forest on the held-out test split
rf.score(X_test, y_test)

0.7346642397617615

In [None]:
# Same experiment with much deeper trees (max_depth=32).

# Identical split parameters as before, so the train/test rows are the same.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Seeded so that the reported scores are reproducible across runs.
rf = RandomForestRegressor(max_depth=32, random_state=123)
rf.fit(X_train, y_train)

# R^2 on the training split
rf.score(X_train, y_train)

In [None]:
rf.score(X_test, y_test)
# R2 score: compares the model's predictions with the most trivial baseline, i.e. always predicting the mean
# In other words: how much better the model does than simply predicting the overall mean for every X
# The higher, the better

In [None]:
# TODO: inspect the depth the fitted trees actually reach, e.g.
#   max(est.tree_.max_depth for est in rf.estimators_)
# (original note was `rf.estimator.tree_.max_depth`; the fitted trees live in `rf.estimators_`)

#### Part 2: CROSS-VALIDATION & HYPERPARAMETER TUNING

In [None]:
# Cross-validate a deep random forest on several error metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, root_mean_squared_error

# Seeded so that repeated runs produce the same fold scores.
random_forest = RandomForestRegressor(max_depth=30, random_state=123)

# greater_is_better=False makes every scorer return the NEGATED error,
# which is why the error entries are sign-flipped again when printed.
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# NOTE(review): an integer `cv` on a regressor gives unshuffled K-fold, i.e.
# contiguous blocks of rows rather than random ones - confirm this is intended.
scores = cross_validate(random_forest, X, y, cv=5, scoring=scorers, return_train_score=True)

# The result dict mixes two kinds of entries:
#   test_* / train_*       -> negated errors, flip the sign for display
#   fit_time / score_time  -> durations in seconds, must keep their sign
print("Cross-validation scores:")
for name, values in scores.items():
    sign = -1 if name.startswith(("test_", "train_")) else 1
    print(f"{name}: {sign*np.mean(values):.2f} (+/- {np.std(values):.2f})")

In [None]:
# Mean PM10 level, as a yardstick for judging the size of the error rates
average_pm10 = df.loc[:, 'pm10'].mean()
print(f"Average PM10 value: {average_pm10:.2f}")

In [None]:
# Median PM10 level (less sensitive to extreme readings than the mean)
median_pm10 = df.loc[:, 'pm10'].median()
print(f"Median PM10 value: {median_pm10:.2f}")

In [None]:
# Hyperparameter tuning: exhaustive grid search, then cross-validation of the winner
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, median_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.datasets import make_regression
import numpy as np

# 9 depths x 8 leaf sizes x 2 feature rules = 144 candidates, each fitted on 5 folds.
# random_state is a single-value "grid" entry so every candidate uses the same seed.
param_grid = {
    'max_depth': range(5, 50, 5),
    'min_samples_leaf': [5, 10, 15, 20, 25, 30, 40, 50],
    'max_features': ["sqrt", "log2"],
    'random_state': [123]
}

# greater_is_better=False makes every scorer return the NEGATED error,
# which is why the error entries are sign-flipped again when printed.
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Base estimator; its hyperparameters are filled in from param_grid
regressor = RandomForestRegressor()

# Candidates are ranked by (negated) median absolute error over 5 folds
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_median_absolute_error')

# Run the search on the full dataset
grid_search.fit(X, y)

# Estimator refitted with the best-scoring parameter combination
best_regressor = grid_search.best_estimator_

# NOTE(review): the search above and the evaluation below use the same rows, so
# these scores belong to hyperparameters chosen on that very data and are likely
# optimistic - consider tuning on X_train only and reporting on X_test.
scores = cross_validate(best_regressor, X, y, cv=5, scoring=scorers, return_train_score=True)

# The result dict mixes two kinds of entries:
#   test_* / train_*       -> negated errors, flip the sign for display
#   fit_time / score_time  -> durations in seconds, must keep their sign
print("Cross-validation scores v1:")
for name, values in scores.items():
    sign = -1 if name.startswith(("test_", "train_")) else 1
    print(f"{name}: {sign*np.mean(values):.2f} (+/- {np.std(values):.2f})")

In [None]:
# Display the estimator selected by the grid search
best_regressor

In [None]:
# End-of-notebook marker; presumably used to see when a long "Run All" has completed
print('finished')