## Decision Tree V1
- all datetime feature
- random cross validation

**Hyperparameter for tweeking:**
- maximale Baumtiefe
- Mindestanzahl an Daten pro Blatt
- Mindestanzahl an Daten pro Knoten
- Features

**Regularisierung:**
- Pruning

**Metriken:**
- mean_absolute_error
- mean_squared_error
- median_absolute_error

#### **_PREPARATION_**

In [1]:
# GET ALL THE JSONS INTO ONE DATAFRAME
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

In [2]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("..", "mc124_data", "*.json")
files = glob.glob(file_path_mc124)

# Create empty list to store dataframes
li_all_files = []

# Loop through list of files and read each one into a dataframe and append to list
for f in files:
    # Read in json
    temp_df = pd.read_json(f)
    # Append df to list
    li_all_files.append(temp_df)

# Optionally concatenate all dataframes into one if needed
if li_all_files:
    combined_df = pd.concat(li_all_files)
    print(f'Combined dataframe shape: {combined_df.shape}')
else:
    print('No dataframes were created.')

Combined dataframe shape: (542555, 6)


In [3]:
combined_df.sample()

Unnamed: 0,datetime,station,core,component,period,value
3675,2020-08-01 08:00:00+02:00,mc124,pm10,pm10_1h,1h,23.0


In [4]:
# FILTER FOR PM10 ONLY
combined_df_pm10 = combined_df[(combined_df['core'] == 'pm10')]
combined_df_pm10.sample(4)

Unnamed: 0,datetime,station,core,component,period,value
1095,2019-08-22 20:00:00+02:00,mc124,pm10,pm10_1h,1h,14.0
795,2023-02-22 08:00:00+01:00,mc124,pm10,pm10_1h,1h,24.0
1790,2020-01-17 01:00:00+01:00,mc124,pm10,pm10_1h,1h,27.0
1800,2018-06-15 23:00:00+02:00,mc124,pm10,pm10_1h,1h,22.0


In [5]:
# FILTER BY PARTICLE AND ONLY KEEP THE DATETIME, STATION, PERIOD AND VALUE FEATURE SINCE THE REST ARE CONSTANT INFORMATION (station, core, component, period)
df_reduced = combined_df_pm10[['datetime', 'station', 'core', 'value']]
df_reduced.sample(3)

# CUT OFF THE TIMEZONE INFORMATION FROM THE DATETIME TO AVOID CONVERSION ISSUES DUE TO TIME CHANGE IN MARCH AND OCTOBER
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].astype(str).str.slice(0, 19)
#df_pm10_reduced.loc[:, 'datetime'] = pd.to_datetime(df_pm10_reduced['datetime'], format='mixed')
df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].dt.tz_localize(None)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72314 entries, 0 to 3650
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  72314 non-null  datetime64[ns]
 1   station   72314 non-null  object        
 2   core      72314 non-null  object        
 3   value     71471 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 2.8+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')


In [6]:
df_reduced.sample(3)

Unnamed: 0,datetime,station,core,value
1125,2020-12-22 14:00:00,mc124,pm10,7.0
1684,2016-04-13 10:00:00,mc124,pm10,66.0
920,2020-08-24 07:00:00,mc124,pm10,17.0


In [7]:
#use loc to add it to every for every row
df_reduced['hour'] = df_reduced['datetime'].dt.strftime('%H')  # Hour (00-23)
df_reduced['day'] = df_reduced['datetime'].dt.strftime('%d')  # Day of the month (01-31)
df_reduced['month'] = df_reduced['datetime'].dt.strftime('%m')  # Month (01-12)
df_reduced['year'] = df_reduced['datetime'].dt.strftime('%Y')  # Month (01-12)
df_reduced.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['hour'] = df_reduced['datetime'].dt.strftime('%H')  # Hour (00-23)


Unnamed: 0,datetime,station,core,value,hour,day,month,year
3435,2022-09-02 08:00:00,mc124,pm10,12.0,8,2,9,2022
1110,2022-03-22 16:00:00,mc124,pm10,28.0,16,22,3,2022
1190,2022-04-21 01:00:00,mc124,pm10,14.0,1,21,4,2022


In [8]:
# add day of the week to dataframe
import calendar

days = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

df_daytime = df_reduced
# convert the 'day', 'month', and 'year' columns to integers
df_daytime['day'] = df_reduced['day'].astype(int)
df_daytime['month'] = df_reduced['month'].astype(int)
df_daytime['year'] = df_reduced['year'].astype(int)

# function to determine the day of the week
def get_day_of_week(row):
    return calendar.weekday(row['year'], row['month'], row['day'])

# function to determine if day is weekday or weekend 
def is_weekend(day_number):
    return 1 if day_number >= 5 else 0
    #day_number >= 5 ? 1 : 0
    
# apply the functions to create the new columns
df_daytime['day_of_week'] = df_daytime.apply(get_day_of_week, axis=1)
df_daytime['is_weekend'] = df_daytime['day_of_week'].apply(is_weekend)

df_daytime.rename(columns={'value': 'pm10_value'}, inplace=True)

df_daytime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72314 entries, 0 to 3650
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     72314 non-null  datetime64[ns]
 1   station      72314 non-null  object        
 2   core         72314 non-null  object        
 3   pm10_value   71471 non-null  float64       
 4   hour         72314 non-null  object        
 5   day          72314 non-null  int32         
 6   month        72314 non-null  int32         
 7   year         72314 non-null  int32         
 8   day_of_week  72314 non-null  int64         
 9   is_weekend   72314 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(2), object(3)
memory usage: 5.2+ MB


In [9]:
df_daytime.dropna(inplace=True)
df_daytime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71471 entries, 0 to 3650
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     71471 non-null  datetime64[ns]
 1   station      71471 non-null  object        
 2   core         71471 non-null  object        
 3   pm10_value   71471 non-null  float64       
 4   hour         71471 non-null  object        
 5   day          71471 non-null  int32         
 6   month        71471 non-null  int32         
 7   year         71471 non-null  int32         
 8   day_of_week  71471 non-null  int64         
 9   is_weekend   71471 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(2), object(3)
memory usage: 5.2+ MB


In [10]:
df_daytime.sample(5)

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend
2740,2018-04-08 03:00:00,mc124,pm10,21.0,3,8,4,2018,6,1
2785,2018-01-08 18:00:00,mc124,pm10,22.0,18,8,1,2018,0,0
2455,2023-08-11 12:00:00,mc124,pm10,14.0,12,11,8,2023,4,0
150,2020-04-29 17:00:00,mc124,pm10,30.0,17,29,4,2020,2,0
2500,2018-10-11 03:00:00,mc124,pm10,42.0,3,11,10,2018,3,0


#### **_ACTUAL MODEL TRAINING_**


#### Part 1: create decision tree on everything

In [11]:
# create features and to be predicted value
y = df_daytime['pm10_value']
X = df_daytime[['hour', 'year', 'day_of_week', 'is_weekend']]

# SCALING?

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

In [13]:
dt = DecisionTreeRegressor(max_depth=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

0.02627210844581085

In [14]:
dt.score(X_test, y_test)

0.01678084792372192

In [15]:
dt = DecisionTreeRegressor(max_depth=2000000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

0.07351349770706728

In [16]:
dt.score(X_test, y_test)

0.008188842585578615

In [17]:
dt.tree_.max_depth

17

In [23]:
# cross validate
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, root_mean_squared_error

# Create a DecisionTreeRegressor
decision_tree = DecisionTreeRegressor(max_depth=30)

# Define your scoring functions
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
    'root_mean_squared_error': make_scorer(root_mean_squared_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Perform cross-validation
# scores = cross_validate(decision_tree, X, y, cv=5, scoring=scorers, return_train_score=True)
scores = cross_validate(decision_tree, X, y, cv=5, scoring=scorers)

# Print the results
print("Cross-validation scores:")
for scorer, score in scores.items():
    print(f"{scorer}: {np.mean(score):.2f} (+/- {np.std(score):.2f})")

Cross-validation scores:
fit_time: 0.05 (+/- 0.01)
score_time: 0.02 (+/- 0.00)
test_mean_absolute_error: -9.31 (+/- 0.63)
test_median_absolute_error: -7.52 (+/- 0.61)
test_root_mean_squared_error: -13.70 (+/- 0.55)
test_mean_squared_error: -188.00 (+/- 15.35)


In [20]:
# average pm10 value for comparison to the error rates
average_pm10 = df_daytime['pm10_value'].mean()
print(f"Average PM10 value: {average_pm10:.2f}")

Average PM10 value: 22.70


In [24]:
median_pm10= df_daytime['pm10_value'].median()
print(f"Median PM10 value: {median_pm10:.2f}")

Median PM10 value: 20.00


In [21]:
print('rerun2')

rerun2
