## Decision Tree V2
- all datetime feature
- random cross validation

**Hyperparameter for tweeking:**
- maximale Baumtiefe
- Mindestanzahl an Daten pro Blatt
- Mindestanzahl an Daten pro Knoten
- Features

**Regularisierung:**
- Pruning

**Metriken:**
- R2 score
- mean_absolute_error
- mean_squared_error
- median_absolute_error

#### **_PREPARATION_**

#### 1. FINE DUST (pm10 only) DATA PREP

In [1]:
# GET ALL THE JSONS INTO ONE DATAFRAME
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

In [2]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("..", "mc124_data", "*.json")
files = glob.glob(file_path_mc124)

# Create empty list to store dataframes
li_all_files = []

# Loop through list of files and read each one into a dataframe and append to list
for f in files:
    # Read in json
    temp_df = pd.read_json(f)
    # Append df to list
    li_all_files.append(temp_df)

# Optionally concatenate all dataframes into one if needed
if li_all_files:
    combined_df = pd.concat(li_all_files)
    print(f'Combined dataframe shape: {combined_df.shape}')
else:
    print('No dataframes were created.')

Combined dataframe shape: (542555, 6)


In [3]:
combined_df.sample()

Unnamed: 0,datetime,station,core,component,period,value
1632,2009-04-08 07:00:00+02:00,mc124,no2,no2_1h,1h,114.0


In [4]:
# FILTER FOR PM10 ONLY
combined_df_pm10 = combined_df[(combined_df['core'] == 'pm10')]
combined_df_pm10.sample(4)

Unnamed: 0,datetime,station,core,component,period,value
780,2016-09-22 20:00:00+02:00,mc124,pm10,pm10_1h,1h,22.0
2560,2016-04-04 07:00:00+02:00,mc124,pm10,pm10_1h,1h,48.0
324,2016-05-28 14:00:00+02:00,mc124,pm10,pm10_1h,1h,22.0
1950,2023-12-15 17:00:00+01:00,mc124,pm10,pm10_1h,1h,30.0


In [5]:
# FILTER BY PARTICLE AND ONLY KEEP THE DATETIME, STATION, PERIOD AND VALUE FEATURE SINCE THE REST ARE CONSTANT INFORMATION (station, core, component, period)
df_reduced = combined_df_pm10[['datetime', 'station', 'core', 'value']]
df_reduced.sample(3)

# CUT OFF THE TIMEZONE INFORMATION FROM THE DATETIME TO AVOID CONVERSION ISSUES DUE TO TIME CHANGE IN MARCH AND OCTOBER
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].astype(str).str.slice(0, 19)
#df_pm10_reduced.loc[:, 'datetime'] = pd.to_datetime(df_pm10_reduced['datetime'], format='mixed')
df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].dt.tz_localize(None)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72314 entries, 0 to 3650
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  72314 non-null  datetime64[ns]
 1   station   72314 non-null  object        
 2   core      72314 non-null  object        
 3   value     71471 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 2.8+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')


In [6]:
df_reduced.sample(3)

Unnamed: 0,datetime,station,core,value
1990,2019-08-15 09:00:00,mc124,pm10,27.0
2305,2023-04-11 18:00:00,mc124,pm10,10.0
2285,2020-06-11 22:00:00,mc124,pm10,34.0


In [7]:
#use loc to add it to every for every row
df_reduced['hour'] = df_reduced['datetime'].dt.strftime('%H')  # Hour (00-23)
df_reduced['day'] = df_reduced['datetime'].dt.strftime('%d')  # Day of the month (01-31)
df_reduced['month'] = df_reduced['datetime'].dt.strftime('%m')  # Month (01-12)
df_reduced['year'] = df_reduced['datetime'].dt.strftime('%Y')  # Month (01-12)
df_reduced.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['hour'] = df_reduced['datetime'].dt.strftime('%H')  # Hour (00-23)


Unnamed: 0,datetime,station,core,value,hour,day,month,year
2730,2019-08-09 05:00:00,mc124,pm10,20.0,5,9,8,2019
1070,2017-03-23 00:00:00,mc124,pm10,17.0,0,23,3,2017
3165,2018-10-05 14:00:00,mc124,pm10,25.0,14,5,10,2018


In [8]:
# add day of the week to dataframe
import calendar

days = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

df_daytime = df_reduced
# convert the 'day', 'month', and 'year' columns to integers
df_daytime['day'] = df_reduced['day'].astype(int)
df_daytime['month'] = df_reduced['month'].astype(int)
df_daytime['year'] = df_reduced['year'].astype(int)

# function to determine the day of the week
def get_day_of_week(row):
    return calendar.weekday(row['year'], row['month'], row['day'])

# function to determine if day is weekday or weekend 
def is_weekend(day_number):
    return 1 if day_number >= 5 else 0
    #day_number >= 5 ? 1 : 0
    
# apply the functions to create the new columns
df_daytime['day_of_week'] = df_daytime.apply(get_day_of_week, axis=1)
df_daytime['is_weekend'] = df_daytime['day_of_week'].apply(is_weekend)

df_daytime.rename(columns={'value': 'pm10_value'}, inplace=True)

df_daytime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72314 entries, 0 to 3650
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     72314 non-null  datetime64[ns]
 1   station      72314 non-null  object        
 2   core         72314 non-null  object        
 3   pm10_value   71471 non-null  float64       
 4   hour         72314 non-null  object        
 5   day          72314 non-null  int32         
 6   month        72314 non-null  int32         
 7   year         72314 non-null  int32         
 8   day_of_week  72314 non-null  int64         
 9   is_weekend   72314 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(2), object(3)
memory usage: 5.2+ MB


In [9]:
df_daytime.dropna(inplace=True)
df_daytime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71471 entries, 0 to 3650
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     71471 non-null  datetime64[ns]
 1   station      71471 non-null  object        
 2   core         71471 non-null  object        
 3   pm10_value   71471 non-null  float64       
 4   hour         71471 non-null  object        
 5   day          71471 non-null  int32         
 6   month        71471 non-null  int32         
 7   year         71471 non-null  int32         
 8   day_of_week  71471 non-null  int64         
 9   is_weekend   71471 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(2), object(3)
memory usage: 5.2+ MB


In [10]:
df_daytime.head()

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend
0,2016-03-31 23:00:00,mc124,pm10,16.0,23,31,3,2016,3,0
4,2016-03-31 22:00:00,mc124,pm10,22.0,22,31,3,2016,3,0
8,2016-03-31 21:00:00,mc124,pm10,22.0,21,31,3,2016,3,0
12,2016-03-31 20:00:00,mc124,pm10,24.0,20,31,3,2016,3,0
16,2016-03-31 19:00:00,mc124,pm10,21.0,19,31,3,2016,3,0


In [11]:
df_daytime.tail(50)

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend
3405,2024-05-03 01:00:00,mc124,pm10,33.0,1,3,5,2024,4,0
3410,2024-05-03 00:00:00,mc124,pm10,33.0,0,3,5,2024,4,0
3415,2024-05-02 23:00:00,mc124,pm10,28.0,23,2,5,2024,3,0
3420,2024-05-02 22:00:00,mc124,pm10,28.0,22,2,5,2024,3,0
3425,2024-05-02 21:00:00,mc124,pm10,31.0,21,2,5,2024,3,0
3430,2024-05-02 20:00:00,mc124,pm10,28.0,20,2,5,2024,3,0
3435,2024-05-02 19:00:00,mc124,pm10,35.0,19,2,5,2024,3,0
3440,2024-05-02 18:00:00,mc124,pm10,34.0,18,2,5,2024,3,0
3445,2024-05-02 17:00:00,mc124,pm10,28.0,17,2,5,2024,3,0
3450,2024-05-02 16:00:00,mc124,pm10,27.0,16,2,5,2024,3,0


#### 2. ADDING WIND DATA TO THE DATAFRAME

In [12]:
file_path_berlin = os.path.join("..", "..", "winddaten_berlin","produkt_wind_399_akt.txt")# join because different os use either \ or / as file path seperators
weather_station = pd.read_csv(file_path_berlin, names=['stations_id','date','quality_level','structure_version', 'wind_speed', 'wind_direction', 'eor'], skiprows=1, sep=';') 
# TODO rename names
weather_station.sample(5)

Unnamed: 0,stations_id,date,quality_level,structure_version,wind_speed,wind_direction,eor
32331,399,2019050309,2,0,10.2,290,eor
22814,399,2018033104,2,0,16.5,90,eor
15079,399,2017051100,2,0,2.1,10,eor
40975,399,2020110710,2,0,6.5,170,eor
19737,399,2017112123,2,0,7.2,210,eor


In [13]:
# Convert the 'dates' column to datetime and store it in a new column 'datetime'
weather_station.loc[:,'datetime'] = pd.to_datetime(weather_station['date'], format='%Y%m%d%H')
weather_station.sample(5)

Unnamed: 0,stations_id,date,quality_level,structure_version,wind_speed,wind_direction,eor,datetime
51372,399,2022012021,2,0,16.6,310,eor,2022-01-20 21:00:00
49295,399,2021102608,2,0,11.8,270,eor,2021-10-26 08:00:00
34745,399,2019122015,2,0,13.4,170,eor,2019-12-20 15:00:00
38714,399,2020080505,2,0,3.9,220,eor,2020-08-05 05:00:00
2354,399,2015112613,2,0,5.5,290,eor,2015-11-26 13:00:00


In [14]:
# Filter neccessary rows
weather_station_reduced = weather_station[['datetime','wind_speed','wind_direction']]
weather_station_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65861 entries, 0 to 65860
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        65861 non-null  datetime64[ns]
 1   wind_speed      65861 non-null  float64       
 2   wind_direction  65861 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 1.5 MB


In [15]:
weather_station_reduced.sample(5)

Unnamed: 0,datetime,wind_speed,wind_direction
19752,2017-11-22 14:00:00,12.1,230
45500,2021-05-21 05:00:00,15.0,230
13081,2017-02-16 18:00:00,8.2,270
604,2015-09-14 07:00:00,12.1,160
27443,2018-10-10 01:00:00,4.1,220


In [16]:
df_merged = pd.merge(df_reduced, weather_station_reduced, on='datetime', how='outer')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77137 entries, 0 to 77136
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        77137 non-null  datetime64[ns]
 1   station         71471 non-null  object        
 2   core            71471 non-null  object        
 3   pm10_value      71471 non-null  float64       
 4   hour            71471 non-null  object        
 5   day             71471 non-null  float64       
 6   month           71471 non-null  float64       
 7   year            71471 non-null  float64       
 8   day_of_week     71471 non-null  float64       
 9   is_weekend      71471 non-null  float64       
 10  wind_speed      65861 non-null  float64       
 11  wind_direction  65861 non-null  float64       
dtypes: datetime64[ns](1), float64(8), object(3)
memory usage: 7.1+ MB


In [17]:
df_merged.sample(5)

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction
63732,2023-07-22 00:00:00,mc124,pm10,16.0,0,22.0,7.0,2023.0,5.0,1.0,,
59967,2023-02-27 16:00:00,mc124,pm10,19.0,16,27.0,2.0,2023.0,0.0,0.0,,
68794,2024-02-21 04:00:00,mc124,pm10,21.0,4,21.0,2.0,2024.0,2.0,0.0,15.7,250.0
22382,2018-10-15 07:00:00,mc124,pm10,38.0,7,15.0,10.0,2018.0,0.0,0.0,13.2,180.0
63695,2023-07-23 13:00:00,mc124,pm10,15.0,13,23.0,7.0,2023.0,6.0,1.0,,


In [18]:
#Deleting all rows before March 2016 since useful pm10 data starts in March 2016
#TODO check when other particles where first measured
start_date = '2016-03-01'
start_date = pd.Timestamp(start_date)
df_filtered = df_merged[df_merged['datetime'] >= start_date]
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72507 entries, 0 to 77136
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        72507 non-null  datetime64[ns]
 1   station         71471 non-null  object        
 2   core            71471 non-null  object        
 3   pm10_value      71471 non-null  float64       
 4   hour            71471 non-null  object        
 5   day             71471 non-null  float64       
 6   month           71471 non-null  float64       
 7   year            71471 non-null  float64       
 8   day_of_week     71471 non-null  float64       
 9   is_weekend      71471 non-null  float64       
 10  wind_speed      61231 non-null  float64       
 11  wind_direction  61231 non-null  float64       
dtypes: datetime64[ns](1), float64(8), object(3)
memory usage: 7.2+ MB


In [19]:
df_filtered.dropna(inplace=True)
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60195 entries, 0 to 71470
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        60195 non-null  datetime64[ns]
 1   station         60195 non-null  object        
 2   core            60195 non-null  object        
 3   pm10_value      60195 non-null  float64       
 4   hour            60195 non-null  object        
 5   day             60195 non-null  float64       
 6   month           60195 non-null  float64       
 7   year            60195 non-null  float64       
 8   day_of_week     60195 non-null  float64       
 9   is_weekend      60195 non-null  float64       
 10  wind_speed      60195 non-null  float64       
 11  wind_direction  60195 non-null  float64       
dtypes: datetime64[ns](1), float64(8), object(3)
memory usage: 6.0+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.dropna(inplace=True)


In [20]:
df_filtered.head()

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction
0,2016-03-31 23:00:00,mc124,pm10,16.0,23,31.0,3.0,2016.0,3.0,0.0,8.3,40.0
1,2016-03-31 22:00:00,mc124,pm10,22.0,22,31.0,3.0,2016.0,3.0,0.0,8.4,40.0
2,2016-03-31 21:00:00,mc124,pm10,22.0,21,31.0,3.0,2016.0,3.0,0.0,7.5,40.0
3,2016-03-31 20:00:00,mc124,pm10,24.0,20,31.0,3.0,2016.0,3.0,0.0,9.1,60.0
4,2016-03-31 19:00:00,mc124,pm10,21.0,19,31.0,3.0,2016.0,3.0,0.0,8.6,40.0


In [21]:
df_filtered.tail()

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction
71466,2024-05-01 04:00:00,mc124,pm10,30.0,4,1.0,5.0,2024.0,2.0,0.0,16.1,130.0
71467,2024-05-01 03:00:00,mc124,pm10,29.0,3,1.0,5.0,2024.0,2.0,0.0,13.2,140.0
71468,2024-05-01 02:00:00,mc124,pm10,28.0,2,1.0,5.0,2024.0,2.0,0.0,13.2,140.0
71469,2024-05-01 01:00:00,mc124,pm10,28.0,1,1.0,5.0,2024.0,2.0,0.0,12.7,150.0
71470,2024-05-01 00:00:00,mc124,pm10,29.0,0,1.0,5.0,2024.0,2.0,0.0,14.1,160.0


#### **_ACTUAL MODEL TRAINING_**


#### Part 1: create decision tree on everything

In [34]:
# create features and to be predicted value
y = df_filtered['pm10_value']
X = df_filtered[['hour', 'day', 'month', 'year', 'day_of_week', 'is_weekend', 'wind_speed', 'wind_direction']]

# SCALING?

In [35]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

In [36]:
dt = DecisionTreeRegressor(max_depth=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

0.09226383585206144

In [37]:
dt.score(X_test, y_test)

0.09344963440594534

In [38]:
dt = DecisionTreeRegressor(max_depth=2000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

1.0

In [39]:
dt.score(X_test, y_test)

0.31068861293569483

In [41]:
# cross validate
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error

# scores = cross_validate(knn, data_scaled, iris.target, return_train_score=True, cv=5)
# scores
# train_scores = scores["train_score"]
# test_scores = scores["test_score"]
# train_scores.mean()
# test_scores.mean()
# test_scores.std()

#-------------------------------------------------------------------

# Create a DecisionTreeRegressor
decision_tree = DecisionTreeRegressor()

# Define your scoring functions
scorers = {
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
    'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False)
}

# Perform cross-validation
# scores = cross_validate(decision_tree, X, y, cv=5, scoring=scorers, return_train_score=True)
scores = cross_validate(decision_tree, X, y, cv=5, scoring=scorers)

# Print the results
print("Cross-validation scores:")
for scorer, score in scores.items():
    print(f"{scorer}: {np.mean(score):.2f} (+/- {np.std(score):.2f})")

Cross-validation scores:
fit_time: 0.22 (+/- 0.01)
score_time: 0.01 (+/- 0.00)
test_mean_absolute_error: -11.61 (+/- 0.65)
test_mean_squared_error: -316.71 (+/- 12.77)
test_median_absolute_error: -8.20 (+/- 0.75)


In [29]:
scores

{'fit_time': array([0.04698968, 0.03691483, 0.04752088, 0.0319562 , 0.04754853]),
 'score_time': array([0.        , 0.01561594, 0.        , 0.01510024, 0.        ]),
 'test_mean_absolute_error': array([ -9.35632579, -10.36344672,  -9.87811031,  -8.91270528,
         -9.00821467]),
 'train_mean_absolute_error': array([-8.70885117, -8.71851414, -8.96386713, -8.81761684, -8.99511768]),
 'test_mean_squared_error': array([-202.81350034, -194.99116575, -200.89173595, -173.68143511,
        -236.89498774]),
 'train_mean_squared_error': array([-179.96859116, -184.38581194, -181.55330981, -184.4160046 ,
        -171.83626287]),
 'test_median_absolute_error': array([-6.7       , -8.58823529, -9.        , -6.65853659, -7.1875    ]),
 'train_median_absolute_error': array([-6.86319073, -6.90384615, -7.07843137, -7.        , -7.18181818])}

## TODO: REMOVE CONSISTENTLY HEAVY OUTLIER
- EASTER
- CHRISTMAS
-> take one year and analyze month for month (daily data)
-> additional column: crazy_holiday


In [30]:
# average pm10 value for comparison to the error rates
average_pm10 = df_daytime['pm10_value'].mean()
print(f"Average PM10 value: {average_pm10:.2f}")

Average PM10 value: 22.70


In [31]:
print('rerun2')

rerun
