In [5]:
# GET ALL THE JSONS INTO ONE DATAFRAME
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

In [6]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("mc124_data", "*.json")
files = glob.glob(file_path_mc124)

# Create empty list to store dataframes
li_all_files = []

# Loop through list of files and read each one into a dataframe and append to list
for f in files:
    # Read in json
    temp_df = pd.read_json(f)
    # Append df to list
    li_all_files.append(temp_df)

# Optionally concatenate all dataframes into one if needed
if li_all_files:
    combined_df = pd.concat(li_all_files)
    print(f'Combined dataframe shape: {combined_df.shape}')
else:
    print('No dataframes were created.')

Combined dataframe shape: (542555, 6)


In [7]:
# FILTER BY PARTICLE AND ONLY KEEP THE DATETIME, STATION, PERIOD AND VALUE FEATURE SINCE THE REST ARE CONSTANT INFORMATION (station, core, component, period)
df_reduced = combined_df[['datetime', 'station', 'core', 'value']]
df_reduced.sample(3)

# CUT OFF THE TIMEZONE INFORMATION FROM THE DATETIME TO AVOID CONVERSION ISSUES DUE TO TIME CHANGE IN MARCH AND OCTOBER
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].astype(str).str.slice(0, 19)
#df_pm10_reduced.loc[:, 'datetime'] = pd.to_datetime(df_pm10_reduced['datetime'], format='mixed')
df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].dt.tz_localize(None)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 542555 entries, 0 to 3654
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   datetime  542555 non-null  datetime64[ns]
 1   station   542555 non-null  object        
 2   core      542555 non-null  object        
 3   value     539422 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 20.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')


In [8]:
#use loc to add it to every for every row
df_reduced['hour'] = df_reduced['datetime'].dt.strftime('%H')  # Hour (00-23)
df_reduced['day'] = df_reduced['datetime'].dt.strftime('%d')  # Day of the month (01-31)
df_reduced['month'] = df_reduced['datetime'].dt.strftime('%m')  # Month (01-12)
df_reduced['year'] = df_reduced['datetime'].dt.strftime('%Y')  # Month (01-12)
df_reduced.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['hour'] = df_reduced['datetime'].dt.strftime('%H')  # Hour (00-23)


Unnamed: 0,datetime,station,core,value,hour,day,month,year
2607,2019-08-10 06:00:00,mc124,no2,24.0,6,10,8,2019
509,2020-10-27 18:00:00,mc124,nox,108.0,18,27,10,2020
1028,2014-09-16 17:00:00,mc124,nox,20.0,17,16,9,2014


In [9]:
# add day of the week to dataframe
import calendar

days = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

df_daytime = df_reduced
# convert the 'day', 'month', and 'year' columns to integers
df_daytime['day'] = df_reduced['day'].astype(int)
df_daytime['month'] = df_reduced['month'].astype(int)
df_daytime['year'] = df_reduced['year'].astype(int)

# function to determine the day of the week
def get_day_of_week(row):
    return calendar.weekday(row['year'], row['month'], row['day'])

# function to determine if day is weekday or weekend 
def is_weekend(day_number):
    return 1 if day_number >= 5 else 0
    #day_number >= 5 ? 1 : 0
    
# apply the functions to create the new columns
df_daytime['day_of_week'] = df_daytime.apply(get_day_of_week, axis=1)
df_daytime['is_weekend'] = df_daytime['day_of_week'].apply(is_weekend)

df_daytime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 542555 entries, 0 to 3654
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   datetime     542555 non-null  datetime64[ns]
 1   station      542555 non-null  object        
 2   core         542555 non-null  object        
 3   pm10_value   539422 non-null  float64       
 4   hour         542555 non-null  object        
 5   day          542555 non-null  int32         
 6   month        542555 non-null  int32         
 7   year         542555 non-null  int32         
 8   day_of_week  542555 non-null  int64         
 9   is_weekend   542555 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(2), object(3)
memory usage: 39.3+ MB


In [10]:
file_path_berlin = os.path.join("..", "winddaten_berlin","produkt_wind_399_akt.txt")# join because different os use either \ or / as file path seperators
weather_station = pd.read_csv(file_path_berlin, names=['stations_id','date','quality_level','structure_version', 'wind_speed', 'wind_direction', 'eor'], skiprows=1, sep=';') 
weather_station.sample(5)

Unnamed: 0,stations_id,date,quality_level,structure_version,wind_speed,wind_direction,eor
17707,399,2017082909,2,0,6.0,150,eor
34205,399,2019072012,2,0,5.2,160,eor
53930,399,2022050711,2,0,5.8,260,eor
31475,399,2019032701,2,0,9.5,310,eor
25337,399,2018071407,2,0,5.7,310,eor


In [11]:
# Convert the 'dates' column to datetime and store it in a new column 'datetime'
weather_station.loc[:,'datetime'] = pd.to_datetime(weather_station['date'], format='%Y%m%d%H')
weather_station.sample(5)

Unnamed: 0,stations_id,date,quality_level,structure_version,wind_speed,wind_direction,eor,datetime
33178,399,2019060717,2,0,6.1,130,eor,2019-06-07 17:00:00
20265,399,2017121323,2,0,16.3,190,eor,2017-12-13 23:00:00
64406,399,2024041308,2,0,10.5,260,eor,2024-04-13 08:00:00
13641,399,2017031202,2,0,6.7,110,eor,2017-03-12 02:00:00
45747,399,2021053112,2,0,3.1,30,eor,2021-05-31 12:00:00


In [12]:
# Filter neccessary rows
weather_station_reduced = weather_station[['datetime','wind_speed','wind_direction']]
weather_station_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65861 entries, 0 to 65860
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        65861 non-null  datetime64[ns]
 1   wind_speed      65861 non-null  float64       
 2   wind_direction  65861 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 1.5 MB


In [13]:
df_merged = pd.merge(df_reduced, weather_station_reduced, on='datetime', how='outer')
# Convert object columns to numeric
df_merged['hour'] = pd.to_numeric(df_merged['hour'])
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542865 entries, 0 to 542864
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        542865 non-null  datetime64[ns]
 1   station         542555 non-null  object        
 2   core            542555 non-null  object        
 3   pm10_value      539422 non-null  float64       
 4   hour            542555 non-null  float64       
 5   day             542555 non-null  float64       
 6   month           542555 non-null  float64       
 7   year            542555 non-null  float64       
 8   day_of_week     542555 non-null  float64       
 9   is_weekend      542555 non-null  float64       
 10  wind_speed      311477 non-null  float64       
 11  wind_direction  311477 non-null  float64       
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 49.7+ MB


In [14]:
df_merged.head(3)

Unnamed: 0,datetime,station,core,pm10_value,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction
0,2009-01-31 23:00:00,mc124,no2,15.0,23.0,31.0,1.0,2009.0,5.0,1.0,,
1,2009-01-31 23:00:00,mc124,no,4.0,23.0,31.0,1.0,2009.0,5.0,1.0,,
2,2009-01-31 23:00:00,mc124,nox,21.0,23.0,31.0,1.0,2009.0,5.0,1.0,,


In [15]:
df_merged.to_csv("df_merged.csv", index=False)