### Analysis wind information and pm10

In [1]:
# GET ALL THE JSONS INTO ONE DATAFRAME
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import glob

#### Get all the pm10 files and keep columns which contain pm10

In [2]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("mc124_data", "*.json")
files = glob.glob(file_path_mc124)

# Create empty list to store dataframes
li_all_files = []

# Loop through list of files and read each one into a dataframe and append to list
for f in files:
    # Read in json
    temp_df = pd.read_json(f)
    # Append df to list
    li_all_files.append(temp_df)

# Optionally concatenate all dataframes into one if needed
if li_all_files:
    combined_df = pd.concat(li_all_files)
    print(f'Combined dataframe shape: {combined_df.shape}')
else:
    print('No dataframes were created.')

Combined dataframe shape: (542555, 6)


In [3]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 542555 entries, 0 to 3654
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   datetime   542555 non-null  object 
 1   station    542555 non-null  object 
 2   core       542555 non-null  object 
 3   component  542555 non-null  object 
 4   period     542555 non-null  object 
 5   value      539422 non-null  float64
dtypes: float64(1), object(5)
memory usage: 29.0+ MB


In [4]:
# FILTER BY PARTICLE AND ONLY KEEP THE DATETIME, STATION, PERIOD AND VALUE FEATURE SINCE THE REST ARE CONSTANT INFORMATION (station, core, component, period)
df_reduced = combined_df[['datetime', 'station', 'core', 'value']]
df_reduced.sample(3)

Unnamed: 0,datetime,station,core,value
548,2022-04-26 10:00:00+02:00,mc124,no,12.0
743,2021-03-25T18:00:00+01:00,mc124,no,57.0
52,2015-12-31 06:00:00+01:00,mc124,no,4.0


In [5]:
# CUT OFF THE TIMEZONE INFORMATION FROM THE DATETIME TO AVOID CONVERSION ISSUES DUE TO TIME CHANGE IN MARCH AND OCTOBER
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].astype(str).str.slice(0, 19)
#df_pm10_reduced.loc[:, 'datetime'] = pd.to_datetime(df_pm10_reduced['datetime'], format='mixed')
df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')
df_reduced.loc[:, 'datetime'] = df_reduced['datetime'].dt.tz_localize(None)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 542555 entries, 0 to 3654
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   datetime  542555 non-null  datetime64[ns]
 1   station   542555 non-null  object        
 2   core      542555 non-null  object        
 3   value     539422 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 20.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['datetime'] = pd.to_datetime(df_reduced['datetime'], format='mixed')


In [6]:
df_reduced.sample(5)

Unnamed: 0,datetime,station,core,value
1888,2010-06-04 18:00:00,mc124,no,81.0
712,2015-04-21 02:00:00,mc124,no,19.0
292,2018-09-28 13:00:00,mc124,no2,41.0
2220,2014-10-01 03:00:00,mc124,no2,25.0
3551,2018-07-02 09:00:00,mc124,pm2,5.0


#### Load wind data and filter for wind speed and wind direction

##### Erklärung Windrichtung  
https://www.dwd.de/DE/service/lexikon/Functions/glossar.html?lv3=103182&lv2=102936  
Die Windrichtung wird bestimmt nach dem Polarwinkel (Azimut). Zur Richtungsangabe benutzt man die 360 Grad Skala des Kreises.   
Alle Richtungsangaben in Grad sind rechtweisend auf geographisch Nord bezogen, d.h.   
Ost  =  90 Grad,  
Süd  =  180 Grad,  
West =  270 Grad,   
Nord =  360 Grad.  

In [7]:
file_path_berlin = os.path.join("..", "winddaten_berlin","produkt_wind_399_akt.txt")# join because different os use either \ or / as file path seperators
weather_station = pd.read_csv(file_path_berlin, names=['stations_id','date','quality_level','structure_version', 'wind_speed', 'wind_direction', 'eor'], skiprows=1, sep=';') 
# TODO rename names
weather_station.sample(5)

Unnamed: 0,stations_id,date,quality_level,structure_version,wind_speed,wind_direction,eor
61489,399,2023121208,2,0,8.3,270,eor
1179,399,2015100806,2,0,10.3,110,eor
4773,399,2016030623,2,0,8.0,230,eor
27453,399,2018101011,2,0,6.8,130,eor
57404,399,2022092913,2,0,2.5,190,eor


In [8]:
# Convert the 'dates' column to datetime and store it in a new column 'datetime'
weather_station.loc[:,'datetime'] = pd.to_datetime(weather_station['date'], format='%Y%m%d%H')
weather_station.sample(5)

Unnamed: 0,stations_id,date,quality_level,structure_version,wind_speed,wind_direction,eor,datetime
7331,399,2016062203,2,0,6.8,220,eor,2016-06-22 03:00:00
5086,399,2016032000,2,0,10.2,280,eor,2016-03-20 00:00:00
33764,399,2019070203,2,0,11.8,280,eor,2019-07-02 03:00:00
52629,399,2022031406,2,0,18.2,150,eor,2022-03-14 06:00:00
44526,399,2021040618,2,0,11.7,270,eor,2021-04-06 18:00:00


In [9]:
# Filter neccessary rows
weather_station_reduced = weather_station[['datetime','wind_speed','wind_direction']]
weather_station_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65861 entries, 0 to 65860
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        65861 non-null  datetime64[ns]
 1   wind_speed      65861 non-null  float64       
 2   wind_direction  65861 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 1.5 MB


In [10]:
weather_station_reduced.sample(5)

Unnamed: 0,datetime,wind_speed,wind_direction
8175,2016-07-27 07:00:00,10.8,110
65148,2024-05-14 08:00:00,8.0,130
55062,2022-06-23 23:00:00,14.9,110
47789,2021-08-24 14:00:00,6.4,30
53904,2022-05-06 09:00:00,2.0,300


#### Merge df_reduced and weather_station_reduced based on datetime/DATETIME

In [11]:
df_merged = pd.merge(df_reduced, weather_station_reduced, on='datetime', how='outer')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542865 entries, 0 to 542864
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        542865 non-null  datetime64[ns]
 1   station         542555 non-null  object        
 2   core            542555 non-null  object        
 3   value           539422 non-null  float64       
 4   wind_speed      311477 non-null  float64       
 5   wind_direction  311477 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 24.9+ MB


In [12]:
df_merged.sample(5)

Unnamed: 0,datetime,station,core,value,wind_speed,wind_direction
157084,2014-12-09 23:00:00,mc124,no,71.0,,
382481,2020-10-27 15:00:00,mc124,pm2,8.0,10.1,210.0
463888,2022-08-18 04:00:00,mc124,no,2.0,5.3,200.0
118508,2013-07-27 19:00:00,mc124,nox,44.0,,
209742,2016-10-22 21:00:00,mc124,no,73.0,2.2,190.0


In [13]:
# FILTER OUT EVERYTHING BEFORE THE START OF THE WIND MEASUREMENTS

In [14]:
#Deleting all rows before March 2016 since useful pm10 data starts in March 2016
#TODO check when other particles where first measured
start_date = '2016-03-01'
start_date = pd.Timestamp(start_date)
df_filtered = df_merged[df_merged['datetime'] >= start_date]
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 354537 entries, 188328 to 542864
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        354537 non-null  datetime64[ns]
 1   station         354227 non-null  object        
 2   core            354227 non-null  object        
 3   value           352295 non-null  float64       
 4   wind_speed      297587 non-null  float64       
 5   wind_direction  297587 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 18.9+ MB


In [15]:
df_filtered.head()

Unnamed: 0,datetime,station,core,value,wind_speed,wind_direction
188328,2016-03-31 23:00:00,mc124,pm10,16.0,8.3,40.0
188329,2016-03-31 23:00:00,mc124,no2,31.0,8.3,40.0
188330,2016-03-31 23:00:00,mc124,no,8.0,8.3,40.0
188331,2016-03-31 23:00:00,mc124,nox,43.0,8.3,40.0
188332,2016-03-31 22:00:00,mc124,pm10,22.0,8.4,40.0


#### Create heatmap and scatter plots to visualise relationship betwen pm10 wind_speed and wind_direction

In [28]:
#df_correlation_matrix = df_filtered.drop('station')
df_filtered = df_filtered.drop(columns=['station'])
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 354537 entries, 188328 to 542864
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        354537 non-null  datetime64[ns]
 1   core            354227 non-null  object        
 2   value           352295 non-null  float64       
 3   wind_speed      297587 non-null  float64       
 4   wind_direction  297587 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 24.3+ MB


In [29]:
# Create correlation matrix
correlation_matrix = df_filtered.set_index('datetime').corr()
print("Correlation Matrix:")
print(correlation_matrix)

ValueError: could not convert string to float: 'pm10'

In [None]:
# show heatmap with seaborn
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# TODO y-data profiling

In [None]:
# Creating a seaborn plot
df_filtered = df_filtered.set_index('datetime').sort_index()

plt.figure(figsize=(10, 6))
# Plotting data
plt.plot(df_filtered['datetime'], df_filtered['pm10'], marker='o', linestyle='-', color='b', label='PM10') # datetime is 
plt.plot(df_filtered['datetime'], df_filtered['wind_speed'], marker='o', linestyle='-', color='r', label='Wind Speed')
plt.plot(df_filtered['datetime'], df_filtered['wind_direction'], marker='o', linestyle='-', color='y', label='Wind Direction')

# Adding annotations
for x, y1, y2, y3 in zip(df_filtered['datetime'], df_filtered['pm10'], df_filtered['wind_speed'], df_filtered['wind_direction']):
    plt.text(x, y1, f'{y1:.0f}', ha='center', va='bottom', fontsize=9)  # PM10 annotations
    plt.text(x, y2, f'{y2:.2f}', ha='center', va='bottom', fontsize=9)  # Wind Speed annotations
    plt.text(x, y3, f'{y3:.0f}', ha='center', va='bottom', fontsize=9)  # Wind Direction annotations

# Adding labels and title
plt.xlabel('Datetime')
plt.ylabel('Value')
plt.title('PM10, Wind Speed, and Wind Direction')
plt.legend()

# Adding a second y-axis for total counts (example)
#ax2 = plt.twinx()
#ax2.set_ylabel('Total Counts')
#ax2.plot(df_filtered['datetime'], df_filtered['wind_speed'], linestyle='--', color='g', label='Total Counts')
#ax2.legend(loc='upper right')

# Show plot
plt.grid(True)
plt.tight_layout()
plt.show()