In [12]:
from data_preprocessing import preprocess_time_series, closest_merra2
import pandas as pd
import numpy as np
from config import time_series_dir, openaq_dir
from sklearn.model_selection import train_test_split
import torch

# Extracting MERRA-2

In [2]:
# Variable names requested from preprocess_time_series, grouped by category.
# 13 aerosol variables.
aerosols_variables = [
    'DUEXTTAU',
    'SSSMASS25',
    'SSSMASS',
    'OCSMASS',
    'BCSMASS',
    'SSEXTTAU',
    'TOTEXTTAU',
    'BCEXTTAU',
    'SUEXTTAU',
    'OCEXTTAU',
    'SO4SMASS',
    'DUSMASS',
    'DUSMASS25',
]

# 12 meteorology variables.
meteorology_variables = [
    'U2M',
    'T500',
    'PS',
    'Q500',
    'T10M',
    'Q850',
    'V2M',
    'V10M',
    'T850',
    'U10M',
    'QV2M',
    'QV10M',
]

# Single surface-flux variable.
surface_flux_variables = [
    'PBLH',
]

In [3]:
# Build one frame per site and concatenate them in a single call.
# Calling pd.concat inside the loop copies the accumulated rows again on every
# iteration and seeds the result with an empty frame; collecting first avoids both.
site_frames = []

for site in closest_merra2:
    print(f'Processing {site}')
    site_df = preprocess_time_series('20240101', '20240901', site, surface_flux_variables)
    site_df['site'] = site  # tag every row with the site it was extracted for
    site_frames.append(site_df)
    print()

# pd.concat raises on an empty list, so fall back to an empty frame when
# closest_merra2 yields no sites (matches the previous result in that case).
df = pd.concat(site_frames) if site_frames else pd.DataFrame()

Processing Baghdad Airnow
Processing 2024-01-01
Processing 2024-01-02
Processing 2024-01-03
Processing 2024-01-04
Processing 2024-01-05
Processing 2024-01-06
Processing 2024-01-07
Processing 2024-01-08
Processing 2024-01-09
Processing 2024-01-10
Processing 2024-01-11
Processing 2024-01-12
Processing 2024-01-13
Processing 2024-01-14
Processing 2024-01-15
Processing 2024-01-16
Processing 2024-01-17
Processing 2024-01-18
Processing 2024-01-19
Processing 2024-01-20
Processing 2024-01-21
Processing 2024-01-22
Processing 2024-01-23
Processing 2024-01-24
Processing 2024-01-25
Processing 2024-01-26
Processing 2024-01-27
Processing 2024-01-28
Processing 2024-01-29
Processing 2024-01-30
Processing 2024-01-31
Processing 2024-02-01
Processing 2024-02-02
Processing 2024-02-03
Processing 2024-02-04
Processing 2024-02-05
Processing 2024-02-06
Processing 2024-02-07
Processing 2024-02-08
Processing 2024-02-09
Processing 2024-02-10
Processing 2024-02-11
Processing 2024-02-12
Processing 2024-02-13
Proces

In [4]:
# Display the concatenated per-site frame for a visual sanity check.
df

Unnamed: 0,PBLH,PBLH.1,PBLH.2,PBLH.3,PBLH.4,PBLH.5,PBLH.6,PBLH.7,PBLH.8,PBLH.9,...,PBLH.10,PBLH.11,PBLH.12,PBLH.13,PBLH.14,year,month,day,hour,site
0,62.335220,62.762482,62.694172,62.589195,62.412094,62.203041,62.243896,145.343033,661.304688,935.053772,...,96.713303,62.910095,62.747639,62.580589,62.440342,2024.0,1.0,1.0,0.0,Baghdad Airnow
1,62.263519,62.694172,62.589195,62.412094,62.203041,62.243896,145.343033,661.304688,935.053772,1073.057617,...,62.910095,62.747639,62.580589,62.440342,62.335220,2024.0,1.0,1.0,1.0,Baghdad Airnow
2,62.207428,62.589195,62.412094,62.203041,62.243896,145.343033,661.304688,935.053772,1073.057617,1145.553467,...,62.747639,62.580589,62.440342,62.335220,62.263519,2024.0,1.0,1.0,2.0,Baghdad Airnow
3,62.148602,62.412094,62.203041,62.243896,145.343033,661.304688,935.053772,1073.057617,1145.553467,1136.549194,...,62.580589,62.440342,62.335220,62.263519,62.207428,2024.0,1.0,1.0,3.0,Baghdad Airnow
4,62.080887,62.203041,62.243896,145.343033,661.304688,935.053772,1073.057617,1145.553467,1136.549194,906.478882,...,62.440342,62.335220,62.263519,62.207428,62.148602,2024.0,1.0,1.0,4.0,Baghdad Airnow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5875,64.312782,90.612373,153.561401,163.961914,259.934357,325.712677,404.232880,408.857544,871.300720,1683.571777,...,65.573708,65.196838,64.899078,64.658051,64.464760,2024.0,9.0,1.0,19.0,Uzbekistan
5876,510.946259,153.561401,163.961914,259.934357,325.712677,404.232880,408.857544,871.300720,1683.571777,1972.544312,...,65.196838,64.899078,64.658051,64.464760,64.312782,2024.0,9.0,1.0,20.0,Uzbekistan
5877,566.627747,163.961914,259.934357,325.712677,404.232880,408.857544,871.300720,1683.571777,1972.544312,2090.545898,...,64.899078,64.658051,64.464760,64.312782,510.946259,2024.0,9.0,1.0,21.0,Uzbekistan
5878,588.936096,259.934357,325.712677,404.232880,408.857544,871.300720,1683.571777,1972.544312,2090.545898,2159.520264,...,64.658051,64.464760,64.312782,510.946259,566.627747,2024.0,9.0,1.0,22.0,Uzbekistan


In [5]:
# Write the extracted frame to the surface-flux CSV for 2024 (row index omitted).
df.to_csv(time_series_dir + 'merra2_surface_flux_2024.csv', index=False)

# Merging

In [2]:
def merge_same_year(year: int):
    """Merge one year's aerosol, meteorology and surface-flux CSVs into one CSV.

    Reads ``merra2_aerosols_{year}.csv``, ``merra2_meteorology_{year}.csv`` and
    ``merra2_surface_flux_{year}.csv`` from ``time_series_dir``, checks that the
    three tables describe the same rows in the same order, and writes their
    column-wise concatenation to ``merra2_{year}.csv`` in the same directory.

    The tables are joined purely by row position, so every key column
    (``year``, ``month``, ``day``, ``hour`` and ``site``) as well as the row
    count must agree; otherwise features of different sites or times would be
    paired silently.

    Args:
        year: Calendar year used to build the input and output file names.

    Raises:
        AssertionError: If the tables differ in length or in any key column.
    """
    numeric_keys = ['year', 'month', 'day', 'hour']
    key_columns = numeric_keys + ['site']

    aerosols = pd.read_csv(time_series_dir + f'merra2_aerosols_{year}.csv')
    meteorology = pd.read_csv(time_series_dir + f'merra2_meteorology_{year}.csv')
    surface_flux = pd.read_csv(time_series_dir + f'merra2_surface_flux_{year}.csv')

    for label, other in (('meteorology', meteorology), ('surface_flux', surface_flux)):
        # Check the length first so the element-wise comparisons below are well defined.
        assert len(other) == len(aerosols), (
            f'{year}: {label} has {len(other)} rows, aerosols has {len(aerosols)}'
        )
        for column in numeric_keys:
            assert np.allclose(aerosols[column], other[column]), (
                f'{year}: column {column!r} of {label} is not aligned with aerosols'
            )
        # Sites are strings, so compare them exactly rather than with allclose.
        assert (aerosols['site'].to_numpy() == other['site'].to_numpy()).all(), (
            f'{year}: column \'site\' of {label} is not aligned with aerosols'
        )

    # Keep a single copy of the key columns: the one carried by surface_flux.
    aerosols = aerosols.drop(columns=key_columns)
    meteorology = meteorology.drop(columns=key_columns)

    df = pd.concat([aerosols, meteorology, surface_flux], axis=1)
    df.to_csv(time_series_dir + f'merra2_{year}.csv', index=False)
    

In [3]:
# Merge the per-category CSVs for every year from 2016 through 2024.
for year in range(2016, 2025):
    print(f'Merging {year}')
    merge_same_year(year)

Merging 2016
Merging 2017
Merging 2018
Merging 2019
Merging 2020
Merging 2021
Merging 2022
Merging 2023
Merging 2024


In [4]:
# Read every yearly CSV first, then concatenate once. Concatenating inside the
# loop re-copies all previously loaded rows on each iteration.
year_frames = []

for year in range(2016, 2025):
    print(f'Processing {year}')
    year_frames.append(pd.read_csv(time_series_dir + f'merra2_{year}.csv'))

# Row indexes are kept as read (not reset), as in the incremental version.
df = pd.concat(year_frames)

Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024


In [7]:
# Write the multi-year table to a single CSV (row index omitted).
df.to_csv(time_series_dir + 'merra2.csv', index=False)

# Joining MERRA-2 and OpenAQ

In [2]:
# Load the OpenAQ measurements and reshape them for joining:
# rename the measurement and site columns, split the UTC timestamp into
# year/month/day/hour parts, then drop the columns that are no longer needed.
openaq = (
    pd.read_csv(openaq_dir + 'openaq_integrated.csv')
    .rename(columns={'value': 'pm25', 'site_name': 'site'})
    .assign(utc=lambda frame: pd.to_datetime(frame['utc']))
    .assign(
        year=lambda frame: frame['utc'].dt.year,
        month=lambda frame: frame['utc'].dt.month,
        day=lambda frame: frame['utc'].dt.day,
        hour=lambda frame: frame['utc'].dt.hour,
    )
    .drop(columns=['local', 'parameter', 'units', 'site_id', 'utc'])
)
openaq.head()

Unnamed: 0,pm25,site,lat,lon,year,month,day,hour
0,31.0,Baghdad Airnow,33.3128,44.3615,2019,10,28,19
1,44.0,Baghdad Airnow,33.3128,44.3615,2019,10,28,21
2,27.0,Baghdad Airnow,33.3128,44.3615,2019,10,28,22
3,20.0,Baghdad Airnow,33.3128,44.3615,2019,10,29,0
4,15.0,Baghdad Airnow,33.3128,44.3615,2019,10,29,1


In [3]:
# Load the multi-year MERRA-2 table from merra2.csv and preview the first rows.
merra2 = pd.read_csv(time_series_dir + 'merra2.csv')
merra2.head()

Unnamed: 0,DUEXTTAU,SSSMASS25,SSSMASS,OCSMASS,BCSMASS,SSEXTTAU,TOTEXTTAU,BCEXTTAU,SUEXTTAU,OCEXTTAU,...,PBLH.20,PBLH.21,PBLH.22,PBLH.23,PBLH.24,year,month,day,hour,site
0,0.098922,1.853095e-09,9.473297e-09,9.713403e-10,1.241006e-09,0.014824,0.195017,0.005523,0.070548,0.005161,...,250.842392,73.665161,62.153397,68.188454,72.003738,2016.0,1.0,2.0,0.0,Baghdad Airnow
1,0.082291,1.662556e-09,8.800271e-09,1.069566e-09,1.377885e-09,0.015745,0.188352,0.00595,0.078841,0.005528,...,73.665161,62.153397,68.188454,72.003738,70.67395,2016.0,1.0,2.0,1.0,Baghdad Airnow
2,0.07317,1.557965e-09,9.044016e-09,1.163699e-09,1.527951e-09,0.016788,0.191389,0.006491,0.088972,0.005982,...,62.153397,68.188454,72.003738,70.67395,161.395004,2016.0,1.0,2.0,2.0,Baghdad Airnow
3,0.067989,1.521585e-09,9.422365e-09,1.259195e-09,1.67438e-09,0.018282,0.199021,0.007029,0.099227,0.006491,...,68.188454,72.003738,70.67395,161.395004,188.382263,2016.0,1.0,2.0,3.0,Baghdad Airnow
4,0.063905,1.528406e-09,9.524229e-09,1.34969e-09,1.824446e-09,0.019397,0.202212,0.007398,0.104635,0.006882,...,72.003738,70.67395,161.395004,188.382263,188.617386,2016.0,1.0,2.0,4.0,Baghdad Airnow


In [5]:
# Inner join: keep only measurements that have a MERRA-2 row for the same
# site and hour.
# NOTE(review): the previews show integer date parts in openaq and float ones
# in merra2 (e.g. 2016.0) -- confirm the key dtypes join as intended.
merged_df = openaq.merge(
    merra2,
    on=['year', 'month', 'day', 'hour', 'site'],
    how='inner',
)
merged_df.head()

Unnamed: 0,pm25,site,lat,lon,year,month,day,hour,DUEXTTAU,SSSMASS25,...,PBLH.15,PBLH.16,PBLH.17,PBLH.18,PBLH.19,PBLH.20,PBLH.21,PBLH.22,PBLH.23,PBLH.24
0,31.0,Baghdad Airnow,33.3128,44.3615,2019,10,28,19,0.1247,1.483386e-09,...,2017.950806,2173.932373,2255.919922,2058.425293,1756.417725,915.890442,1726.380737,1008.899048,166.705597,103.362801
1,44.0,Baghdad Airnow,33.3128,44.3615,2019,10,28,21,0.120642,1.51249e-09,...,2255.919922,2058.425293,1756.417725,915.890442,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997
2,27.0,Baghdad Airnow,33.3128,44.3615,2019,10,28,22,0.118813,1.522039e-09,...,2058.425293,1756.417725,915.890442,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997,65.02681
3,20.0,Baghdad Airnow,33.3128,44.3615,2019,10,29,0,0.117735,1.501121e-09,...,915.890442,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997,65.02681,64.909348,64.812714
4,15.0,Baghdad Airnow,33.3128,44.3615,2019,10,29,1,0.117703,1.436547e-09,...,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997,65.02681,64.909348,64.812714,64.73114


In [8]:
# Number of rows that survived the inner join.
len(merged_df)

350626

In [9]:
# Number of OpenAQ rows before the join, for comparison.
len(openaq)

358145

In [10]:
# Number of MERRA-2 rows before the join, for comparison.
len(merra2)

835824

In [6]:
# Write the joined table to CSV (row index omitted).
merged_df.to_csv(time_series_dir + 'merra2_openaq.csv', index=False)

# Converting to Numpy Files

In [3]:
# Reload the joined table, discard every row containing a missing value and
# renumber the remaining rows from zero.
df = (
    pd.read_csv(time_series_dir + 'merra2_openaq.csv')
    .dropna(axis=0)
    .reset_index(drop=True)
)
# Replace site names with integer category codes.
# NOTE(review): the name-to-code mapping is not stored anywhere in this cell.
df['site'] = df['site'].astype('category').cat.codes
df.head()

Unnamed: 0,pm25,site,lat,lon,year,month,day,hour,DUEXTTAU,SSSMASS25,...,PBLH.15,PBLH.16,PBLH.17,PBLH.18,PBLH.19,PBLH.20,PBLH.21,PBLH.22,PBLH.23,PBLH.24
0,31.0,0,33.3128,44.3615,2019,10,28,19,0.1247,1.483386e-09,...,2017.950806,2173.932373,2255.919922,2058.425293,1756.417725,915.890442,1726.380737,1008.899048,166.705597,103.362801
1,44.0,0,33.3128,44.3615,2019,10,28,21,0.120642,1.51249e-09,...,2255.919922,2058.425293,1756.417725,915.890442,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997
2,27.0,0,33.3128,44.3615,2019,10,28,22,0.118813,1.522039e-09,...,2058.425293,1756.417725,915.890442,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997,65.02681
3,20.0,0,33.3128,44.3615,2019,10,29,0,0.117735,1.501121e-09,...,915.890442,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997,65.02681,64.909348,64.812714
4,15.0,0,33.3128,44.3615,2019,10,29,1,0.117703,1.436547e-09,...,1726.380737,1008.899048,166.705597,103.362801,65.312035,65.157997,65.02681,64.909348,64.812714,64.73114


In [4]:
# Hold out 10% of the rows for validation with a fixed seed, then separate the
# pm25 target from the remaining columns in each split.
# NOTE(review): rows for adjacent hours appear to share most of their lagged
# values (see the shifted PBLH columns in the extraction preview), so a purely
# random split may leak information between train and val -- confirm this is
# intended.
train, val = train_test_split(df, test_size=0.1, random_state=497)
train_inputs, train_labels = train.drop(columns=['pm25']), train['pm25']
val_inputs, val_labels = val.drop(columns=['pm25']), val['pm25']

In [5]:
# Print the shape of each split: train inputs, train labels, val inputs, val labels.
for split in (train_inputs, train_labels, val_inputs, val_labels):
    print(split.shape)

(273348, 657)
(273348,)
(30372, 657)
(30372,)


In [5]:
# Write each split to its own .npy file, announcing every step before the save.
for message, filename, split in (
    ('Saving training inputs', 'train_inputs.npy', train_inputs),
    ('Saving training labels', 'train_labels.npy', train_labels),
    ('Saving validation inputs', 'val_inputs.npy', val_inputs),
    ('Saving validation labels', 'val_labels.npy', val_labels),
):
    print(message)
    np.save(time_series_dir + filename, split.to_numpy())

Saving training inputs
Saving training labels
Saving validation inputs
Saving validation labels


In [6]:
# Preview the first training input rows (pm25 has been removed from these columns).
train_inputs.head()

Unnamed: 0,site,lat,lon,year,month,day,hour,DUEXTTAU,SSSMASS25,SSSMASS,...,PBLH.15,PBLH.16,PBLH.17,PBLH.18,PBLH.19,PBLH.20,PBLH.21,PBLH.22,PBLH.23,PBLH.24
298030,10,41.3672,69.2725,2023,12,29,18,0.149939,5.379661e-10,1.909939e-09,...,792.852234,851.338013,823.593689,1088.35376,1298.860718,1378.358887,1477.325928,1108.336426,1001.822632,881.53717
37504,1,33.298722,44.395917,2021,3,5,5,0.733432,8.533382e-10,3.666224e-09,...,89.756927,69.49958,63.582596,63.136974,62.905594,62.738747,62.637955,104.166382,731.77478,333.955109
158923,5,29.292317,48.047679,2023,11,26,19,0.310924,4.238245e-09,2.133311e-08,...,892.606628,918.396912,933.519165,889.783447,391.843018,237.542007,149.930664,150.050171,145.184814,65.33316
67304,2,26.204697,50.570833,2018,9,26,17,0.043318,1.941953e-08,1.099543e-07,...,894.836121,1316.108276,1610.626221,1673.557861,1447.994263,849.441956,469.766113,492.387115,264.613403,213.030975
125082,5,29.292317,48.047679,2019,3,27,1,0.239741,7.028133e-10,3.414248e-09,...,1893.636963,2218.629395,2028.133789,2028.118652,2027.11853,2025.622559,1825.641113,1790.164062,1789.185791,1786.73938


In [7]:
# Preview the matching pm25 labels; the index values line up with train_inputs.
train_labels.head()

298030    200.0
37504      21.0
158923     34.0
67304      36.0
125082     38.0
Name: pm25, dtype: float64

In [8]:
# Columns 0-6 are site, lat, lon, year, month, day, hour; the next 13 columns
# should be the aerosol variables.
train_inputs.columns[7:20]

Index(['DUEXTTAU', 'SSSMASS25', 'SSSMASS', 'OCSMASS', 'BCSMASS', 'SSEXTTAU',
       'TOTEXTTAU', 'BCEXTTAU', 'SUEXTTAU', 'OCEXTTAU', 'SO4SMASS', 'DUSMASS',
       'DUSMASS25'],
      dtype='object')

In [9]:
# 7 + 13 * 25 = 332: expected start of the 12 meteorology variables, assuming
# 25 columns per aerosol variable.
train_inputs.columns[332:344]

Index(['U2M', 'T500', 'PS', 'Q500', 'T10M', 'Q850', 'V2M', 'V10M', 'T850',
       'U10M', 'QV2M', 'QV10M'],
      dtype='object')

In [10]:
# 332 + 12 * 25 = 632: expected position of the first surface-flux column,
# assuming 25 columns per meteorology variable.
train_inputs.columns[632]

'PBLH'

In [13]:
# Turn the first training row into a tensor for inspection.
# Select the row before converting: train_inputs.to_numpy()[0] would build the
# array for the whole frame just to read one row from it.
t = torch.tensor(train_inputs.iloc[0].to_numpy())
t

tensor([ 1.0000e+01,  4.1367e+01,  6.9272e+01,  2.0230e+03,  1.2000e+01,
         2.9000e+01,  1.8000e+01,  1.4994e-01,  5.3797e-10,  1.9099e-09,
         2.3659e-10,  7.6199e-11,  3.9662e-03,  1.7958e-01,  2.7017e-03,
         1.9667e-02,  3.3074e-03,  1.1133e-09,  2.4890e-07,  6.3184e-08,
         4.2961e-02,  1.7548e-10,  5.2887e-10,  4.2132e-10,  1.2352e-10,
         9.9027e-04,  7.3455e-02,  4.1342e-03,  1.7948e-02,  7.4159e-03,
         1.2262e-09,  2.6805e-08,  8.4474e-09,  6.2371e-02,  1.6581e-10,
         4.8453e-10,  4.4270e-10,  1.2540e-10,  1.3671e-03,  9.8974e-02,
         4.6884e-03,  2.2006e-02,  8.5338e-03,  1.1934e-09,  2.5189e-08,
         8.0072e-09,  9.0279e-02,  1.4956e-10,  4.2428e-10,  4.9909e-10,
         1.3159e-10,  1.9778e-03,  1.3535e-01,  5.2821e-03,  2.8021e-02,
         9.7600e-03,  1.1488e-09,  2.2919e-08,  7.2232e-09,  1.0532e-01,
         1.3881e-10,  3.9654e-10,  6.8781e-10,  1.5643e-10,  2.5255e-03,
         1.5832e-01,  5.7172e-03,  3.4167e-02,  1.0

In [24]:
# Layout of the flat row: 7 leading columns, then 25 consecutive groups of the
# 13 aerosol variables.
n_leading, n_variables, n_steps = 7, 13, 25
aerosol_values = t[n_leading:n_leading + n_variables * n_steps]
# Fold the flat slice into (batch, step, variable); -1 lets torch infer the batch size.
reshaped = aerosol_values.view(-1, n_steps, n_variables)
reshaped

tensor([[[1.4994e-01, 5.3797e-10, 1.9099e-09, 2.3659e-10, 7.6199e-11,
          3.9662e-03, 1.7958e-01, 2.7017e-03, 1.9667e-02, 3.3074e-03,
          1.1133e-09, 2.4890e-07, 6.3184e-08],
         [4.2961e-02, 1.7548e-10, 5.2887e-10, 4.2132e-10, 1.2352e-10,
          9.9027e-04, 7.3455e-02, 4.1342e-03, 1.7948e-02, 7.4159e-03,
          1.2262e-09, 2.6805e-08, 8.4474e-09],
         [6.2371e-02, 1.6581e-10, 4.8453e-10, 4.4270e-10, 1.2540e-10,
          1.3671e-03, 9.8974e-02, 4.6884e-03, 2.2006e-02, 8.5338e-03,
          1.1934e-09, 2.5189e-08, 8.0072e-09],
         [9.0279e-02, 1.4956e-10, 4.2428e-10, 4.9909e-10, 1.3159e-10,
          1.9778e-03, 1.3535e-01, 5.2821e-03, 2.8021e-02, 9.7600e-03,
          1.1488e-09, 2.2919e-08, 7.2232e-09],
         [1.0532e-01, 1.3881e-10, 3.9654e-10, 6.8781e-10, 1.5643e-10,
          2.5255e-03, 1.5832e-01, 5.7172e-03, 3.4167e-02, 1.0611e-02,
          1.1042e-09, 2.6354e-08, 7.5888e-09],
         [1.2009e-01, 1.6166e-10, 4.9249e-10, 8.6425e-10, 1.9543e

In [25]:
# Confirm the folded shape: one row, 25 groups, 13 variables per group.
reshaped.shape

torch.Size([1, 25, 13])