<a href="https://colab.research.google.com/github/Bibhash123/Estimation-of-NO2-Concentration-for-Sparse-Data/blob/main/Data-Preparation/Dataset_Preparation_GRSL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Creation Pipeline

In [None]:
# Environment setup: remove any installed shapely/cartopy and reinstall both
# with `--no-binary`, i.e. built from source instead of from prebuilt wheels.
from IPython.display import clear_output
!pip uninstall -y shapely
!pip uninstall -y cartopy
!pip install shapely cartopy --no-binary shapely --no-binary cartopy
# Wipe the pip log from the cell output once installation has finished.
clear_output(wait=False)

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm

In [None]:
# Fetch the NO2 dataset repository, unpack its archive into /content/, and
# remove the clone and the archive afterwards.
# NOTE(review): the clone and the first `rm` use relative paths while the other
# commands use /content/ — this assumes the working directory is /content.
!git clone https://github.com/HSG-AIML/NO2-dataset.git
!cp "/content/NO2-dataset/no2_dataset.tar.gz" "/content/"
!tar -xvf "/content/no2_dataset.tar.gz" -C "/content/"
!rm -r "NO2-dataset"
!rm "/content/no2_dataset.tar.gz"

Cloning into 'NO2-dataset'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 42 (delta 17), reused 25 (delta 8), pack-reused 5[K
Unpacking objects: 100% (42/42), done.
no2_dataset.csv


In [None]:
# Load the extracted measurements table and preview its first rows.
no2_csv_path = "no2_dataset.csv"
no2_data = pd.read_csv(no2_csv_path)
no2_data.head(5)

Unnamed: 0,AirQualityStation,SatelliteDatetime,TroposphericNO2ColumnNumberDensity,SatelliteLongitude,SatelliteLatitude,Source,geometry,SurfaceConcentration,SurfaceDatetime,Temperature,Precipitation,Countrycode,Altitude,AreaType,StationType,Weekday,Hour,Month,SurfaceLongitude,SurfaceLatitude,PopulationDensity,SensingTimeDiff,Lockdown
0,STA.DE_DEBW087,2018-10-17 11:35:23.253999949,5.7e-05,9.25,48.35,S5P_OFFL_L2__NO2____20181017T103438_20181017T1...,POINT (9.250000000000002 48.35),3.7,2018-10-17 12:00:00+01:00,11.62,0.0,DE,798.0,rural-regional,background,2,11,10,9.207639,48.345778,262.1,2123,
1,STA.DE_DEBW087,2018-10-17 13:16:52.555000067,0.000105,9.25,48.35,S5P_OFFL_L2__NO2____20181017T121608_20181017T1...,POINT (9.250000000000002 48.35),3.5,2018-10-17 13:00:00+01:00,11.52,0.0,DE,798.0,rural-regional,background,2,13,10,9.207639,48.345778,262.1,4612,
2,STA.DE_DEBW087,2018-10-18 11:16:21.141000032,8.5e-05,9.25,48.35,S5P_OFFL_L2__NO2____20181018T101530_20181018T1...,POINT (9.250000000000002 48.35),6.5,2018-10-18 11:00:00+01:00,11.52,0.0,DE,798.0,rural-regional,background,3,11,10,9.207639,48.345778,262.1,4581,
3,STA.DE_DEBW087,2018-10-18 12:57:50.681999922,7.5e-05,9.25,48.35,S5P_OFFL_L2__NO2____20181018T115659_20181018T1...,POINT (9.250000000000002 48.35),3.7,2018-10-18 13:00:00+01:00,9.139999,0.0,DE,798.0,rural-regional,background,3,12,10,9.207639,48.345778,262.1,3470,
4,STA.DE_DEBW087,2018-10-19 10:57:21.230000019,6.2e-05,9.25,48.35,S5P_OFFL_L2__NO2____20181019T095621_20181019T1...,POINT (9.250000000000002 48.35),8.0,2018-10-19 11:00:00+01:00,9.139999,0.0,DE,798.0,rural-regional,background,4,10,10,9.207639,48.345778,262.1,3441,


In [None]:
# Earliest and latest satellite timestamps (compared as raw strings).
sat_times = no2_data["SatelliteDatetime"]
(min(sat_times), max(sat_times))

('2018-10-17 11:35:23.253999949', '2020-11-30 11:03:06.071000099')

In [None]:
# Earliest and latest ground-station timestamps (compared as raw strings).
surface_times = no2_data["SurfaceDatetime"]
(min(surface_times), max(surface_times))

('2018-10-17 11:00:00+01:00', '2020-11-30 14:00:00+01:00')

In [None]:
# Bounding box of the satellite grid: first line is (min lon, min lat),
# second line is (max lon, max lat).
sat_lon = no2_data["SatelliteLongitude"]
sat_lat = no2_data["SatelliteLatitude"]
for bound in (min, max):
  print(bound(sat_lon), bound(sat_lat))

-9.45 35.05
39.65000000000001 69.65


In [None]:
def haversine(lo,la,lon,lat):
  """Great-circle distance in meters between (lo, la) and (lon, lat).

  All arguments are in degrees. Each may be a scalar (anything `float`-like,
  including numeric strings) or a NumPy-compatible array; arrays are
  broadcast against each other, so one reference point can be compared with
  many candidate points in a single call.

  Parameters
  ----------
  lo, la : longitude and latitude of the first point.
  lon, lat : longitude and latitude of the second point.

  Returns
  -------
  Distance in meters: a NumPy float for scalar inputs, an array otherwise.
  """
  R=6371000#radius of the earth in meters
  lo = np.asarray(lo, dtype=float)
  la = np.asarray(la, dtype=float)
  lon = np.asarray(lon, dtype=float)
  lat = np.asarray(lat, dtype=float)
  lat1=np.radians(la)
  lat2=np.radians(lat)
  half_dlat=np.radians(lat-la)/2
  half_dlon=np.radians(lon-lo)/2
  a=np.sin(half_dlat)**2+np.cos(lat1)*np.cos(lat2)*np.sin(half_dlon)**2
  # Rounding can push `a` marginally outside [0, 1], which would make
  # arcsin return NaN; clamp before taking the root.
  a=np.clip(a,0.0,1.0)
  c=2*np.arcsin(np.sqrt(a))
  return R*c

def getNearest(lon,lat,df):
  """Find the row of `df` closest to the point (lon, lat).

  Parameters
  ----------
  lon, lat : coordinates of the query point in degrees.
  df : DataFrame with "SatelliteLongitude" and "SatelliteLatitude" columns.

  Returns
  -------
  (j, d) : positional index of the nearest row and its haversine distance in
  meters. Ties resolve to the first such row. When `df` has no rows, or no
  row yields a usable distance (all NaN), returns (-1, 1e15).
  """
  lons = df["SatelliteLongitude"].to_numpy(dtype=float)
  lats = df["SatelliteLatitude"].to_numpy(dtype=float)
  if lons.size == 0:
    return -1, 1e15
  # One vectorised distance computation instead of a Python loop over rows.
  distances = np.asarray(haversine(lon,lat,lons,lats), dtype=float)
  # Rows with missing coordinates must never win the comparison.
  distances = np.where(np.isnan(distances), np.inf, distances)
  j = int(np.argmin(distances))
  if not distances[j] < 1e15:
    return -1, 1e15
  return j, distances[j]

In [None]:
def getPatches(sat_inp, grnd_inp, output_shape):
  """Yield aligned (satellite, ground) tiles cut from two 2-D arrays.

  The grid is walked row-major in steps of `output_shape`. The extent of the
  walk is taken from `sat_inp`; `grnd_inp` is sliced with the same indices.
  Tiles on the bottom/right edge are smaller than `output_shape` when the
  array size is not an exact multiple of the tile size.

  Parameters
  ----------
  sat_inp, grnd_inp : arrays supporting `.shape` and 2-D slicing.
  output_shape : (rows, cols) of each tile.

  Yields
  ------
  (sat_patch, grnd_patch) : the two slices covering the same window.
  """
  patch_rows, patch_cols = output_shape[0], output_shape[1]
  # The original referenced an undefined name `inp` here, which raised
  # NameError as soon as the generator was iterated.
  n_rows, n_cols = sat_inp.shape[0], sat_inp.shape[1]
  for top in range(0, n_rows, patch_rows):
    for left in range(0, n_cols, patch_cols):
      window = (slice(top, top + patch_rows), slice(left, left + patch_cols))
      yield sat_inp[window], grnd_inp[window]

def prepareData(df,resolution = 0.05,savedir = "/content/Data/",output_shape = (49,67),patch_no = (1,1), num_days = 100):
  """Rasterise one grid patch of the NO2 table into per-day .npy files.

  For each of the first `num_days` distinct dates, every cell of the selected
  patch is filled with the satellite and ground values of the nearest station
  aggregate (see `getNearest`). One array per date is saved under
  `savedir/Satellite` and one under `savedir/Ground`.

  Parameters
  ----------
  df : DataFrame with the columns "SatelliteDatetime", "AirQualityStation",
    "SatelliteLatitude", "SatelliteLongitude", "SurfaceLatitude",
    "SurfaceLongitude", "SurfaceConcentration" and
    "TroposphericNO2ColumnNumberDensity". It is not modified.
  resolution : grid step in degrees used to size the full grid.
  savedir : output directory. WARNING: it is deleted first if it exists.
  output_shape : (rows, cols) of the patch arrays.
  patch_no : 1-based (row, col) position of the patch in the full grid.
  num_days : maximum number of distinct dates to process.

  Returns
  -------
  DataFrame with columns "Date", "Sat_file", "Ground_file", one row per
  processed date.
  """
  import shutil  # local import: only needed for the directory reset below

  # Start from an empty output tree (previously done with the `!rm` magic,
  # which only works when the function is defined inside IPython).
  if os.path.isdir(savedir):
    shutil.rmtree(savedir)
  os.makedirs(os.path.join(savedir,"Satellite"))
  os.makedirs(os.path.join(savedir,"Ground"))

  min_lat,max_lat = min(df["SatelliteLatitude"]),max(df["SatelliteLatitude"])
  min_lon,max_lon = min(df["SatelliteLongitude"]),max(df["SatelliteLongitude"])
  cols,rows = (int((max_lon-min_lon)//resolution), int((max_lat-min_lat)//resolution))

  # Reduce timestamps to their date part and sort, on a derived frame so the
  # caller's DataFrame is left untouched.
  df = df.assign(SatelliteDatetime=df["SatelliteDatetime"].apply(lambda x: x.split(" ")[0]))
  df = df.sort_values(by="SatelliteDatetime")

  # Everything below depends only on the patch, not on the date.
  x,y = output_shape
  # Linear patch id; 14 is presumably the number of patch columns across the
  # full grid — TODO confirm.
  n_patch = 14*(patch_no[0]-1) + patch_no[1]
  latitudes = list(np.linspace(min_lat,max_lat+resolution,rows))[(patch_no[0]-1)*x:patch_no[0]*x]
  longitudes = list(np.linspace(min_lon,max_lon+resolution,cols))[(patch_no[1]-1)*y:patch_no[1]*y]

  value_cols = ["SatelliteLatitude","SatelliteLongitude","SurfaceLongitude",
                "SurfaceLatitude","SurfaceConcentration",
                "TroposphericNO2ColumnNumberDensity"]
  f_invtry = []

  for date in tqdm(sorted(df["SatelliteDatetime"].unique()[:num_days]),"Creating Dataset: "):
    one_date_df = df[df["SatelliteDatetime"]==date]
    # Average repeated readings of the same station on that date.
    aggregate_df = one_date_df.groupby("AirQualityStation")[value_cols].mean().reset_index()
    fname = f"{date}.npy"

    # Cells beyond the available coordinates (edge patches) stay zero.
    all_matrix_sat = np.zeros(output_shape)
    all_matrix_grnd = np.zeros(output_shape)

    for i,lat in enumerate(tqdm(latitudes,leave=False)):
      for j,lon in enumerate(longitudes):
        index,_ = getNearest(lon,lat,aggregate_df)
        all_matrix_sat[i,j] = aggregate_df["TroposphericNO2ColumnNumberDensity"].iloc[index]
        all_matrix_grnd[i,j] = aggregate_df["SurfaceConcentration"].iloc[index]

    sat_name = f"sat_{n_patch}_{fname}"
    grnd_name = f"grnd_{n_patch}_{fname}"
    np.save(os.path.join(savedir,"Satellite",sat_name),all_matrix_sat)
    np.save(os.path.join(savedir,"Ground",grnd_name),all_matrix_grnd)
    f_invtry.append([date,sat_name,grnd_name])

  # Build the inventory in one go; DataFrame.append was removed in pandas 2.0
  # and this form also works when no date was processed.
  return pd.DataFrame(f_invtry,columns=["Date","Sat_file","Ground_file"])

In [None]:
# Build patch (1, 1) with the default number of days and store the inventory
# of generated files next to the notebook.
files = prepareData(
  no2_data,
  resolution=0.05,
  savedir="/content/Data/",
  output_shape=(49, 67),
  patch_no=(1, 1),
)
files.to_csv("files.csv", index=False)

# Data Merging

In [None]:
# done till patch no 38 i.e (3,10)
from IPython.display import clear_output
import pandas as pd
import os
from google.colab import files
_ = files.upload()

if os.path.isdir("/content/Dataset/"):
  !rm -r "/content/Dataset/"
os.mkdir("/content/Dataset/")

!pip install -q kaggle
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d Bibhash123/grsldata
!unzip -q grsldata.zip -d "/content/Dataset/"

files = pd.read_csv("/content/Dataset/files.csv")
!rm -r grsldata.zip
for i in range(1,5,1):
  !kaggle datasets download -d Bibhash123/grsl-$i
  fname = f"grsl-{i}.zip"
  !unzip -q $fname -d /content/
  temp = pd.read_csv("/content/files.csv")
  files = files.append(temp,ignore_index = True)
  !rm -r $fname
  !cp -r "/content/Data/Ground/." -d "/content/Dataset/Ground"
  !cp -r "/content/Data/Satellite/." -d "/content/Dataset/Satellite"
  !rm -r "/content/files.csv"
  !rm -r "/content/no2_dataset.csv"
  !rm -r "/content/Data"
files.to_csv("/content/Dataset/files.csv",index=False)
clear_output(wait=False)

In [None]:
# Write a dataset-metadata.json template into /content/Dataset, wait until the
# user confirms it has been edited, then upload the folder as a new dataset
# version with sub-directories packed as tar archives (`--dir-mode tar`).
!kaggle datasets init -p "/content/Dataset"
# Blocks the cell until the user submits a line of input.
_  = input("Edit metadata file and press any key to continue")
!kaggle datasets version -p "/content/Dataset" --dir-mode tar -m "initial release"

Data package template written to: /content/Dataset/dataset-metadata.json
Edit metadata file and press any key to continuec
Starting upload for file files.csv
100% 205k/205k [00:02<00:00, 70.7kB/s]
Upload successful: files.csv (205KB)
Starting upload for file Ground.tar
100% 95.5M/95.5M [00:03<00:00, 25.9MB/s]
Upload successful: Ground.tar (95MB)
Starting upload for file Satellite.tar
100% 95.5M/95.5M [00:05<00:00, 19.9MB/s]
Upload successful: Satellite.tar (95MB)
Dataset version is being created. Please check progress at https://www.kaggle.com/bibhash123/grsldata


In [None]:
# Install the Kaggle credentials, then download and unpack the merged
# grsldata dataset into /content/Dataset/.
from google.colab import files
# Upload prompt; a kaggle.json in the working directory is expected below.
_ = files.upload()
!pip install -q kaggle
# NOTE(review): plain `mkdir` reports an error if ~/.kaggle/ already exists.
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/kaggle.json
# Restrict the credentials file to the current user.
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d Bibhash123/grsldata
!unzip -q grsldata.zip -d "/content/Dataset/"
!rm -r grsldata.zip


In [None]:
import pandas as pd
import numpy as np

# Reload the merged inventory; from here on `files` is a DataFrame.
inventory_csv = "/content/Dataset/files.csv"
files = pd.read_csv(inventory_csv)

In [None]:
# The patch id is the second "_"-separated token of the satellite file name
# (sat_<patch>_<date>.npy).
files["patch_no"] = [sat_name.split("_")[1] for sat_name in files["Sat_file"]]

In [None]:
# Patch ids were extracted as strings; store them as integers.
patch_ids = files["patch_no"]
files["patch_no"] = patch_ids.astype(int)

In [None]:
# Size of the inventory once exact duplicate rows are removed (not applied yet).
deduplicated = files.drop_duplicates()
deduplicated.shape

(3690, 4)

In [None]:
# Keep only the first occurrence of each exact duplicate row.
files = files.drop_duplicates()

In [None]:
!kaggle datasets download -d "bibhash123/grsl-5"
!unzip grsl-5.zip -d "/content/temp"
!rm -r grsl-5.zip
temp = pd.read_csv("/content/temp/files.csv")
files = files.append(temp,ignore_index = True)

In [None]:
# Display the merged inventory as the cell output.
files

Unnamed: 0,Date,Sat_file,Ground_file
0,2018-10-17,sat_1_2018-10-17.npy,grnd_1_2018-10-17.npy
1,2018-10-18,sat_1_2018-10-18.npy,grnd_1_2018-10-18.npy
2,2018-10-19,sat_1_2018-10-19.npy,grnd_1_2018-10-19.npy
3,2018-10-20,sat_1_2018-10-20.npy,grnd_1_2018-10-20.npy
4,2018-10-21,sat_1_2018-10-21.npy,grnd_1_2018-10-21.npy
...,...,...,...
3775,2019-01-10,sat_21_2019-01-10.npy,grnd_21_2019-01-10.npy
3776,2019-01-11,sat_21_2019-01-11.npy,grnd_21_2019-01-11.npy
3777,2019-01-12,sat_21_2019-01-12.npy,grnd_21_2019-01-12.npy
3778,2019-01-13,sat_21_2019-01-13.npy,grnd_21_2019-01-13.npy


In [None]:
# Copy the contents of the unpacked grsl-5 Ground and Satellite folders into
# the matching folders of the merged dataset.
!cp -r "/content/temp/Data/Ground/." "/content/Dataset/Ground/"
!cp -r "/content/temp/Data/Satellite/." "/content/Dataset/Satellite/"

In [None]:
# Overwrite the dataset inventory with the merged table (row index omitted).
merged_inventory_csv = "/content/Dataset/files.csv"
files.to_csv(merged_inventory_csv, index=False)

In [None]:
# Write a dataset-metadata.json template into the dataset folder; it has to be
# edited before the upload in the next cell.
!kaggle datasets init -p "/content/Dataset"

Data package template written to: /content/Dataset/dataset-metadata.json


In [None]:
# Upload the folder as a new dataset version; `--dir-mode tar` packs each
# sub-directory into a tar archive.
!kaggle datasets version -p "/content/Dataset/" -m "rectified duplicates" --dir-mode tar

Starting upload for file files.csv
100% 205k/205k [00:01<00:00, 106kB/s]
Upload successful: files.csv (205KB)
Starting upload for file Ground.tar
98.4MB [00:34, 3.00MB/s]                
Upload successful: Ground.tar (98MB)
Starting upload for file Satellite.tar
99.6MB [01:39, 1.05MB/s]                
Upload successful: Satellite.tar (98MB)
Dataset version is being created. Please check progress at https://www.kaggle.com/bibhash123/grsldata
