# Create the environment

In [1]:
# Mount Google Drive inside the Colab runtime; the notebook's working folder
# is located on that drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder on the mounted Drive so that the relative 'Data/...' paths used later resolve.
%cd /content/drive/My Drive/ESoWC

/content/drive/My Drive/ESoWC


In [3]:
import pandas as pd
import xarray as xr

import numpy as np
import pandas as pd
from sklearn import preprocessing
import seaborn as sns

In [4]:
# Input netCDF files, given relative to the current working directory.
fn_land    = 'Data/land_cover_data.nc'                   # opened as `land`
fn_weather = 'Data/05_2019_weather_and_CO_for_model.nc'  # opened as `weather`
fn_conc    = 'Data/conc_dataset.nc'                      # opened as `conc`
fn_traffic = 'Data/emissions_traffic_hourly_merged.nc'   # opened as `traffic`

# Load datasets

## Land

In [5]:
# Read the land netCDF file into an xarray Dataset and display its summary.
land = xr.open_dataset(filename_or_obj=fn_land)
land

In [6]:
# Reshape the land dataset to the (lat, long, Days, Hours) layout used for the merge.
hours = np.arange(0, 24, 1)  # hour-of-day values 0..23

# 'NO emissions' is dropped because it is already in the weather dataset.
land_without_no = land.drop_vars('NO emissions')

# Add an 'Hours' dimension so every variable is repeated for each hour.
land_hourly = land_without_no.expand_dims({'Hours': hours})

land_fixed = (
    land_hourly
    .assign_coords(time=land_hourly.time.dt.day)  # replace timestamps by day-of-month
    .rename({'time': 'Days', 'lon': 'long'})
    .transpose('lat', 'long', 'Days', 'Hours')
)
land_fixed

## Weather

In [7]:
# Read the weather netCDF file into an xarray Dataset and display its summary.
weather = xr.open_dataset(filename_or_obj=fn_weather)
weather

In [8]:
# Bring the weather dataset to the common (lat, long, Days, Hours) layout.
# 'tcw' is dropped because it is also available in another dataset.
weather_fixed = (
    weather
    .drop_vars('tcw')
    .rename({'longitude': 'long', 'latitude': 'lat'})
    .transpose('lat', 'long', 'Days', 'Hours')
)
weather_fixed

## Conc

In [9]:
# Read the concentration netCDF file into an xarray Dataset and display its summary.
conc = xr.open_dataset(filename_or_obj=fn_conc)
conc

In [10]:
# Linearly interpolate the concentration data along 'Hours' onto the whole
# hours 0..23, then order the dimensions as (lat, long, Days, Hours).
hour_bins = np.arange(0, 24, 1)
conc_fidex = (
    conc
    .interp(Hours=hour_bins, method="linear")
    .transpose('lat', 'long', 'Days', 'Hours')
)
conc_fidex

## Traffic

In [11]:
# Read the traffic netCDF file into an xarray Dataset and display its summary.
traffic = xr.open_dataset(filename_or_obj=fn_traffic)
traffic

In [12]:
# Target grid: 0.25-degree steps covering lat 43..51 and long 4..12 (end points included).
lat_bins = np.arange(43, 51.25, 0.25)
lon_bins = np.arange(4, 12.25, 0.25)
days = np.arange(1, 32, 1)  # day-of-month values 1..31

traffic_fixed = (
    traffic
    .drop_vars('emissions')
    # sort first -- presumably so interp receives monotonically ordered coordinates
    .sortby(['latitude', 'longitude', 'hour'])
    .interp(latitude=lat_bins, longitude=lon_bins, method="linear")
    # add a 'Days' dimension so the hourly data is repeated for each day
    .expand_dims({'Days': days})
    .rename({'hour': 'Hours', 'longitude': 'long', 'latitude': 'lat'})
    .transpose('lat', 'long', 'Days', 'Hours')
)
traffic_fixed

# Merge

In [13]:
# Combine the four prepared datasets into one. join='outer' (spelled out here; it is
# also xarray's default) keeps the union of every coordinate value, so grid points
# present in only some of the inputs are NaN-filled for the other variables.
# NOTE(review): the recorded df.info() outputs show 1,643,496 rows but only 648 without
# NaN, apparently all at lat 44.5 / long 8.5 -- the input grids may not share
# coordinates exactly; confirm whether regridding before the merge is intended.
tot_dataset = (
    weather_fixed
    .merge(land_fixed, join='outer')
    .merge(conc_fidex, join='outer')
    .merge(traffic_fixed, join='outer')
)

tot_dataset

In [14]:
# Flatten the merged dataset into a table: one row per coordinate combination,
# one column per variable. Show the first five rows.
df = tot_dataset.to_dataframe()
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EMISSIONS_2019,u10,v10,hcc,lcc,tcc,cvl,cvh,sp,tmp,sp_hum,tot_wind,tmp_shift_8,tot_wind_shift_12,height,built,tcco,tcno2,tc_no,gtco3,tcwv,traffic
Unnamed: 0_level_1,Hours,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,0,43.0,4.0,,,,,,,,,,,,,,,0.0,0.0,0.000954,3e-06,9.295036e-10,0.007882,14.479529,
1,0,43.0,4.25,,,,,,,,,,,,,,,0.0,0.0,0.000954,3e-06,3.445696e-09,0.007854,14.662142,
1,0,43.0,4.5,,,,,,,,,,,,,,,0.0,0.0,0.000954,4e-06,5.961889e-09,0.007826,14.844755,
1,0,43.0,4.75,,,,,,,,,,,,,,,0.0,0.0,0.000954,4e-06,8.478082e-09,0.007798,15.027369,
1,0,43.0,5.0,,,,,,,,,,,,,,,0.0,0.0,0.000953,5e-06,3.749665e-08,0.007779,14.95413,


In [15]:
# Write the merged dataset to the working directory, overwriting any existing file.
tot_dataset.to_netcdf(
    path='dataset_complete_for_model_CO.nc',
    mode='w',
    format='NETCDF4',
)

# Check

In [16]:
# Keep only the rows in which every column has a value, and preview them.
df_clean = df.dropna(axis=0, how='any')
df_clean.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,EMISSIONS_2019,u10,v10,hcc,lcc,tcc,cvl,cvh,sp,tmp,sp_hum,tot_wind,tmp_shift_8,tot_wind_shift_12,height,built,tcco,tcno2,tc_no,gtco3,tcwv,traffic
Unnamed: 0_level_1,Hours,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,12,44.5,8.5,1.937331e-10,1.151261,1.565244,0.005661,-2.980232e-08,0.025514,0.233412,0.659459,98100.914062,265.990814,4e-06,1.943036,262.449432,2.532905,0.465535,0.012333,0.000901,2e-06,8.727168e-07,0.007997,12.534024,0.0
1,13,44.5,8.5,1.971024e-10,1.192059,1.865819,0.003774,0.0006510417,0.018271,0.233412,0.659459,98062.578125,265.965332,4e-06,2.21411,261.694916,2.347736,0.465535,0.012333,0.000897,2e-06,8.526965e-07,0.007977,12.32318,85.445255
1,14,44.5,8.5,1.920485e-10,1.232856,2.166394,0.001887,0.001302113,0.011028,0.233412,0.659459,98024.242188,265.93988,4e-06,2.492629,260.940369,2.190543,0.465535,0.012333,0.000893,2e-06,8.326761e-07,0.007957,12.112337,84.904842
1,15,44.5,8.5,1.954177e-10,1.273654,2.46697,0.0,0.001953185,0.003784,0.233412,0.659459,97985.90625,265.914398,4e-06,2.776352,261.622223,2.067715,0.465535,0.012333,0.00089,2e-06,8.126558e-07,0.007937,11.901493,85.746872
1,16,44.5,8.5,2.190026e-10,1.036139,2.401013,0.001556,0.001668344,0.006104,0.233412,0.659459,97972.820312,265.880432,4e-06,2.615042,262.304108,1.826341,0.465535,0.012333,0.000889,2e-06,7.160276e-07,0.007932,11.753887,77.719917


In [17]:
# Per-column non-null counts of the full merged table, to see how sparse each variable is.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1643496 entries, (1, 0, 43.0, 4.0) to (31, 23, 51.0, 12.0)
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   EMISSIONS_2019     162000 non-null  float32
 1   u10                166950 non-null  float32
 2   v10                166950 non-null  float32
 3   hcc                166950 non-null  float32
 4   lcc                166950 non-null  float32
 5   tcc                166950 non-null  float32
 6   cvl                166950 non-null  float32
 7   cvh                166950 non-null  float32
 8   sp                 166950 non-null  float32
 9   tmp                166950 non-null  float32
 10  sp_hum             166950 non-null  float32
 11  tot_wind           166950 non-null  float32
 12  tmp_shift_8        166942 non-null  float32
 13  tot_wind_shift_12  166938 non-null  float32
 14  height             760320 non-null  float32
 15  built              7

In [18]:
# Same summary for the NaN-free subset, to see how many complete rows remain.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 648 entries, (1, 12, 44.5, 8.5) to (30, 21, 44.5, 8.5)
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   EMISSIONS_2019     648 non-null    float32
 1   u10                648 non-null    float32
 2   v10                648 non-null    float32
 3   hcc                648 non-null    float32
 4   lcc                648 non-null    float32
 5   tcc                648 non-null    float32
 6   cvl                648 non-null    float32
 7   cvh                648 non-null    float32
 8   sp                 648 non-null    float32
 9   tmp                648 non-null    float32
 10  sp_hum             648 non-null    float32
 11  tot_wind           648 non-null    float32
 12  tmp_shift_8        648 non-null    float32
 13  tot_wind_shift_12  648 non-null    float32
 14  height             648 non-null    float32
 15  built              648 non-null    float3