Data preprocessing for the Random Forest Regressor model
Drive link: https://drive.google.com/drive/folders/1N_FNnUTrOM0vqP8BapjFRS8llM01wg0r?usp=sharing
Download the data from the Drive folder above before processing the files.

In [None]:
!pip install xarray
!pip install pandas


In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import os



In [None]:
# Mount Google Drive inside the Colab runtime; every data path used in this
# notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


Creating an ERA5 dataset with features for the years 2002-2017

In [None]:

# Open the ERA5 NetCDF file and keep only the time steps whose calendar
# year falls in 2002-2017 (both ends inclusive).
input_file_path = '/content/drive/My Drive/Pm25 Project/era5file.nc'
era5 = xr.open_dataset(input_file_path)

era5_years = era5['time'].dt.year
in_2002_2017 = (era5_years >= 2002) & (era5_years <= 2017)
era5_df = era5.sel(time=in_2002_2017)

# Close both dataset handles.
era5.close()
era5_df.close()


  values_as_series = pd.Series(values.ravel(), copy=False)


Creating an ERA5 dataset of future features for the years 2018-2022

In [None]:
# Re-open the same ERA5 NetCDF file and keep only the time steps whose
# calendar year falls in 2018-2022 (both ends inclusive).
input_file_path = '/content/drive/My Drive/Pm25 Project/era5file.nc'
era5 = xr.open_dataset(input_file_path)

era5_years = era5['time'].dt.year
in_2018_2022 = (era5_years >= 2018) & (era5_years <= 2022)
feature_df = era5.sel(time=in_2018_2022)

# Close both dataset handles.
era5.close()
feature_df.close()

Adding a month-year column (encoded as the integer YYYYMM)

In [None]:
# Tag every time step of both ERA5 subsets with an integer key built as
# year * 100 + month (e.g. March 2005 -> 200503).
for era5_subset in (era5_df, feature_df):
    timestamps = era5_subset['time'].dt
    era5_subset['month_year'] = timestamps.year * 100 + timestamps.month

  values_as_series = pd.Series(values.ravel(), copy=False)


Preparing a combined dataset from all monthly PM2.5 files for 2002-2017

In [None]:

# Directory holding the monthly PM2.5 NetCDF files.
data_dir = '/content/drive/My Drive/Pm25 Project/Monthly2002-17'

preprocessed_data = []  # one opened dataset per successfully read file
month_year_int = []     # YYYYMM integer for each dataset, in the same order

# os.listdir() returns names in arbitrary order; sorting makes the order of
# the concatenated 'time' axis reproducible from run to run.
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith('.nc'):
        continue

    try:
        # File names look like
        # 'V5GL04.HybridPM25c_0p10.Global.202101-202101.nc': the fourth
        # dot-separated field is '<start>-<end>', each written as YYYYMM.
        month_year_str = file_name.split('.')[3]
        _start_yyyymm, end_yyyymm = month_year_str.split('-')
        period = int(end_yyyymm)

        pm25_data = xr.open_dataset(os.path.join(data_dir, file_name))
    except (OSError, ValueError, IndexError) as err:
        # Skip unreadable or unexpectedly named files instead of aborting
        # the whole run.
        print(f"Skipping file: {file_name} ({err})")
        continue

    # Append to both lists together so they always stay the same length;
    # the 'month_year' assignment below relies on that.
    preprocessed_data.append(pm25_data)
    month_year_int.append(period)
    print(f"Processed file: {file_name}")

# Stack the monthly datasets along 'time' and label each slice with its
# YYYYMM key.
combined_data = xr.concat(preprocessed_data, dim='time')
combined_data['month_year'] = ('time', month_year_int)

Processed file: V5GL04.HybridPM25c_0p10.Global.202101-202101.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202102-202102.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202103-202103.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202104-202104.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202105-202105.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202106-202106.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202107-202107.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202108-202108.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202109-202109.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202110-202110.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202111-202111.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202112-202112.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.202001-202001.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.201902-201902.nc
Processed file: V5GL04.HybridPM25c_0p10.Global.201901-201901.nc
Processed file: V5GL04.HybridPM25c_0p10.

Regridding all datasets to a 0.5° resolution

In [None]:
def _half_degree_axis(coord):
    """Return values from coord.min() up to (excluding) coord.max() in 0.5 steps."""
    return np.arange(coord.min(), coord.max(), 0.5)


# Linearly interpolate the PM2.5 data (lon/lat) and both ERA5 subsets
# (longitude/latitude) onto grids with a 0.5 spacing.
combined_data = combined_data.interp(
    lon=_half_degree_axis(combined_data.lon),
    lat=_half_degree_axis(combined_data.lat),
    method='linear',
)
era5_df = era5_df.interp(
    longitude=_half_degree_axis(era5_df.longitude),
    latitude=_half_degree_axis(era5_df.latitude),
    method='linear',
)
feature_df = feature_df.interp(
    longitude=_half_degree_axis(feature_df.longitude),
    latitude=_half_degree_axis(feature_df.latitude),
    method='linear',
)

# Flatten the regridded 2002-2017 ERA5 subset into a table for merging.
era5_short_df = era5_df.to_dataframe().reset_index()


Rounding the lat/lon values to the nearest 0.5 (i.e. ending in .0 or .5), as required for the analysis

In [None]:
# Flatten the regridded PM2.5 dataset into a table, then snap each
# coordinate to the nearest half degree: round to one decimal first, then
# to the nearest multiple of 0.5.
combined_data_df = combined_data.to_dataframe().reset_index()

for axis in ('lat', 'lon'):
    one_decimal = combined_data_df[axis].round(1)
    combined_data_df[axis] = np.around(one_decimal * 2) / 2

combined_data_df

Unnamed: 0,time,lat,lon,GWRPM25,month_year
0,0,-55.0,-180.0,,202101
1,0,-55.0,-179.5,,202101
2,0,-55.0,-179.0,,202101
3,0,-55.0,-178.5,,202101
4,0,-55.0,-178.0,,202101
...,...,...,...,...,...
6341575,35,67.0,177.0,12.699992,202006
6341576,35,67.0,177.5,12.800000,202006
6341577,35,67.0,178.0,12.300000,202006
6341578,35,67.0,178.5,11.800000,202006


In [None]:
# Rename the PM2.5 coordinate columns to 'latitude'/'longitude', the names
# used by the ERA5 table, so both tables can be joined on them.
combined_data_df.rename(columns=dict(lat='latitude', lon='longitude'), inplace=True)

Merge both tables, clean the dataset, and add a wind_speed column to both data sets

In [None]:
# Join PM2.5 and ERA5 rows that share grid cell and month, then drop rows
# that have no PM2.5 value.
merge_keys = ['latitude', 'longitude', 'month_year']
combined_df = combined_data_df.merge(era5_short_df, on=merge_keys)
combined_df = combined_df.dropna(subset=['GWRPM25'])

# Wind speed is the magnitude of the (u10, v10) vector; add it to the
# merged table and to the future-feature data set alike.
for table in (combined_df, feature_df):
    table['wind_speed'] = np.sqrt(table['u10']**2 + table['v10']**2)

In [None]:
# Persist the merged PM2.5 + ERA5 table.
combined_df.to_csv('/content/drive/My Drive/Pm25 Project/combined_2002-2017.csv')

# feature_df is still an xarray Dataset at this point (it has only been
# sliced, interpolated and extended), and Dataset has no to_csv() method.
# Flatten it to a pandas DataFrame first, the same way era5_short_df is
# built; the isinstance guard keeps this working if it is already a table.
feature_table = feature_df
if isinstance(feature_table, xr.Dataset):
    feature_table = feature_table.to_dataframe().reset_index()
feature_table.to_csv('/content/drive/My Drive/Pm25 Project/feature_df.csv')
