In [24]:
# import necessary files and libraries
import pandas as pd 

In [170]:
# Years and monitoring stations covered by the analysis.
years = [2022, 2023, 2024]
stations = [
    'ratnapark',
    'khumaltar',
    'bhaktapur',
    'kirtipur',
    'shankhapark',
    'pulchowk',
    'bhaisipati',
]
# For a quick trial run, restrict the list, e.g. stations = ['ratnapark', 'khumaltar']

In [None]:
# Load the hourly PM2.5 CSVs and stack them per station.
# Result: yearly_pm25df maps each station name to one DataFrame holding the
# rows of every year in `years`, concatenated in that order with a fresh index.
yearly_pm25df = {}
for j in stations:
    df_list = []
    for i in years:
        csv_path = f"./datasets/Pm25 record data/y_{i}_{j} PM2.5 hourlyaverage.csv"
        try:
            df_list.append(pd.read_csv(csv_path))
        except Exception as exc:
            # Say which file failed and why, then stop: continuing with a
            # partially filled yearly_pm25df only defers the failure to a
            # later cell, where it is much harder to diagnose.
            print(f"Something went wrong while reading {csv_path!r}: {exc}")
            raise
        print(f"{i}_pm25df_{j}")
    yearly_pm25df[j] = pd.concat(df_list, ignore_index=True)
    print(yearly_pm25df[j])
    print("\n")

In [None]:
# Quick per-station sanity check: first rows, dtypes/non-null counts, and
# the number of missing values in every column.
for i in stations:
    station_df = yearly_pm25df[i]
    print(i.upper())
    print(station_df.head())
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    station_df.info()
    print("Missing values =====")
    print(station_df.isnull().sum())
    print("\n")

In [None]:
# Read one meteorological CSV per station. The list keeps the order of
# `stations`, so metdf_list[k] belongs to stations[k].
metdf_list = [
    pd.read_csv(f"./datasets/Meteorological parameters/metdata_{station}_2022to2024.csv")
    for station in stations
]

# Show every loaded frame.
for met_frame in metdf_list:
    print(met_frame)


In [175]:
# Merge each station's PM2.5 record with its meteorological record
# (column-wise) and write one merged CSV per station.

# Only stations whose PM2.5 data was actually loaded.
stations = list(yearly_pm25df.keys())

# metdf_list was filled by iterating `stations`, and yearly_pm25df received its
# keys in the same order, so position i in metdf_list belongs to stations[i].
metdf_dict = {station: metdf_list[i] for i, station in enumerate(stations)}
combined = {}

for station in stations:
    pm = yearly_pm25df[station].reset_index(drop=True)
    met = metdf_dict[station].reset_index(drop=True)

    # concat(axis=1) pairs rows purely by position. If the two frames differ
    # in length the surplus rows are padded with NaN without any error, so
    # make that situation visible instead of silent.
    if len(pm) != len(met):
        print(
            f"WARNING: {station}: PM2.5 data has {len(pm)} rows but "
            f"meteorological data has {len(met)} rows; unmatched rows will contain NaN."
        )

    combined[station] = pd.concat([pm, met], axis=1)

for x in combined:
    x_merged_df = combined[x]
    x_merged_df.to_csv(f"./datasets/merged_datasets/{x}_merged_df.csv")

In [10]:
# Feature selection - keep PM2.5, the date/time parts and the meteorological columns.
# NOTE(review): df_ratnapark is not created in any earlier cell of this notebook;
# confirm which file/frame it is loaded from so the notebook runs on a fresh kernel.
# .copy() makes the selection an independent frame, so the later assignments to
# df_ratnapark['PM2.5'] do not raise SettingWithCopyWarning.
df_ratnapark = df_ratnapark[[
    'PM2.5', 'YEAR', 'MO', 'DY', 'HR',
    'PS', 'WS2M', 'WD2M', 'WS10M', 'WD10M',
    'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M',
]].copy()
# df_ratnapark.to_csv("./datasets/df_ratnapark_selected_before_imputation.csv")

In [12]:
# Display the selected columns (pandas truncates the middle rows of long frames).
df_ratnapark

Unnamed: 0,PM2.5,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
0,75.780952,2022,1,1,0,88.03,0.47,317.6,0.77,319.2,0.0,83.08,6.70,8.85
1,56.584127,2022,1,1,1,88.00,0.40,310.0,0.64,313.7,0.0,80.28,6.48,8.86
2,49.538710,2022,1,1,2,87.96,0.26,319.6,0.46,323.9,0.0,78.29,6.27,8.74
3,47.398438,2022,1,1,3,87.93,0.28,300.3,0.46,306.1,0.0,77.32,6.02,8.32
4,42.821875,2022,1,1,4,87.91,0.29,329.0,0.51,330.4,0.0,75.66,5.80,8.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,74.526667,2024,12,31,19,87.83,0.62,256.0,1.44,253.1,0.0,92.40,8.28,10.37
8780,84.556666,2024,12,31,20,87.85,0.44,262.2,0.92,263.7,0.0,91.04,7.94,9.97
8781,105.226667,2024,12,31,21,87.85,0.45,266.2,1.00,261.4,0.0,91.15,7.68,9.46
8782,107.503333,2024,12,31,22,87.85,0.58,279.0,1.19,274.8,0.0,89.81,7.39,9.11


In [11]:
# Mean of the PM2.5 column (Series.mean() skips NaN by default).
# Used further down as the fallback value for readings that are still missing.
overall_mean_value = df_ratnapark['PM2.5'].mean()
overall_mean_value

43.723958348091436

In [14]:
# Mean PM2.5 for every (YEAR, MO, HR) combination, i.e. the average reading for
# each hour of the day within each month of each year.
# The result is a Series with a (YEAR, MO, HR) MultiIndex; fill_pm25 looks up
# replacement values for missing readings in it.
monthly_avg_hourly_pm25 = (
    df_ratnapark
    .groupby(['YEAR', 'MO', 'HR'])['PM2.5']
    .mean()
)
# monthly_avg_hourly_pm25.to_csv('./datasets/hourlyavg_month.csv')
monthly_avg_hourly_pm25

YEAR  MO  HR
2022  1   0      24.464878
          1      21.252500
          2      20.434686
          3      18.221462
          4      15.184770
                   ...    
2024  12  19     89.562365
          20    104.385269
          21    110.148172
          22    107.567849
          23     99.938226
Name: PM2.5, Length: 864, dtype: float64

In [15]:
# Row-wise imputation helper: missing PM2.5 -> mean of the same year, month and hour.
def fill_pm25(row):
    """Return the PM2.5 value to use for one row.

    Parameters
    ----------
    row : mapping with the keys 'PM2.5', 'YEAR', 'MO' and 'HR'
        (for example a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    The row's own PM2.5 value when it is present; otherwise the entry of the
    module-level ``monthly_avg_hourly_pm25`` Series at (YEAR, MO, HR).
    """
    observed = row['PM2.5']
    if not pd.isna(observed):
        return observed
    group_key = (row['YEAR'], row['MO'], row['HR'])
    return monthly_avg_hourly_pm25.loc[group_key]

# Call fill_pm25 once per row (axis=1) and write the results back to the PM2.5 column.
df_ratnapark['PM2.5'] = df_ratnapark.apply(fill_pm25, axis=1)
# df_ratnapark.to_csv("./datasets/pm25_filled_by_mean_month_hour.csv", index=False)

In [12]:
# Count missing values per column.
# NOTE(review): this cell's execution count (In [12]) is lower than that of the
# fill cell above (In [15]), so the saved output may pre-date the fill.
# Re-run the notebook top to bottom to confirm the number.
df_ratnapark.isnull().sum()

PM2.5          5557
YEAR              0
MO                0
DY                0
HR                0
PS                0
WS2M              0
WD2M              0
WS10M             0
WD10M             0
PRECTOTCORR       0
RH2M              0
QV2M              0
T2M               0
dtype: int64

In [17]:
# Fallback: replace every PM2.5 value that is still missing with overall_mean_value.
# Presumably needed for (YEAR, MO, HR) groups without any observed reading,
# whose group mean is itself NaN - confirm against the data.
df_ratnapark['PM2.5'] = df_ratnapark['PM2.5'].fillna(overall_mean_value)

In [18]:
# Re-count missing values per column after the overall-mean fill.
df_ratnapark.isnull().sum()

PM2.5          0
YEAR           0
MO             0
DY             0
HR             0
PS             0
WS2M           0
WD2M           0
WS10M          0
WD10M          0
PRECTOTCORR    0
RH2M           0
QV2M           0
T2M            0
dtype: int64

In [19]:
# df_ratnapark.to_csv("./datasets/ratnapark_pm25_after_imputation.csv")

In [20]:
# Pairwise correlation between PM2.5, the date/time parts (YEAR excluded) and
# the meteorological variables.
# DataFrame.corr() is called with its defaults, which gives the Pearson
# coefficient; a Spearman matrix would need corr(method='spearman').
df_clean = df_ratnapark[[
    'PM2.5', 'MO', 'DY', 'HR',
    'PS', 'WS2M', 'WD2M', 'WS10M', 'WD10M',
    'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M',
]]
df_clean_corr = df_clean.corr()
df_clean_corr

Unnamed: 0,PM2.5,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
PM2.5,1.0,-0.1932435,-0.01015882,-0.03887949,0.400289,-0.169077,-0.05371,-0.169854,-0.051789,-0.185792,-0.411608,-0.567624,-0.35467
MO,-0.193243,1.0,0.01006662,-9.913273e-16,0.05009,-0.148291,-0.036506,-0.160631,-0.034621,0.051688,0.490143,0.42107,0.098548
DY,-0.010159,0.01006662,1.0,-9.477663000000001e-17,-0.003108,0.00112,-0.02297,-0.000555,-0.024932,0.026784,0.014002,0.012006,0.005619
HR,-0.038879,-9.913273e-16,-9.477663000000001e-17,1.0,-0.000899,0.119319,0.252091,0.13732,0.249333,0.025861,0.017635,0.089526,0.126755
PS,0.400289,0.05009048,-0.003107711,-0.0008986717,1.0,-0.155002,0.029455,-0.179142,0.028472,-0.281704,-0.20568,-0.605946,-0.590246
WS2M,-0.169077,-0.1482913,0.001119677,0.1193186,-0.155002,1.0,0.190518,0.990722,0.196076,0.031534,-0.493096,0.003066,0.573455
WD2M,-0.05371,-0.03650598,-0.02297041,0.2520915,0.029455,0.190518,1.0,0.177912,0.971214,-0.049829,-0.093824,-0.058097,0.048235
WS10M,-0.169854,-0.1606311,-0.000554529,0.1373202,-0.179142,0.990722,0.177912,1.0,0.183787,0.040918,-0.475352,0.006776,0.559621
WD10M,-0.051789,-0.03462111,-0.02493232,0.249333,0.028472,0.196076,0.971214,0.183787,1.0,-0.047093,-0.094909,-0.05579,0.052998
PRECTOTCORR,-0.185792,0.05168807,0.02678429,0.02586067,-0.281704,0.031534,-0.049829,0.040918,-0.047093,1.0,0.233966,0.330398,0.174033
