In [None]:
import matplotlib.pyplot as plt
from scipy.stats import entropy
import pygeohash as pgh
from osgeo import gdal
import pandas as pd
import numpy as np
import time
import os

In [None]:
# Input locations, relative to the notebook's working directory.
# os.path.join emits the platform separator, so the paths also resolve on
# POSIX systems; on Windows the resulting strings are unchanged.
samples_path = os.path.join(".", "generated_samples_0.6")
refrence_path = os.path.join(".", "AQ_NYC_orignal_1.tiff")

In [None]:
def encode_geohash(latitude, longitude, key=None):
    """Encode a coordinate pair as a geohash string.

    Args:
        latitude: latitude of the point.
        longitude: longitude of the point.
        key: value forwarded as the third argument of ``pgh.encode`` (the
            geohash precision). When omitted, the notebook-level ``key1`` is
            used.

    Returns:
        Whatever ``pgh.encode`` returns for the given point and precision.

    Raises:
        NameError: if ``key`` is omitted and ``key1`` has not been defined yet.
    """
    if key is None:
        # Look key1 up at call time. A `key=key1` default is evaluated when the
        # def statement runs, which fails on a fresh kernel because key1 is
        # only computed in a later cell.
        key = key1
    return pgh.encode(latitude, longitude, key)

In [None]:
def DataframeGenerator(image1):
  """Flatten a raster into a table with one row per distinct geohash.

  Band 1 is read as the 'pm2.5 values' column, band 2 as 'latitude' and
  band 3 as 'longitude'. Rows in which any of the three values equals 0 are
  dropped, every remaining row is geohashed with ``encode_geohash`` and only
  the first row for each geohash is kept.

  Args:
      image1: object exposing ``GetRasterBand(n).ReadAsArray()`` for bands
          1-3 (a GDAL dataset in this notebook).

  Returns:
      pandas.DataFrame with columns 'latitude', 'longitude', 'pm2.5 values'
      and 'Geohash', indexed 0..n-1.
  """
  band1_values = image1.GetRasterBand(1).ReadAsArray()
  band2_values = image1.GetRasterBand(2).ReadAsArray()
  band3_values = image1.GetRasterBand(3).ReadAsArray()

  # ravel() walks the arrays row by row, i.e. in the same order as a nested
  # row/column loop, without a Python-level iteration per pixel.
  df = pd.DataFrame({
      'latitude': band2_values.ravel(),
      'longitude': band3_values.ravel(),
      'pm2.5 values': band1_values.ravel(),
  })

  # reset_index returns a new frame, so adding a column below does not write
  # into a slice of df (which would raise SettingWithCopyWarning).
  df_filtered = df[(df != 0).all(axis=1)].reset_index(drop=True)

  # A plain list also works when no row survives the filter; apply(axis=1)
  # returns a DataFrame in that case and the column assignment fails.
  df_filtered['Geohash'] = [
      encode_geohash(lat, lon)
      for lat, lon in zip(df_filtered['latitude'], df_filtered['longitude'])
  ]

  return df_filtered.drop_duplicates(subset='Geohash').reset_index(drop=True)

In [None]:
def _pad_missing_geohashes(df, missing):
  """Return df with one zero-valued row appended per geohash in `missing`."""
  if not missing:
    # Nothing to add. Concatenating an empty padding frame would only feed
    # empty object-dtype columns into pandas' result-dtype resolution.
    return df.reset_index(drop=True)

  # sorted() makes the order of the appended rows reproducible; iterating a
  # set directly has no guaranteed order.
  padding = pd.DataFrame({
      'Geohash': sorted(missing),
      'pm2.5 values': [0] * len(missing),
      'latitude': [0] * len(missing),
      'longitude': [0] * len(missing),
  })
  return pd.concat([df, padding], ignore_index=True)


def Geohash_Mismatch_Fixer(df1, df2):
  """Give both frames the same set of geohashes by zero-padding.

  Every geohash present in only one frame is appended to the other with
  'pm2.5 values', 'latitude' and 'longitude' set to 0. The inputs are not
  modified.

  Args:
      df1: DataFrame with a 'Geohash' column.
      df2: DataFrame with a 'Geohash' column.

  Returns:
      Tuple ``(df1_padded, df2_padded)`` of new DataFrames indexed 0..n-1.
  """
  geohashes_df1 = set(df1['Geohash'])
  geohashes_df2 = set(df2['Geohash'])

  padded_df1 = _pad_missing_geohashes(df1, geohashes_df2 - geohashes_df1)
  padded_df2 = _pad_missing_geohashes(df2, geohashes_df1 - geohashes_df2)

  return padded_df1, padded_df2

In [None]:
def RMSE_Calculator(df1, df2):
  """Root-mean-square error between the PM2.5 values of two frames.

  The frames are inner-joined on 'Geohash', so only geohashes present in both
  contribute to the result.

  Args:
      df1: DataFrame with 'Geohash' and 'pm2.5 values' columns.
      df2: DataFrame with 'Geohash' and 'pm2.5 values' columns.

  Returns:
      Square root of the mean squared difference over the joined rows.
  """
  paired = df1.merge(df2, on='Geohash', suffixes=('_1', '_2'))
  residuals = paired['pm2.5 values_1'] - paired['pm2.5 values_2']
  return np.sqrt(residuals.pow(2).mean())

In [None]:
def MAPE_Calculator(df1, df2):
    """Mean absolute percentage error of df1 relative to df2.

    The frames are inner-joined on 'Geohash'. Rows whose reference value
    (from df2) is 0 are left out: the percentage error is undefined there,
    and dividing by them turns the mean into inf or NaN.

    Args:
        df1: DataFrame with 'Geohash' and 'pm2.5 values' columns (estimate).
        df2: DataFrame with 'Geohash' and 'pm2.5 values' columns (reference).

    Returns:
        MAPE in percent, or NaN when no joined row has a non-zero reference.
    """
    merged_df = pd.merge(df1, df2, on='Geohash', suffixes=('_1', '_2'))
    estimate = merged_df['pm2.5 values_1']
    reference = merged_df['pm2.5 values_2']

    usable = reference != 0
    if not usable.any():
        return np.nan

    # Divide by the magnitude so a negative reference cannot flip the sign.
    absolute_percentage_diff = (estimate[usable] - reference[usable]).abs() / reference[usable].abs()
    return absolute_percentage_diff.mean() * 100


In [None]:
def Jensen_Shannon_Divergence_Calculator(df1, df2):
    """Jensen-Shannon divergence between the PM2.5 values of two frames.

    The frames are inner-joined on 'Geohash' and each value column is scaled
    to sum to 1 before the mixture M = (P + Q) / 2 is formed. Averaging the
    raw values instead lets the frame with the larger total dominate M, and
    the result is then not the JSD unless both totals are equal.

    Args:
        df1: DataFrame with 'Geohash' and 'pm2.5 values' columns.
        df2: DataFrame with 'Geohash' and 'pm2.5 values' columns.

    Returns:
        (KL(P || M) + KL(Q || M)) / 2 in nats, or NaN when either column sums
        to 0 (including when the join is empty).
    """
    merged_df = pd.merge(df1, df2, on='Geohash', suffixes=('_1', '_2'))
    values_1 = merged_df['pm2.5 values_1'].to_numpy(dtype=float)
    values_2 = merged_df['pm2.5 values_2'].to_numpy(dtype=float)

    total_1 = values_1.sum()
    total_2 = values_2.sum()
    if total_1 == 0 or total_2 == 0:
        # A zero total cannot be normalised into a distribution.
        return np.nan

    p = values_1 / total_1
    q = values_2 / total_2
    m = (p + q) / 2
    jsd = (entropy(p, m) + entropy(q, m)) / 2
    return jsd

In [None]:
def Kullback_Leibler_Divergence_Calculator(df1, df2):
    """Kullback-Leibler divergence of df1's PM2.5 values from df2's.

    The frames are inner-joined on 'Geohash' and the two value columns are
    handed to ``scipy.stats.entropy`` as ``pk`` (df1) and ``qk`` (df2).

    Args:
        df1: DataFrame with 'Geohash' and 'pm2.5 values' columns.
        df2: DataFrame with 'Geohash' and 'pm2.5 values' columns.

    Returns:
        The value returned by ``entropy(pk, qk)``.
    """
    paired = df1.merge(df2, on='Geohash', suffixes=('_1', '_2'))
    observed = paired['pm2.5 values_1']
    expected = paired['pm2.5 values_2']
    return entropy(observed, expected)

In [None]:
# Load the reference raster and derive the default geohash key from it.
refrence_image = gdal.Open(refrence_path)
if refrence_image is None:
    # Without exceptions enabled, gdal.Open signals failure by returning None;
    # fail here rather than with an AttributeError on the next line.
    raise FileNotFoundError(f"GDAL could not open the reference raster: {refrence_path}")

image1_pm25_values_layer4 = refrence_image.GetRasterBand(4).ReadAsArray()
non_zero_values1 = image1_pm25_values_layer4[image1_pm25_values_layer4 != 0]
if non_zero_values1.size == 0:
    raise ValueError("Band 4 of the reference raster has no non-zero values to derive key1 from")

# Smallest non-zero value of band 4; encode_geohash uses it as its default key.
# NOTE(review): the key is forwarded to pgh.encode as the precision, so band 4
# presumably stores an integer-valued precision -- confirm.
key1 = non_zero_values1.min()

refrence_df = DataframeGenerator(refrence_image)

In [None]:
# Score every generated sample against the reference raster with RMSE.
rmse_records = []

start_time = time.time()
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and idxmin tie-breaking on the result) reproducible.
for sample_name in sorted(os.listdir(samples_path)):
    sample_path = os.path.join(samples_path, sample_name)
    sample_image = gdal.Open(sample_path)

    sample_df = DataframeGenerator(sample_image)

    # Keep the padded reference in a loop-local name. Rebinding refrence_df
    # here would carry the zero rows added for this sample into the comparison
    # of every later sample and every later cell.
    sample_df, padded_refrence_df = Geohash_Mismatch_Fixer(sample_df, refrence_df)

    RMSE_Value = RMSE_Calculator(sample_df, padded_refrence_df)

    rmse_records.append({'sample_name': sample_name, 'RMSE': RMSE_Value})

end_time = time.time()
execution_time = end_time - start_time

# Build the frame once; growing it with concat from an empty frame is
# quadratic and starts from object-dtype columns.
RMSE_df = pd.DataFrame(rmse_records, columns=['sample_name', 'RMSE'])
RMSE_df

In [None]:
# Score every generated sample against the reference raster with MAPE.
mape_records = []

start_time = time.time()
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and idxmin tie-breaking on the result) reproducible.
for sample_name in sorted(os.listdir(samples_path)):
    sample_path = os.path.join(samples_path, sample_name)
    sample_image = gdal.Open(sample_path)

    sample_df = DataframeGenerator(sample_image)

    # Keep the padded reference in a loop-local name. Rebinding refrence_df
    # here would carry the zero rows added for this sample into the comparison
    # of every later sample and every later cell.
    sample_df, padded_refrence_df = Geohash_Mismatch_Fixer(sample_df, refrence_df)

    MAPE_Value = MAPE_Calculator(sample_df, padded_refrence_df)

    mape_records.append({'sample_name': sample_name, 'MAPE': MAPE_Value})

end_time = time.time()
MAPE_execution_time = end_time - start_time

# Build the frame once; growing it with concat from an empty frame is
# quadratic and starts from object-dtype columns.
MAPE_df = pd.DataFrame(mape_records, columns=['sample_name', 'MAPE'])
MAPE_df

In [None]:
# Score every generated sample against the reference raster with the
# Jensen-Shannon divergence.
jensen_shannon_records = []

start_time = time.time()
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and idxmin tie-breaking on the result) reproducible.
for sample_name in sorted(os.listdir(samples_path)):
    sample_path = os.path.join(samples_path, sample_name)
    sample_image = gdal.Open(sample_path)

    sample_df = DataframeGenerator(sample_image)

    # Keep the padded reference in a loop-local name. Rebinding refrence_df
    # here would carry the zero rows added for this sample into the comparison
    # of every later sample and every later cell.
    sample_df, padded_refrence_df = Geohash_Mismatch_Fixer(sample_df, refrence_df)

    Jensen_Shannon_Divergence_Value = Jensen_Shannon_Divergence_Calculator(sample_df, padded_refrence_df)

    jensen_shannon_records.append({
        'sample_name': sample_name,
        'Jensen_Shannon_Divergence': Jensen_Shannon_Divergence_Value,
    })

end_time = time.time()
Jensen_Shannon_Divergence_execution_time = end_time - start_time

# Build the frame once; growing it with concat from an empty frame is
# quadratic and starts from object-dtype columns.
Jensen_Shannon_Divergence_df = pd.DataFrame(
    jensen_shannon_records, columns=['sample_name', 'Jensen_Shannon_Divergence']
)
Jensen_Shannon_Divergence_df

In [None]:
# Score every generated sample against the reference raster with the
# Kullback-Leibler divergence.
kullback_leibler_records = []

start_time = time.time()
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and idxmin tie-breaking on the result) reproducible.
for sample_name in sorted(os.listdir(samples_path)):
    sample_path = os.path.join(samples_path, sample_name)
    sample_image = gdal.Open(sample_path)

    sample_df = DataframeGenerator(sample_image)

    # Keep the padded reference in a loop-local name. Rebinding refrence_df
    # here would carry the zero rows added for this sample into the comparison
    # of every later sample and every later cell.
    sample_df, padded_refrence_df = Geohash_Mismatch_Fixer(sample_df, refrence_df)

    Kullback_Leibler_Divergence_Value = Kullback_Leibler_Divergence_Calculator(sample_df, padded_refrence_df)

    kullback_leibler_records.append({
        'sample_name': sample_name,
        'Kullback_Leibler_Divergence': Kullback_Leibler_Divergence_Value,
    })

end_time = time.time()
Kullback_Leibler_Divergence_Value_execution_time = end_time - start_time

# Build the frame once; growing it with concat from an empty frame is
# quadratic and starts from object-dtype columns.
Kullback_Leibler_Divergence_df = pd.DataFrame(
    kullback_leibler_records, columns=['sample_name', 'Kullback_Leibler_Divergence']
)
Kullback_Leibler_Divergence_df

In [None]:
# For each metric, pull out the row of the sample with the lowest score.
(
    min_RMSE,
    min_MAPE,
    min_Jensen_Shannon_Divergence,
    min_Kullback_Leibler_Divergence,
) = (
    scores.loc[scores[metric].idxmin()]
    for scores, metric in (
        (RMSE_df, 'RMSE'),
        (MAPE_df, 'MAPE'),
        (Jensen_Shannon_Divergence_df, 'Jensen_Shannon_Divergence'),
        (Kullback_Leibler_Divergence_df, 'Kullback_Leibler_Divergence'),
    )
)

# Print the winning rows in the same order as the metrics above.
for best_row in (
    min_RMSE,
    min_MAPE,
    min_Jensen_Shannon_Divergence,
    min_Kullback_Leibler_Divergence,
):
    print(best_row)

In [None]:
# Collect the best sample per metric, together with the time each comparison
# loop took, into one table and write it to CSV.
file_path = 'comparing_analysis_0.6.csv'

# (metric column name, best row for that metric, loop duration in seconds)
summary_rows = [
    ('RMSE', min_RMSE, execution_time),
    ('MAPE', min_MAPE, MAPE_execution_time),
    ('Jensen_Shannon_Divergence', min_Jensen_Shannon_Divergence, Jensen_Shannon_Divergence_execution_time),
    ('Kullback_Leibler_Divergence', min_Kullback_Leibler_Divergence, Kullback_Leibler_Divergence_Value_execution_time),
]

data = {
    'Metric': [metric for metric, _, _ in summary_rows],
    'Value': [best[metric] for metric, best, _ in summary_rows],
    'sample_number': [best['sample_name'] for _, best, _ in summary_rows],
    'comparing_time_in_second': [elapsed for _, _, elapsed in summary_rows],
}

df = pd.DataFrame(data)
df.to_csv(file_path, index=False)

df