In [96]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [97]:
from osgeo import gdal
import numpy as np
image1_path = '/content/drive/My Drive/AQ_latest.tiff'
image2_path = '/content/drive/My Drive/AQ_latest2.tiff'

In [98]:
image1 = gdal.Open(image1_path)
image2 = gdal.Open(image2_path)

image1_pm25_values_layer1 = image1.GetRasterBand(1).ReadAsArray()
image1_pm25_values_layer4 = image1.GetRasterBand(4).ReadAsArray()

image2_pm25_values_layer1 = image2.GetRasterBand(1).ReadAsArray()
image2_pm25_values_layer4 = image2.GetRasterBand(4).ReadAsArray()

In [99]:
print("Pm average max:", np.max(image1_pm25_values_layer1))
print("Pm average min:", np.min(image1_pm25_values_layer1[image1_pm25_values_layer1 != 0]))

Pm average max: 12.49835649
Pm average min: 2.061713825


In [100]:
non_zero_values1 = image1_pm25_values_layer4[image1_pm25_values_layer4 != 0]
non_zero_values2 = image2_pm25_values_layer4[image2_pm25_values_layer4 != 0]

key1 = min(np.unique(non_zero_values1))
key2 = min(np.unique(non_zero_values2))

print("Key 1:", key1)
print("Key 2:", key2)

Key 1: 7.0
Key 2: 7.0


In [101]:
lat_values1 = image1_pm25_values_layer1 = image1.GetRasterBand(2).ReadAsArray()
long_values1 = image1_pm25_values_layer1 = image1.GetRasterBand(3).ReadAsArray()

lat1 = lat_values1[lat_values1 != 0]
long1 = long_values1[long_values1 != 0]

print("Latitude 1:", lat1)
print("Longitude 1:", long1)

Latitude 1: [40.90415955 40.90415955 40.90415955 ... 40.7118988  40.7118988
 40.7118988 ]
Longitude 1: [-73.88786316 -73.88786316 -73.88786316 ... -73.93455505 -73.93455505
 -73.93455505]


In [102]:
#CHECKING
band1_values = image1.GetRasterBand(1).ReadAsArray()
band2_values = image1.GetRasterBand(2).ReadAsArray()
band3_values = image1.GetRasterBand(3).ReadAsArray()

band1_values = band1_values[band3_values != 0]
band2_values = band2_values[band3_values != 0]
band3_values = band3_values[band3_values != 0]

print(len(band1_values))
print(len(band2_values))
print(len(band3_values))

band4_values = image1.GetRasterBand(4).ReadAsArray()
print(np.unique(band4_values))

206245
206245
206245
[0. 7.]


In [103]:
!pip install pygeohash



In [104]:
import pygeohash as pgh
def encode_geohash(latitude, longitude, key=key1):
    return pgh.encode(latitude, longitude, key)

In [105]:
import pandas as pd

def DataframeGenerator(image1):
  band1_values = image1.GetRasterBand(1).ReadAsArray()
  band2_values = image1.GetRasterBand(2).ReadAsArray()
  band3_values = image1.GetRasterBand(3).ReadAsArray()

  # Get dimensions of the raster
  rows, cols = band1_values.shape

  # Create lists to store latitude, longitude, and pm2.5 values
  latitude = []
  longitude = []
  pm25_values = []

  # Iterate through each pixel
  for row in range(rows):
      for col in range(cols):
          # Append latitude, longitude, and pm2.5 values to lists
          latitude.append(band2_values[row, col])
          longitude.append(band3_values[row, col])
          pm25_values.append(band1_values[row, col])

  # Create DataFrame
  df = pd.DataFrame({'latitude': latitude, 'longitude': longitude, 'pm2.5 values': pm25_values})

  # Filter out rows with zero values
  df_filtered = df[(df != 0).all(axis=1)]

  df_filtered.reset_index(drop=True, inplace=True)

  # Apply the function to create the new "Geohash" column
  df_filtered['Geohash'] = df_filtered.apply(lambda row: encode_geohash(row['latitude'], row['longitude']), axis=1)

  df_filtered = df_filtered.drop_duplicates(subset='Geohash')
  df_filtered.reset_index(drop=True, inplace=True)

  return df_filtered

In [106]:
df1 = DataframeGenerator(image1)
df2 = DataframeGenerator(image2)

# Drop one row to observe RMSE_Calculator behavior when there are mismatched geohashes in one of the dataframes
df1 = df1.drop(1)
df1.reset_index(drop=True, inplace=True)

print(df1)
print(df2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Geohash'] = df_filtered.apply(lambda row: encode_geohash(row['latitude'], row['longitude']), axis=1)


       latitude  longitude  pm2.5 values  Geohash
0     40.904160 -73.887863      3.257859  dr72wwk
1     40.904160 -73.896103      3.496644  dr72wqq
2     40.904160 -73.894730      4.089306  dr72wqr
3     40.902786 -73.887863      3.624676  dr72wwh
4     40.902786 -73.886490      3.962771  dr72wwj
...         ...        ...           ...      ...
1162  40.783310 -73.929062      3.084182  dr72j8q
1163  40.746231 -73.845291      9.393966  dr5rz9e
1164  40.744858 -73.845291     10.670793  dr5rz97
1165  40.740738 -73.898849      3.084182  dr5ry2s
1166  40.711899 -73.934555      3.187280  dr5rte6

[1167 rows x 4 columns]
       latitude  longitude  pm2.5 values  Geohash
0     40.904160 -73.887863      3.115973  dr72wwk
1     40.904160 -73.886490      3.170306  dr72wwm
2     40.904160 -73.896103      3.629661  dr72wqq
3     40.904160 -73.894730      3.898599  dr72wqr
4     40.902786 -73.887863      3.146986  dr72wwh
...         ...        ...           ...      ...
1163  40.783310 -73.92906

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Geohash'] = df_filtered.apply(lambda row: encode_geohash(row['latitude'], row['longitude']), axis=1)


In [107]:
import pandas as pd

def Geohash_Mismatch_Fixer(df1, df2):
  geohashes_df1 = set(df1['Geohash'])
  geohashes_df2 = set(df2['Geohash'])

  missing_in_df1 = geohashes_df2 - geohashes_df1
  missing_in_df2 = geohashes_df1 - geohashes_df2

  missing_df1 = pd.DataFrame({'Geohash': list(missing_in_df1), 'pm2.5 values': [0] * len(missing_in_df1), 'latitude': [0] * len(missing_in_df1), 'longitude': [0] * len(missing_in_df1)})
  df1 = pd.concat([df1, missing_df1], ignore_index=True)

  missing_df2 = pd.DataFrame({'Geohash': list(missing_in_df2), 'pm2.5 values': [0] * len(missing_in_df2), 'latitude': [0] * len(missing_in_df2), 'longitude': [0] * len(missing_in_df2)})
  df2 = pd.concat([df2, missing_df2], ignore_index=True)

  return df1, df2

df1, df2 = Geohash_Mismatch_Fixer(df1, df2)

In [108]:
#print(df1,df2)
#merged_df = pd.merge(df1, df2, on='Geohash', suffixes=('_1', '_2'))
#print(merged_df)

In [109]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

def RMSE_Calculator(df1, df2):
  merged_df = pd.merge(df1, df2, on='Geohash', suffixes=('_1', '_2'))

  merged_df['squared_diff'] = (merged_df['pm2.5 values_1'] - merged_df['pm2.5 values_2'])**2

  mse = merged_df['squared_diff'].mean()

  rmse = np.sqrt(mse)

  return rmse

print(RMSE_Calculator(df1, df2))

0.6102206627448062


In [110]:
# prompt: search for geohash dr72wwm in merged_df

geohash_to_find = "dr72wwm"
found_row = merged_df.loc[merged_df['Geohash'] == geohash_to_find]

if not found_row.empty:
    print(f"Geohash {geohash_to_find} found:")
    print(found_row)
else:
    print(f"Geohash {geohash_to_find} not found in the merged DataFrame.")

print(df1)
print(df2)
print(merged_df)

Geohash dr72wwm found:
      latitude_1  longitude_1  pm2.5 values_1  Geohash  latitude_2  \
1167         0.0          0.0             0.0  dr72wwm    40.90416   

      longitude_2  pm2.5 values_2  
1167    -73.88649        3.170306  
       latitude  longitude  pm2.5 values  Geohash
0     40.904160 -73.887863      3.257859  dr72wwk
1     40.904160 -73.896103      3.496644  dr72wqq
2     40.904160 -73.894730      4.089306  dr72wqr
3     40.902786 -73.887863      3.624676  dr72wwh
4     40.902786 -73.886490      3.962771  dr72wwj
...         ...        ...           ...      ...
1163  40.746231 -73.845291      9.393966  dr5rz9e
1164  40.744858 -73.845291     10.670793  dr5rz97
1165  40.740738 -73.898849      3.084182  dr5ry2s
1166  40.711899 -73.934555      3.187280  dr5rte6
1167   0.000000   0.000000      0.000000  dr72wwm

[1168 rows x 4 columns]
       latitude  longitude  pm2.5 values  Geohash
0     40.904160 -73.887863      3.115973  dr72wwk
1     40.904160 -73.886490      3.17030

In [111]:
#squared_diff = (pm25_values_image1 - pm25_values_image2) ** 2

#mean_squared_diff = np.mean(squared_diff)

#rmse = np.sqrt(mean_squared_diff)

#print("RMSE:", rmse)

In [112]:
'''
main reading function - take reference image, iteratively go through each image in the folder, and store RMSE results in an array to pick the lowest
'''

'\nmain reading function - take reference image, iteratively go through each image in the folder, and store RMSE results in an array to pick the lowest\n'