In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from osgeo import gdal
import numpy as np
image1_path = '/content/drive/My Drive/Foundations_of_data_Science/Project/Newfolder/AQ_NYC_sample_1.tiff'
image2_path = '/content/drive/My Drive/Foundations_of_data_Science/Project/Newfolder/AQ_NYC_sample_2.tiff'

In [6]:
image1 = gdal.Open(image1_path)
image2 = gdal.Open(image2_path)

image1_pm25_values_layer1 = image1.GetRasterBand(1).ReadAsArray()
image1_pm25_values_layer4 = image1.GetRasterBand(4).ReadAsArray()

image2_pm25_values_layer1 = image2.GetRasterBand(1).ReadAsArray()
image2_pm25_values_layer4 = image2.GetRasterBand(4).ReadAsArray()

In [7]:
# prompt: find max of image1_pm25_values_layer1

print("Pm average max:", np.max(image1_pm25_values_layer1))
print("Pm average min:", np.min(image1_pm25_values_layer1[image1_pm25_values_layer1 != 0]))

Pm average max: 12.49835649
Pm average min: 2.061713825


In [33]:
non_zero_values1 = image1_pm25_values_layer4[image1_pm25_values_layer4 != 0]
non_zero_values2 = image2_pm25_values_layer4[image2_pm25_values_layer4 != 0]

key1 = min(np.unique(non_zero_values1))
key2 = min(np.unique(non_zero_values2))

print("Key 1:", key1)
print("Key 2:", key2)

Key 1: 7.0
Key 2: 7.0


In [19]:
lat_values1 = image1_pm25_values_layer1 = image1.GetRasterBand(2).ReadAsArray()
long_values1 = image1_pm25_values_layer1 = image1.GetRasterBand(3).ReadAsArray()

lat1 = lat_values1[lat_values1 != 0]
long1 = long_values1[long_values1 != 0]

print("Latitude 1:", lat1)
print("Longitude 1:", long1)


Latitude 1: [40.90415955 40.90415955 40.90415955 ... 40.7118988  40.7118988
 40.7118988 ]
Longitude 1: [-73.88786316 -73.88786316 -73.88786316 ... -73.93455505 -73.93455505
 -73.93455505]


In [29]:
#CHECKING
band1_values = image1.GetRasterBand(1).ReadAsArray()
band2_values = image1.GetRasterBand(2).ReadAsArray()
band3_values = image1.GetRasterBand(3).ReadAsArray()

band1_values = band1_values[band3_values != 0]
band2_values = band2_values[band3_values != 0]
band3_values = band3_values[band3_values != 0]

print(len(band1_values))
print(len(band2_values))
print(len(band3_values))

band4_values = image1.GetRasterBand(4).ReadAsArray()
print(np.unique(band4_values))

206245
206245
206245
[0. 7.]


In [21]:
band1_values = image1.GetRasterBand(1).ReadAsArray()
band2_values = image1.GetRasterBand(2).ReadAsArray()
band3_values = image1.GetRasterBand(3).ReadAsArray()

# Get dimensions of the raster
rows, cols = band1_values.shape

# Create lists to store latitude, longitude, and pm2.5 values
latitude = []
longitude = []
pm25_values = []

# Iterate through each pixel
for row in range(rows):
    for col in range(cols):
        # Append latitude, longitude, and pm2.5 values to lists
        latitude.append(band2_values[row, col])
        longitude.append(band3_values[row, col])
        pm25_values.append(band1_values[row, col])

# Create DataFrame
df = pd.DataFrame({'latitude': latitude, 'longitude': longitude, 'pm2.5 values': pm25_values})

# Display DataFrame
print(df)

         latitude  longitude  pm2.5 values
0             0.0        0.0           0.0
1             0.0        0.0           0.0
2             0.0        0.0           0.0
3             0.0        0.0           0.0
4             0.0        0.0           0.0
...           ...        ...           ...
2056819       0.0        0.0           0.0
2056820       0.0        0.0           0.0
2056821       0.0        0.0           0.0
2056822       0.0        0.0           0.0
2056823       0.0        0.0           0.0

[2056824 rows x 3 columns]


In [26]:
# Filter out rows with zero values
df_filtered = df[(df != 0).all(axis=1)]

df_filtered.reset_index(drop=True, inplace=True)

print(df_filtered)

         latitude  longitude  pm2.5 values
0       40.904160 -73.887863      4.326005
1       40.904160 -73.887863      4.326005
2       40.904160 -73.887863      4.326005
3       40.904160 -73.887863      4.326005
4       40.904160 -73.887863      4.326005
...           ...        ...           ...
206240  40.711899 -73.934555      3.187280
206241  40.711899 -73.934555      3.187280
206242  40.711899 -73.934555      3.187280
206243  40.711899 -73.934555      3.187280
206244  40.711899 -73.934555      3.187280

[206245 rows x 3 columns]


In [10]:
!pip install pygeohash



In [34]:
import pygeohash as pgh

def encode_geohash(latitude, longitude, key=key1):
    return pgh.encode(latitude, longitude, key)

# Apply the function to create the new "Geohash" column
df_filtered['Geohash'] = df_filtered.apply(lambda row: encode_geohash(row['latitude'], row['longitude']), axis=1)

# Display the DataFrame with the new "Geohash" column
print(df_filtered)

         latitude  longitude  pm2.5 values  Geohash
0       40.904160 -73.887863      4.326005  dr72wwk
1       40.904160 -73.887863      4.326005  dr72wwk
2       40.904160 -73.887863      4.326005  dr72wwk
3       40.904160 -73.887863      4.326005  dr72wwk
4       40.904160 -73.887863      4.326005  dr72wwk
...           ...        ...           ...      ...
206240  40.711899 -73.934555      3.187280  dr5rte6
206241  40.711899 -73.934555      3.187280  dr5rte6
206242  40.711899 -73.934555      3.187280  dr5rte6
206243  40.711899 -73.934555      3.187280  dr5rte6
206244  40.711899 -73.934555      3.187280  dr5rte6

[206245 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Geohash'] = df_filtered.apply(lambda row: encode_geohash(row['latitude'], row['longitude']), axis=1)


In [37]:
df_filtered = df_filtered.drop_duplicates(subset='Geohash')
df_filtered.reset_index(drop=True, inplace=True)
print(df_filtered)

       latitude  longitude  pm2.5 values  Geohash
0     40.904160 -73.887863      4.326005  dr72wwk
1     40.904160 -73.886490      3.115887  dr72wwm
2     40.904160 -73.896103      3.521091  dr72wqq
3     40.904160 -73.894730      3.960894  dr72wqr
4     40.902786 -73.887863      3.304012  dr72wwh
...         ...        ...           ...      ...
1163  40.783310 -73.929062      3.084182  dr72j8q
1164  40.746231 -73.845291      9.379833  dr5rz9e
1165  40.744858 -73.845291     10.647235  dr5rz97
1166  40.740738 -73.898849      3.084182  dr5ry2s
1167  40.711899 -73.934555      3.187280  dr5rte6

[1168 rows x 4 columns]


In [42]:
geohash_to_find = 'dr78856'
row_with_geohash = df_filtered[df_filtered['Geohash'] == geohash_to_find]
print(row_with_geohash)


    latitude  longitude  pm2.5 values  Geohash
42  40.88768 -73.824692      2.726714  dr78856


In [None]:
#squared_diff = (pm25_values_image1 - pm25_values_image2) ** 2

#mean_squared_diff = np.mean(squared_diff)

#rmse = np.sqrt(mean_squared_diff)

#print("RMSE:", rmse)

In [None]:
'''
main reading function - take reference image, iteratively go through each image in the folder, and store RMSE results in an array to pick the lowest
'''