# ***Urban Heat Island (UHI) Index using features from the Sentinel-2 satellite dataset as predictor variables***

## Loading in Dependencies

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# installing modules to the environment
!pip install geopandas

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd
from shapely.geometry import Point

# Geospatial operations
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
from rasterio.warp import transform_bounds
from rasterio.windows import from_bounds

#Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import os
from tqdm import tqdm

## Response Variable

### Loading training data

In [None]:
# Load the ground-truth training observations; the `UHI Index` column of this table
# is the model target used later in the notebook
ground_df = pd.read_csv("Training_data_uhi_index_2025-02-18.csv")
# Preview the first rows
ground_df.head()

## Predictor variables

### Loading and Visualising the GeoTIFF Image

In [None]:
# Read and plot the eight bands stored in the GeoTIFF file.
# Band order in the file is assumed to be B01, B03, B04, B05, B06, B07, B08, B11
# (file bands 1-8) -- confirm with `src.descriptions` if the file is regenerated.
tiff_path = "S2_sample.tiff"

# Optional inspection of the raster (uncomment to run):
# data = rxr.open_rasterio(tiff_path)
# print("Dimensions:", data.dims)
# print(data.coords)
# print(data.attrs)
# with rasterio.open(tiff_path) as src:
#     print(src.meta)
#     print("Dimensions (width, height):", src.width, src.height)
#     print("CRS:", src.crs)
#     print("Number of Bands:", src.count)
#     for i in range(1, src.count + 1):
#         print(f"Band {i}:", src.descriptions[i - 1])

band_labels = ['B01', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B11']

# Read file bands 1..8 in a single call; the result has shape (8, height, width)
with rasterio.open(tiff_path) as src1:
    bands = src1.read(list(range(1, len(band_labels) + 1)))

# Keep the individual band names available for any later cell that refers to them
band1, band2, band3, band4, band5, band6, band7, band8 = bands

# Plot the bands in a 2x4 grid, one panel (with its own colorbar) per band
fig, axes = plt.subplots(2, 4, figsize=(10, 10))

# Flatten the axes for easier indexing
axes = axes.flatten()

for ax, label, band in zip(axes, band_labels, bands):
    im = ax.imshow(band, cmap='viridis')
    ax.set_title(f'Band [{label}]')
    fig.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

### 

### Extracting Band Values from the GeoTIFF Image

In [None]:
# Path to the training CSV (only the path is stored here; nothing is read into memory in this cell)
csv_path = "Training_data_uhi_index_2025-02-18.csv"

In [None]:
def map_satellite_data(tiff_path, csv_path):
    """Sample the eight GeoTIFF bands at the point locations listed in a CSV file.

    This is a single-point extraction: each CSV row's ``Longitude``/``Latitude``
    (interpreted as EPSG:4326 degrees) is reprojected into the raster's CRS and
    the value of the nearest pixel is taken for every band.

    Parameters
    ----------
    tiff_path : str
        Path to a GeoTIFF whose ``band`` coordinate contains the labels 1-8.
        They are mapped, in that order, to B01, B03, B04, B05, B06, B07, B08
        and B11 (the file's band order is assumed, not checked).
    csv_path : str
        Path to a CSV file with ``Latitude`` and ``Longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        Float columns ``B01, B03, B04, B05, B06, B07, B08, B11`` with exactly
        one row per CSV row, in CSV order, so the result can be joined to the
        CSV by position. Rows whose coordinates are missing, non-numeric, not
        transformable, or outside the raster bounds hold NaN in every band
        (nearest-pixel lookup alone would silently return an edge pixel).

    Raises
    ------
    ValueError
        If the GeoTIFF carries no CRS.
    KeyError
        If the CSV lacks the coordinate columns or the raster lacks one of
        the band labels 1-8.
    """
    band_names = ('B01', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B11')
    band_labels = list(range(1, len(band_names) + 1))

    # Read the CSV file using pandas
    df = pd.read_csv(csv_path)

    # Diagnostics on the input table: missing values, dtypes, first coordinates
    print(df.isnull().sum())
    print(df.dtypes)
    print(df[['Latitude', 'Longitude']].head())

    # Non-numeric entries become NaN and are reported as unusable points below
    latitudes = pd.to_numeric(df['Latitude'], errors='coerce').to_numpy(dtype=float)
    longitudes = pd.to_numeric(df['Longitude'], errors='coerce').to_numpy(dtype=float)
    n_points = len(df)
    print(f"Number of coordinates to process: {n_points}")
    print(f"Example lat/lon pairs: {list(zip(latitudes, longitudes))[:5]}")

    # Pre-filled with NaN so every CSV row keeps its slot even when it cannot be sampled
    band_values = np.full((len(band_names), n_points), np.nan)

    # The context manager releases the raster file handle once sampling is done
    with rxr.open_rasterio(tiff_path) as data:
        tiff_crs = data.rio.crs
        if tiff_crs is None:
            raise ValueError(f"GeoTIFF {tiff_path!r} has no CRS; cannot reproject the coordinates")

        # Outer bounds of the raster in its own CRS
        left, bottom, right, top = data.rio.bounds()
        print((left, bottom, right, top))

        if n_points:
            # EPSG:4326 (WGS84 lon/lat) -> raster CRS; always_xy=True means (lon, lat) order
            transformer = Transformer.from_crs('EPSG:4326', tiff_crs, always_xy=True)
            xs, ys = transformer.transform(longitudes, latitudes)
            xs = np.asarray(xs, dtype=float)
            ys = np.asarray(ys, dtype=float)

            # Only finite points inside the raster extent are sampled
            inside = (
                np.isfinite(xs) & np.isfinite(ys)
                & (xs >= left) & (xs <= right)
                & (ys >= bottom) & (ys <= top)
            )
            n_outside = n_points - int(inside.sum())
            if n_outside:
                print(f"Coordinates missing or out of bounds: {n_outside} of {n_points} rows set to NaN")

            if inside.any():
                # Bands are selected by exact label first (a missing band raises KeyError);
                # nearest-neighbour matching is applied to x/y only. Indexing x and y with
                # DataArrays that share the 'point' dimension performs a pointwise lookup.
                sampled = (
                    data.sel(band=band_labels)
                    .sel(
                        x=xr.DataArray(xs[inside], dims='point'),
                        y=xr.DataArray(ys[inside], dims='point'),
                        method='nearest',
                    )
                    .transpose('band', 'point')
                )
                band_values[:, inside] = np.asarray(sampled.values, dtype=float)

    # One column per band, one row per CSV row
    return pd.DataFrame(dict(zip(band_names, band_values)))

In [None]:
# Sample the GeoTIFF band values at the training locations
final_data = map_satellite_data(
    tiff_path='S2_sample.tiff',
    csv_path='Training_data_uhi_index_2025-02-18.csv',
)

In [None]:
# Quick look at the extracted band values: first rows, then summary statistics
for make_summary in (final_data.head, final_data.describe):
    print(make_summary())

#### Median composite

In [None]:
# create the median composite
#data = rxr.open_rasterio("S2_sample.tiff")
#data = data.rio.write_crs("EPSG:4326")

In [None]:
# calculate median composite along the band dimension
#median_composite = data.median(dim="band").compute()
#median.rio.to_raster("median_composite.tiff")
#print(median_composite)
#print(median_composite.dims)
#print(median_composite.coords)

#### Calculate NDVI (Normalized Difference Vegetation Index)

In [None]:
# NDVI = (NIR - Red) / (NIR + Red), computed per sampled point with B08 as NIR and B04 as Red.
# The bands are cast to float first so the arithmetic is well defined whatever dtype the
# extraction produced (unsigned-integer pixel values would wrap around on subtraction).
nir = final_data['B08'].astype(float)
red = final_data['B04'].astype(float)
final_data['NDVI'] = (nir - red) / (nir + red)
# handle division by zero by replacing infinities with NaN
final_data['NDVI'] = final_data['NDVI'].replace([np.inf, -np.inf], np.nan)

#### Calculate NDBI (Normalized Difference Built-up Index)

In [None]:
# NDBI = (SWIR - NIR) / (SWIR + NIR), computed per sampled point with B11 as SWIR and B08 as NIR.
# The bands are cast to float first so the arithmetic is well defined whatever dtype the
# extraction produced (unsigned-integer pixel values would wrap around on subtraction).
swir = final_data['B11'].astype(float)
nir = final_data['B08'].astype(float)
final_data['NDBI'] = (swir - nir) / (swir + nir)
# handle division by zero by replacing infinities with NaN
final_data['NDBI'] = final_data['NDBI'].replace([np.inf, -np.inf], np.nan)

#### Calculate NDWI (Normalized Difference Water Index)

In [None]:
# NDWI = (Green - NIR) / (Green + NIR), computed per sampled point with B03 as Green and B08 as NIR.
# The bands are cast to float first so the arithmetic is well defined whatever dtype the
# extraction produced (unsigned-integer pixel values would wrap around on subtraction).
green = final_data['B03'].astype(float)
nir = final_data['B08'].astype(float)
final_data['NDWI'] = (green - nir) / (green + nir)
# handle division by zero by replacing infinities with NaN
final_data['NDWI'] = final_data['NDWI'].replace([np.inf, -np.inf], np.nan)

### Joining the predictor variables and response variables

In [None]:
# function to combine two datasets side by side (column-wise) using the pandas concat function.
def combine_two_datasets(dataset1, dataset2):
    '''
    Returns a horizontally concatenated dataset: the columns of dataset2 are
    placed to the right of the columns of dataset1.

    Rows are matched on their index labels (pandas.concat with axis=1), not on
    their position. A label present in only one input still produces a row,
    with NaN in the other input's columns, so both inputs should share the
    same index when a row-by-row pairing is intended.

    Attributes:
    dataset1 - Dataset 1 to be combined (left-hand columns)
    dataset2 - Dataset 2 to be combined (right-hand columns)
    '''
    return pd.concat([dataset1, dataset2], axis=1)

In [None]:
# Attach the satellite-derived predictors to the ground observations, column-wise
uhi_data = combine_two_datasets(dataset1=ground_df, dataset2=final_data)
uhi_data.head()

#### Removing duplicates

In [None]:
# De-duplicate rows on the satellite-derived columns only, keeping the first row of each group
columns_to_check = ['B01','B03','B04','B05', 'B06','B07', 'B08','B11','NDVI','NDBI','NDWI']


def _as_hashable(value):
    """Return arrays that have at least one dimension as tuples; pass every other value through."""
    if isinstance(value, np.ndarray) and value.ndim > 0:
        return tuple(value)
    return value


# Presumably needed because array-valued cells cannot be compared by drop_duplicates
for col in columns_to_check:
    uhi_data[col] = uhi_data[col].apply(_as_hashable)

# now remove duplicates
uhi_data = uhi_data.drop_duplicates(subset=columns_to_check, keep='first')
uhi_data.head()

In [None]:
# Renumber the rows from 0 after de-duplication; drop=True discards the old index
# instead of keeping it as a column
uhi_data=uhi_data.reset_index(drop=True)

## Model Building

In [None]:
# Keep only the model inputs (four bands plus NDVI) and the target column
model_columns = ['B01', 'B05', 'B06', 'B07', 'NDVI', 'UHI Index']
uhi_data = uhi_data[model_columns]

### Split data for training and testing

In [None]:
# Target vector: the UHI Index column as a NumPy array
Y = uhi_data['UHI Index'].values
# Feature matrix: every remaining column as a NumPy array
X = uhi_data.drop(columns=['UHI Index']).values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=123
)

### Feature Scaling

In [None]:
# Standardise the features: the scaler's statistics are fitted on the training split only
# and then applied unchanged to the test split.
# NOTE: X_train and X_test are overwritten with their scaled versions, so re-running this
# cell without first re-running the split cell fits `sc` on already-scaled data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model Training

In [None]:
# Random forest regressor with 100 trees; the fixed random_state keeps training repeatable
model = RandomForestRegressor(n_estimators=100, random_state=123)
# Fit the model on the training features and their UHI Index targets
model.fit(X_train, Y_train)

## Model Evaluation

### In-Sample Evaluation

In [None]:
# Predict on the same rows the model was fitted on; used for the in-sample score below
insample_predictions = model.predict(X_train)

In [None]:
# In-sample R-squared: training targets versus the model's predictions for those same rows
y_train = Y_train.tolist()
r2_score(y_true=y_train, y_pred=insample_predictions)

### Out-of-Sample Evaluation

In [None]:
# Predict on the held-out test rows, which the model did not see during fitting
outsample_predictions = model.predict(X_test)

In [None]:
# Out-of-sample R-squared: held-out targets versus the model's predictions for those rows
y_test = Y_test.tolist()
r2_score(y_true=y_test, y_pred=outsample_predictions)

## Submission

In [None]:
# Read the submission template; its Longitude/Latitude columns are reused when the
# submission dataframe is built further down
test_file = pd.read_csv('Submission_template_UHI2025-v2.csv')
# Preview the first rows
test_file.head()

In [None]:
# Sample the GeoTIFF band values at the submission locations
val_data = map_satellite_data(
    tiff_path='S2_sample.tiff',
    csv_path='Submission_template_UHI2025-v2.csv',
)

In [None]:
# Calculate NDVI (Normalized Difference Vegetation Index) for the submission points:
# NDVI = (NIR - Red) / (NIR + Red), with B08 as NIR and B04 as Red.
# The bands are cast to float first so the arithmetic is well defined whatever dtype the
# extraction produced (unsigned-integer pixel values would wrap around on subtraction).
val_nir = val_data['B08'].astype(float)
val_red = val_data['B04'].astype(float)
val_data['NDVI'] = (val_nir - val_red) / (val_nir + val_red)
# handle division by zero by replacing infinities with NaN
val_data['NDVI'] = val_data['NDVI'].replace([np.inf, -np.inf], np.nan)

In [None]:
# Preview the submission features, including the NDVI column added above
val_data.head()

In [None]:
# Select the five predictor columns, in this fixed order, that are passed to the scaler and model
feature_columns = ['B01','B05','B06','B07','NDVI']
submission_val_data=val_data.loc[:, feature_columns]
submission_val_data.head()

In [None]:
# Feature scaling: apply the scaler that was fitted on the training split (no re-fitting here).
# np.asarray accepts both the DataFrame and an already-converted array, so this cell can be
# re-run safely; calling `.values` on the rebound name raised AttributeError on a second run.
submission_val_data = np.asarray(submission_val_data)
transformed_submission_data = sc.transform(submission_val_data)

In [None]:
# Predict the UHI Index for every submission row from the scaled features
final_predictions = model.predict(transformed_submission_data)
# Wrap the prediction array in a pandas Series (default integer index)
final_prediction_series = pd.Series(final_predictions)

In [None]:
# Assemble the submission table: template coordinates alongside the predicted UHI Index
submission_columns = {
    'Longitude': test_file['Longitude'].values,
    'Latitude': test_file['Latitude'].values,
    'UHI Index': final_prediction_series.values,
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Preview the first rows of the submission dataframe
submission_df.head()

In [None]:
# Write the predictions to submission.csv; index=False leaves the row index out of the file
submission_df.to_csv("submission.csv", index = False)