# Nearest Station Merging 1

This code merges the processed forest data with the processed climate data, using a GeoPandas function that joins each forest point to the climate station nearest to it.

In [None]:
import os
import pandas as pd
import geopandas as gpd
from datetime import datetime


# Load the monthly climate records and parse 'Date' so that it can be compared
# against the datetime objects built from the forest filenames further down.
climate_csv = 'C:/Users/sixte/University of Toronto/Antoine Pepin - Big Project/Data/ClimateMonthly/all_filtered_monthly.csv'
climate_data = pd.read_csv(climate_csv, low_memory=False)
climate_data['Date'] = pd.to_datetime(climate_data['Date'])

# Turn each record into a point built from its Longitude/Latitude columns
# (declared as EPSG:4269), then reproject to EPSG:3347 -- the same CRS the
# forest points are converted to before the nearest-neighbour join.
station_points = gpd.points_from_xy(climate_data['Longitude'], climate_data['Latitude'])
climate_data = gpd.GeoDataFrame(climate_data, geometry=station_points, crs=4269)
climate_data = climate_data.to_crs(3347)

# Folder that is walked for the per-month forest parquet files, and the folder
# that receives the merged results.
directory_path = 'C:/Users/sixte/University of Toronto/Antoine Pepin - Big Project/Data/Final DataFrames'
output_directory = 'C:/Users/sixte/University of Toronto/Antoine Pepin - Big Project/Data/Full Final DataFrames'

# Merge every monthly forest parquet file found under `directory_path` with the
# climate records of the same month, attaching to each forest point the climate
# row whose geometry is nearest, and write one "<name>_merged.parquet" per
# input file into `output_directory`.

# Create the destination folder up front so the first write cannot fail just
# because the directory is missing.
os.makedirs(output_directory, exist_ok=True)

for root, _dirs, files in os.walk(directory_path):
    for filename in files:
        if not filename.endswith('.parquet'):
            continue

        file_path = os.path.join(root, filename)

        if os.path.getsize(file_path) == 0:
            print(f"Skipping empty file: {filename}")
            continue

        # Filenames include info on contents, namely year and month; they are
        # parsed here as "<year>_<month>.parquet".
        file_components = filename.split('_')
        try:
            year = int(file_components[0])
            month = int(file_components[1].split('.')[0])
            # All months are stored under the first day of the month.
            date = datetime(year, month, 1)
        except (IndexError, ValueError) as e:
            # A file that does not follow the naming scheme is skipped with a
            # message, like empty or unreadable files, instead of aborting
            # the whole batch.
            print(f"Skipping file with unexpected name {filename}: {e}")
            continue

        # Read the file exactly once; an unreadable file is reported and skipped.
        try:
            dyear = pd.read_parquet(file_path, engine="pyarrow")
        except Exception as e:
            print(f"Error reading file {filename}: {e}")
            continue

        # Keep any stored 'geometry' column under the name 'pixel' so it is not
        # replaced by the point geometry built from lon/lat below.
        dyear.rename(columns={'geometry': 'pixel'}, inplace=True)
        dyear = gpd.GeoDataFrame(
            dyear,
            geometry=gpd.points_from_xy(dyear['lon'], dyear['lat']),
            crs=4269,
        )
        # Same CRS as climate_data, so both sides of the join share coordinates.
        dyear.to_crs(3347, inplace=True)

        # Perform spatial join using sjoin_nearest, restricted to the climate
        # rows of this file's month; how='left' keeps every forest point.
        climate_for_month = climate_data[climate_data['Date'] == date]
        merged_data = gpd.sjoin_nearest(dyear, climate_for_month, how='left')

        # Save the merged_data to a parquet file in the output directory.
        # NOTE(review): os.walk recurses into subfolders but the output name
        # uses only the filename, so same-named files from different
        # subfolders would overwrite each other -- confirm names are unique.
        output_filename = f"{filename.split('.')[0]}_merged.parquet"
        output_path = os.path.join(output_directory, output_filename)
        merged_data.to_parquet(output_path, engine="pyarrow")

        print(f'Done with {output_filename}')