In [None]:
!pip install geopy

In [None]:
# Folder holding the raw station CSV files; it is joined onto '../..' when the
# files are read.
# Note: some observations were recorded twice, once with and once without GPS data.
pcs_folder_path = 'Data/pcs'

# Destination of the combined output table (also joined onto '../..' when written).
output_file_path = 'Data/pcs/pcs.csv'

# Set to True to additionally save a lat/lon scatter plot for every station file.
plot_latlon_graph = False

In [None]:
import glob
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.distance import great_circle
from geopy.point import Point
from sklearn.cluster import DBSCAN

sys.path.append('../..')
from Utils.vpd import calculate_vpd

# Extract latlon of each site location using DBSCAN

In [None]:
def save_latlon_graph(df, station_name):
    """Save a scatter plot of a station's recorded GPS positions.

    Rows with a missing coordinate, or with the value 999 in either coordinate
    column (presumably a "no fix" placeholder), are left out of the plot. The
    image is written to ``latlon_plots/station_<station_name>``; the directory
    is created if it does not exist yet.

    Args:
        df: DataFrame with 'Stn_lat' and 'Stn_long' columns.
        station_name: Label used in the plot title and in the output file name.
    """
    df = df.dropna(subset=['Stn_lat', 'Stn_long'])
    df = df[(df['Stn_lat'] != 999) & (df['Stn_long'] != 999)]

    # Draw on a dedicated figure. Plotting on pyplot's implicit current figure
    # would stack the points of every station onto the same axes when this
    # function is called once per file.
    fig, ax = plt.subplots()
    try:
        ax.scatter(df['Stn_long'], df['Stn_lat'], s=1)
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.set_title(f'GPS Locations of station {station_name}')
        ax.grid(True)
        # savefig does not create missing directories
        os.makedirs('latlon_plots', exist_ok=True)
        fig.savefig(f'latlon_plots/station_{station_name}')
    finally:
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)


def find_station_location(df, station_name, radius_km=0.05, min_samples=10):
    """Cluster a station's GPS fixes with DBSCAN and return the cluster centres.

    Args:
        df: DataFrame with 'Stn_lat' and 'Stn_long' columns, in degrees.
        station_name: Label used only in the printed summary.
        radius_km: DBSCAN neighbourhood radius in kilometres (default 0.05,
            i.e. 50 metres).
        min_samples: DBSCAN ``min_samples`` setting (default 10).

    Returns:
        A tuple ``(locations, latlon_df)``. ``locations`` is indexed by cluster
        label (the noise label -1 is excluded) and holds the mean 'Stn_lat' and
        'Stn_long' of each cluster. ``latlon_df`` has one row per distinct
        usable coordinate pair together with its 'cluster' label. Both are
        empty when ``df`` contains no usable coordinates.
    """
    # Prepare coordinates: distinct, complete pairs only
    latlon_df = df[['Stn_lat', 'Stn_long']].drop_duplicates().dropna()
    # Drop the 999 values that save_latlon_graph also filters out; they are not
    # valid coordinates and must not take part in the distance computation.
    latlon_df = latlon_df[(latlon_df['Stn_lat'] != 999) & (latlon_df['Stn_long'] != 999)]
    coords = latlon_df.to_numpy()

    if len(coords) == 0:
        # DBSCAN.fit raises on an empty array; a file without any GPS fix
        # simply has no clusters.
        labels = np.empty(0, dtype=int)
    else:
        # Run DBSCAN. The haversine metric works on [lat, lon] in radians, so
        # the radius is converted from kilometres to radians as well.
        kms_per_radian = 6371.0088
        epsilon = radius_km / kms_per_radian
        db = DBSCAN(
            eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine'
        ).fit(np.radians(coords))
        labels = db.labels_
    latlon_df = latlon_df.assign(cluster=labels)

    # Get cluster centers
    locations = latlon_df.groupby('cluster')[['Stn_lat', 'Stn_long']].mean()
    locations = locations.loc[locations.index != -1]  # exclude noise
    print(f"locations fo station {station_name} are", locations)

    return locations, latlon_df

In [None]:
# Preview the station files that will be processed. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them for a reproducible listing.
sorted(glob.glob(os.path.join('..', '..', pcs_folder_path, 'fts*.csv')))

In [None]:
# Per-file results collected by the loop below:
#   locations_dates_df_list - one table per file with the cluster centres and
#                             the first/last observation date of each cluster
#   df_list                 - the observations of each file that fall in a cluster
locations_dates_df_list = []
df_list = []
# glob returns files in arbitrary, filesystem-dependent order. Sorting keeps the
# processing order reproducible across machines and runs, and with it everything
# derived from row order later on, such as the sequential SiteIDs.
for file_name in sorted(glob.glob(os.path.join('..', '..', pcs_folder_path, 'fts*.csv'))):
    df = pd.read_csv(file_name)
    print(f"Station names: {df['Station name'].unique()}")
    if df.empty:
        # A file without rows has no station name and nothing to cluster
        continue
    # The first row's name labels the whole file. Positional access does not
    # depend on the index labels.
    station_name = df['Station name'].iloc[0]

    if plot_latlon_graph:
        save_latlon_graph(df, station_name)
    locations, latlon_df = find_station_location(df, station_name)
    locations_df = locations.reset_index()

    # Find the start and end dates of each cluster
    df = df.merge(latlon_df, how='left', on=['Stn_lat', 'Stn_long'])
    df['Date'] = pd.to_datetime(df['Date'])
    start_end_dates_df = (
        df.groupby('cluster')['Date']
        .agg(['min', 'max'])
        .reset_index()
        .rename(columns={'min': 'start_date_UTC', 'max': 'end_date_UTC'})
    )
    # Left merge from the cluster centres: the noise cluster (-1) is not among
    # them, so its dates are dropped here.
    locations_dates_df = locations_df.merge(start_end_dates_df, on='cluster', how='left')

    # Save locations_dates_df together in a list
    locations_dates_df['Station_name'] = station_name
    locations_dates_df_list.append(locations_dates_df)

    # Save important site data together in a list, keeping only observations
    # that belong to a real cluster (not unmatched, not noise)
    df = df[['Date', 'T', 'H', 'MF', 'MS', 'Station name', 'cluster']]
    df_list.append(df[df['cluster'].notna() & (df['cluster'] != -1)])

In [None]:
# Stack the per-file cluster summaries into one table (fresh 0..n-1 index) and show it

sites_info_df = pd.concat(objs=locations_dates_df_list, axis=0, ignore_index=True)
sites_info_df

In [None]:
# Clean and save sites_info_df

# Reduce the cluster start/end timestamps to calendar dates. Going through
# pd.to_datetime keeps this cell re-runnable: after the first run the columns
# hold date objects, on which the bare `.dt` accessor would raise.
sites_info_df['start_date_UTC'] = pd.to_datetime(sites_info_df['start_date_UTC']).dt.date
sites_info_df['end_date_UTC'] = pd.to_datetime(sites_info_df['end_date_UTC']).dt.date
# The cluster label is left out of the exported file
sites_info_df.drop(columns=['cluster']).to_csv("sites_latlon_dates.csv", index=False)

# Prepare pcs.csv

In [None]:
# Give every row of sites_info_df (one row per station cluster) a sequential
# identifier based on its position: pcs_1, pcs_2, ...

sites_info_df['SiteID'] = [f'pcs_{i+1}' for i in range(sites_info_df.shape[0])]
sites_info_df

In [None]:
# Combine all site data together and get the SiteID

df_all = pd.concat(df_list, ignore_index=True)
# Attach each observation's SiteID and cluster-centre coordinates through its
# (station name, cluster) pair. validate='many_to_one' makes pandas raise a
# MergeError if that pair is not unique in sites_info_df (e.g. two input files
# sharing a station name) instead of silently duplicating observation rows.
df_all = df_all.merge(
    sites_info_df[['Station_name', 'cluster', 'SiteID', 'Stn_lat', 'Stn_long']],
    left_on=['Station name', 'cluster'],
    right_on=['Station_name', 'cluster'],
    how='left',
    validate='many_to_one',
)
df_all.head()

In [None]:
# Rename the raw column names to the output schema, then derive VPD row by row
# from the temperature and relative-humidity columns.
df_all = df_all.rename(
    columns={
        'Stn_long': 'X',
        'Stn_lat': 'Y',
        'Date': 'Datetime',
        'T': 'Temperature',
        'H': 'RH',
        'MF': 'DFMC',
        'MS': 'Soil_mois',
    }
)
df_all['VPD'] = df_all.apply(
    lambda obs: calculate_vpd(obs['Temperature'], obs['RH']),
    axis='columns',
)
df_all.head()

In [None]:
# Keep only the output columns, in their final order, and write the table to disk

df_all = df_all.loc[
    :, ['SiteID', 'X', 'Y', 'Datetime', 'Temperature', 'RH', 'VPD', 'DFMC', 'Soil_mois']
]
df_all.to_csv(path_or_buf=os.path.join('..', '..', output_file_path), index=False)
df_all

In [None]:
# Show every row that has a missing value in at least one column
df_all.loc[df_all.isnull().any(axis='columns')]