In [None]:
!pip install geopy

# Extract the latitude/longitude of each site location using DBSCAN

In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.distance import great_circle
from geopy.point import Point
from sklearn.cluster import DBSCAN


def save_latlon_graph(df, station_name):
    """Save a scatter plot of the GPS positions recorded for one station.

    Rows with a missing coordinate, or with the value 999 in either coordinate
    column (presumably a missing-value placeholder), are left out of the plot.
    The image is written to ``latlon_plots/station_<station_name>``; the
    directory is created if it does not exist yet.

    Args:
        df: DataFrame with 'Stn_lat' and 'Stn_long' columns.
        station_name: Label used in the plot title and in the output file name.
    """
    df = df.dropna(subset=['Stn_lat', 'Stn_long'])
    df = df[(df['Stn_lat'] != 999) & (df['Stn_long'] != 999)]

    # Draw on a dedicated figure. The implicit pyplot figure is shared between
    # calls, so when this function runs in a loop every saved image would also
    # contain the points of all previously plotted stations.
    fig, ax = plt.subplots()
    ax.scatter(df['Stn_long'], df['Stn_lat'], s=1)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title(f'GPS Locations of station {station_name}')
    ax.grid(True)

    # savefig does not create missing directories.
    os.makedirs('latlon_plots', exist_ok=True)
    fig.savefig(f'latlon_plots/station_{station_name}')

    # Release the figure so a long run over many stations does not accumulate
    # open figures in memory.
    plt.close(fig)


def find_station_location(df, station_name):
    """Cluster a station's unique GPS fixes with DBSCAN and return the cluster centres.

    Rows with a missing coordinate, or with the value 999 in either coordinate
    column (the same placeholder filter used by ``save_latlon_graph``), are
    discarded before clustering so they cannot distort the haversine distances.

    Args:
        df: DataFrame with 'Stn_lat' and 'Stn_long' columns, in degrees.
        station_name: Label used only in the printed summary.

    Returns:
        A tuple ``(locations, latlon_df)``:
        * ``locations`` - DataFrame indexed by cluster label holding the mean
          'Stn_lat' / 'Stn_long' of each cluster; the noise label -1 is excluded.
        * ``latlon_df`` - the unique, usable coordinate pairs with an added
          'cluster' column (-1 marks noise). Empty when no usable coordinates
          exist.
    """
    # Prepare coordinates: unique, non-missing, non-placeholder pairs.
    latlon_df = df[['Stn_lat', 'Stn_long']].drop_duplicates().dropna()
    usable = (latlon_df['Stn_lat'] != 999) & (latlon_df['Stn_long'] != 999)
    # .copy() so the 'cluster' column below is added to an independent frame
    # rather than to a slice of the caller's data.
    latlon_df = latlon_df[usable].copy()

    if latlon_df.empty:
        # DBSCAN.fit rejects an empty sample array; there is nothing to cluster.
        labels = np.empty(0, dtype=int)
    else:
        # Run DBSCAN. The haversine metric works on [lat, lon] in radians, so
        # the 50 m neighbourhood radius is expressed as an angle on the sphere.
        kms_per_radian = 6371.0088
        epsilon = 0.05 / kms_per_radian  # 50 meters radius
        db = DBSCAN(
            eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine'
        ).fit(np.radians(latlon_df.to_numpy()))
        labels = db.labels_
    latlon_df['cluster'] = labels

    # Get cluster centers
    locations = latlon_df.groupby('cluster')[['Stn_lat', 'Stn_long']].mean()
    locations = locations.loc[locations.index != -1]  # exclude noise
    print(f"locations fo station {station_name} are", locations)

    return locations, latlon_df


# Build one row per detected site location (DBSCAN cluster) for every station file.
# glob returns matches in arbitrary order; sorting keeps the processing order,
# and therefore the row order of the result, stable between runs.
station_files = sorted(glob.glob("fts*.csv"))
if not station_files:
    # Without this check pd.concat below fails with an unrelated-looking error.
    raise FileNotFoundError('No station files matching "fts*.csv" were found')

locations_dates_df_list = []
for file_name in station_files:
    df = pd.read_csv(file_name)
    print(f"Station names: {df['Station name'].unique()}")
    # The first row's name labels the whole file; the print above shows whether
    # the file actually contains more than one name.
    station_name = df['Station name'].iloc[0]

    save_latlon_graph(df, station_name)
    locations, latlon_df = find_station_location(df, station_name)
    locations_df = locations.reset_index()

    # Find the start and end dates of each cluster: tag every observation with
    # the cluster of its coordinate pair, then take the date range per cluster.
    df = df.merge(latlon_df, how='left', on=['Stn_lat', 'Stn_long'])
    df['Date'] = pd.to_datetime(df['Date'])
    start_end_dates_df = (
        df.groupby('cluster')['Date']
        .agg(['min', 'max'])
        .reset_index()
        .rename(columns={'min': 'start_date_UTC', 'max': 'end_date_UTC'})
    )
    # Left merge from the cluster centres, so the noise cluster (-1) is dropped.
    locations_dates_df = locations_df.merge(start_end_dates_df, on='cluster', how='left')

    locations_dates_df['Station_name'] = station_name
    locations_dates_df_list.append(locations_dates_df)

# ignore_index gives the combined table a unique row index instead of the
# repeated 0..k labels of the per-station frames.
sites_info_df = pd.concat(locations_dates_df_list, ignore_index=True)
sites_info_df

In [None]:
# Clean and save sites_info_df
# Written so the cell can be re-run safely: pd.to_datetime accepts both the
# original timestamps and the already-converted dates, and the drop ignores a
# 'cluster' column that a previous run has removed.
sites_info_df = (
    sites_info_df.assign(
        start_date_UTC=lambda d: pd.to_datetime(d['start_date_UTC']).dt.date,
        end_date_UTC=lambda d: pd.to_datetime(d['end_date_UTC']).dt.date,
    )
    .round(4)  # round the numeric columns (coordinates) to 4 decimal places
    .drop(columns=['cluster'], errors='ignore')
)
sites_info_df.to_csv("sites_latlon_dates.csv", index=False)

# Find the first observation date

In [None]:
# The first observation date was determined manually: 2020-01-01