In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
from sktime.dists import dtw_dist
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import ward, dendrogram, fcluster
import folium
from tslearn.barycenters import dtw_barycenter_averaging
from shapely.geometry import Point
import matplotlib.pyplot as plt
import random
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import ward, dendrogram, fcluster
from fastdtw import fastdtw
import matplotlib.pyplot as plt
#FOR SUBMISSION

Get data and group counties

In [None]:
# Input locations: the original cell called the readers with no path at all,
# which raises TypeError. Fill these in before running.
COUNTIES_PATH = ''  # polygon layer; must expose a 'CountyName' attribute
POINTS_PATH = ''    # CSV; must contain 'latitude' and 'longitude' columns
if not COUNTIES_PATH or not POINTS_PATH:
    raise ValueError("Set COUNTIES_PATH and POINTS_PATH before running this cell.")

counties = gpd.read_file(COUNTIES_PATH)
points_df = pd.read_csv(POINTS_PATH)
if 'longitude' not in points_df.columns or 'latitude' not in points_df.columns:
    raise ValueError("check for the lat lon columns.")

# Point geometries are built as (x=longitude, y=latitude) and tagged EPSG:4326
# at construction time instead of mutating the frame afterwards.
geometry = gpd.points_from_xy(points_df['longitude'], points_df['latitude'])
points = gpd.GeoDataFrame(points_df, geometry=geometry, crs="EPSG:4326")

# Bring the polygons into the points' CRS before the spatial join.
counties = counties.to_crs(points.crs)

# Left join: every point is kept; points not within any polygon get no CountyName.
points_with_counties = gpd.sjoin(points, counties, how='left', predicate='within')
points_with_counties = points_with_counties[['geometry', 'CountyName'] + points_df.columns.tolist()]

# Persist the tabular result (geometry column dropped) for later cells/runs.
points_with_counties.drop(columns='geometry').to_csv('points_with_counties.csv', index=False)

Difference the time series to remove trends etc., checking stationarity with the ADF test

In [None]:
path = ''
RANDOM_STATE = 42  # fixed seed so the same location is inspected on every run

df = pd.read_csv(path, parse_dates=['date'])
# Kept as an explicit conversion: it raises if any value cannot be parsed as a date.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Pick one distinct (latitude, longitude) pair to inspect.
location = df[['latitude', 'longitude']].drop_duplicates().sample(1, random_state=RANDOM_STATE)
lat, lon = location.iloc[0]['latitude'], location.iloc[0]['longitude']

# .copy() makes loc_df an independent frame, so the differenced columns added
# further down are not written into a filtered slice of df.
loc_df = df[(df['latitude'] == lat) & (df['longitude'] == lon)].copy()

def adf_test(series):
    """Run the ADF test on ``series`` (missing values removed) and return its p-value.

    The p-value is the second element of the result returned by ``adfuller``.
    """
    observed = series.dropna()
    adf_output = adfuller(observed)
    p_value = adf_output[1]
    return p_value


def _plot_series(series, label, title, ylabel):
    """Draw one series on its own 10x6 figure with a legend and a 'Date' x-axis."""
    plt.figure(figsize=(10, 6))
    plt.plot(series, label=label)
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()


_plot_series(
    loc_df['soil_moisture'],
    'Original Series',
    f'Original SM \n(Loc: {lat}, {lon})',
    'SM',
)

# assign() returns a new frame, so the differenced columns are never written
# into a filtered slice of df (the cause of SettingWithCopyWarning).
loc_df = loc_df.assign(soil_moisture_diff1=loc_df['soil_moisture'].diff())
p_value_diff1 = adf_test(loc_df['soil_moisture_diff1'])

_plot_series(
    loc_df['soil_moisture_diff1'],
    '1st Diff',
    f'1ST Differenced SM\n(Loc: {lat}, {lon}) - ADF p-value: {p_value_diff1:.5f}',
    'Differenced SM',
)

# Second difference = difference of the first difference.
loc_df = loc_df.assign(soil_moisture_diff2=loc_df['soil_moisture_diff1'].diff())
p_value_diff2 = adf_test(loc_df['soil_moisture_diff2'])

_plot_series(
    loc_df['soil_moisture_diff2'],
    'Second Diff',
    f'2ND Differenced SM Time Series\n(Location: {lat}, {lon})',
    '2nd Differenced SM',
)

# ADF p-values for the raw, once-differenced and twice-differenced series.
print(f'ADF p-value for raw: {adf_test(loc_df["soil_moisture"])}')
print(f'ADF p-value first diff: {p_value_diff1}')
print(f'ADF p-value second diff: {p_value_diff2}')


In [None]:
# Input locations: the original cell called the readers with no path at all,
# which raises TypeError. Fill these in before running.
CLUSTER_COUNTIES_PATH = ''  # polygon layer; must expose a 'CountyName' attribute
CLUSTER_POINTS_PATH = ''    # CSV; must contain 'latitude' and 'longitude' columns
if not CLUSTER_COUNTIES_PATH or not CLUSTER_POINTS_PATH:
    raise ValueError("Set CLUSTER_COUNTIES_PATH and CLUSTER_POINTS_PATH before running this cell.")

counties = gpd.read_file(CLUSTER_COUNTIES_PATH)
points_df = pd.read_csv(CLUSTER_POINTS_PATH)

if 'longitude' not in points_df.columns or 'latitude' not in points_df.columns:
    raise ValueError(" NO lat, lon cols.")

# Point geometries are built as (x=longitude, y=latitude) and tagged EPSG:4326
# at construction time instead of mutating the frame afterwards.
geometry = gpd.points_from_xy(points_df['longitude'], points_df['latitude'])
points = gpd.GeoDataFrame(points_df, geometry=geometry, crs="EPSG:4326")

# Bring the polygons into the points' CRS before the spatial join.
counties = counties.to_crs(points.crs)

# Left join: every point is kept; points not within any polygon get no CountyName.
points_with_counties = gpd.sjoin(points, counties, how='left', predicate='within')
points_with_counties = points_with_counties[['geometry', 'CountyName'] + points_df.columns.tolist()]

# Rows grouped per county, plus the subset of counties the clustering cell uses.
grouped = points_with_counties.groupby('CountyName')
specified_counties = ['Story', 'Page', 'Hancock', 'Jones', 'Wapello', 'Davis']


def process_county(df):
    """Build standardised per-location soil-moisture series from a long table.

    Parameters
    ----------
    df : DataFrame
        Must contain 'latitude', 'longitude', 'date' and 'soil_moisture'
        columns. It is not modified.

    Returns
    -------
    (standarised_df, all_series)
        ``standarised_df`` is the date x location table after
        ``StandardScaler`` has been applied to it. ``all_series`` is a list
        with one ``(n, 1)`` array per location column, in column order, with
        missing values dropped.

    The original version computed these values but returned nothing, and
    added a 'location' column to the caller's frame as a side effect.

    Raises
    ------
    ValueError
        Propagated from ``pivot`` when a (date, location) pair occurs twice.
    """
    # Work on a derived frame so the caller's df does not gain a column.
    located = df.assign(
        location=df.apply(lambda row: f"{row['latitude']}_{row['longitude']}", axis=1)
    )
    new_df = located.pivot(index='date', columns='location', values='soil_moisture')
    scaler = StandardScaler()
    standarised_df = pd.DataFrame(
        scaler.fit_transform(new_df), index=new_df.index, columns=new_df.columns
    )
    all_series = [
        standarised_df[col].dropna().values.reshape(-1, 1)
        for col in standarised_df.columns
    ]
    return standarised_df, all_series

Perform clustering using DTW distances and agglomerative (hierarchical) clustering
- using the Ward linkage method
- manually defining the distance cut-off

In [None]:
# Height at which the dendrogram is cut into flat clusters; chosen manually.
dist_threshold = 90

def dtw_dist(series1, series2):
    """Return the approximate DTW distance between two series as a plain float.

    Uses ``fastdtw`` with an absolute-difference point cost. Points may be
    scalars (1-D series) or length-k rows (2-D series such as ``(n, 1)``
    arrays); the per-point cost is the sum of absolute differences, so it is
    always a scalar. The previous element-wise cost ``np.abs(x - y)`` returned
    a length-1 array for ``(n, 1)`` inputs, so the accumulated distance was
    not a clean scalar.

    Note: this definition shadows the ``dtw_dist`` imported from
    ``sktime.dists`` at the top of the notebook.
    """
    dist, _ = fastdtw(
        series1,
        series2,
        dist=lambda x, y: float(np.sum(np.abs(x - y))),
    )
    return float(dist)

# Stack the rows of every requested county into one long table and tag each
# row with a "<latitude>_<longitude>" location key.
combin = pd.concat([grouped.get_group(county) for county in specified_counties])
combin['location'] = combin.apply(lambda row: f"{row['latitude']}_{row['longitude']}", axis=1)

# date x location table of soil moisture, passed through StandardScaler.
new_df = combin.pivot(index='date', columns='location', values='soil_moisture')
scaler = StandardScaler()
standarised_df = pd.DataFrame(scaler.fit_transform(new_df), index=new_df.index, columns=new_df.columns)

# One (n, 1) array per location, missing values dropped, in column order.
location_names = standarised_df.columns.tolist()
all_series = [standarised_df[name].dropna().values.reshape(-1, 1) for name in location_names]

# Symmetric pairwise DTW distance matrix (diagonal stays 0).
series = len(all_series)
matrix = np.zeros((series, series))
for i in range(series):
    for j in range(i + 1, series):
        matrix[i, j] = matrix[j, i] = dtw_dist(all_series[i], all_series[j])

# Condensed (upper-triangle) form expected by the linkage routine.
new_matrix = squareform(matrix, checks=False)

# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; DTW distances are not Euclidean -- confirm this is intended.
linkge_matrix = ward(new_matrix)

plt.figure(figsize=(20, 10))
# Labels are passed as a plain list rather than a pandas Index, which is not a
# documented input type for dendrogram().
dendrogram(linkge_matrix, labels=location_names, leaf_rotation=90, leaf_font_size=8)
plt.title("Dendrogram for All Locations cross Countiy")
plt.xlabel('Loc')
plt.ylabel('Dist')
plt.axhline(y=dist_threshold, color='r', linestyle='--')
plt.show()

# 'distance' is the SciPy criterion name for cutting at a cophenetic-distance
# threshold; the previous value 'dist' is not a valid criterion and raises ValueError.
clust_label = fcluster(linkge_matrix, dist_threshold, criterion='distance')

# Map every row's location to its column position, then to that column's cluster label.
column_position = {name: pos for pos, name in enumerate(location_names)}
combin['cluster'] = clust_label[combin['location'].map(column_position).to_numpy()]
def colourClusters(clust_label, cmap_name='viridis'):
    """Assign one colormap colour to each distinct cluster label.

    Parameters
    ----------
    clust_label : array-like
        Cluster labels; duplicates are collapsed with ``np.unique``.
    cmap_name : str, optional
        Name of the Matplotlib colormap to sample (default ``'viridis'``,
        the colormap this function always used).

    Returns
    -------
    dict
        ``{label: colour}`` where ``colour`` is whatever the colormap returns
        for the label's position among the sorted unique labels (callers read
        its first three channels as R, G, B).

    ``plt.get_cmap`` replaces ``plt.cm.get_cmap``, which was deprecated in
    Matplotlib 3.7 and removed in 3.9.
    """
    each_ans = np.unique(clust_label)
    color_map = plt.get_cmap(cmap_name, len(each_ans))
    clustcol = {cluster: color_map(i) for i, cluster in enumerate(each_ans)}
    return clustcol

clustcol = colourClusters(clust_label)

# Hex colour per cluster, computed once instead of once per row. Channels are
# truncated with int(), exactly as before.
cluster_hex = {
    cluster: "#{:02x}{:02x}{:02x}".format(*(int(channel * 255) for channel in rgba[:3]))
    for cluster, rgba in clustcol.items()
}

m = folium.Map(location=[combin['latitude'].mean(), combin['longitude'].mean()], zoom_start=7)

# combin is long-format (it is pivoted on date/location when the clusters are
# built), so a location can occupy many rows. Draw one marker per location
# rather than stacking an identical marker for every row.
for _, row in combin.drop_duplicates(subset='location').iterrows():
    color = cluster_hex[row['cluster']]
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=8,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        popup=f"Cluster: {row['cluster']}, Location: {row['location']}, County: {row['CountyName']}"
    ).add_to(m)

m


plot on map

In [None]:
def diffColourClust(clust_label):
    """Return ``{cluster label: colour}`` using the 'tab20' colormap.

    The colormap is requested with as many entries as there are distinct
    labels; each label gets the colour at its position among the sorted
    unique labels.
    """
    distinct_labels = np.unique(clust_label)
    palette = plt.get_cmap('tab20', len(distinct_labels))
    return {label: palette(position) for position, label in enumerate(distinct_labels)}

clustcol = diffColourClust(clust_label)

# Hex colour per cluster, computed once instead of once per row. Channels are
# truncated with int(), exactly as before.
cluster_hex = {
    cluster: "#{:02x}{:02x}{:02x}".format(*(int(channel * 255) for channel in rgba[:3]))
    for cluster, rgba in clustcol.items()
}

m = folium.Map(location=[combin['latitude'].mean(), combin['longitude'].mean()], zoom_start=7)

# combin is long-format (it is pivoted on date/location when the clusters are
# built), so a location can occupy many rows. Draw one marker per location
# rather than stacking an identical marker for every row.
for _, row in combin.drop_duplicates(subset='location').iterrows():
    color = cluster_hex[row['cluster']]
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=8,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        popup=f"Cluster: {row['cluster']}, Location: {row['location']}, County: {row['CountyName']}"
    ).add_to(m)

m
