In [1]:
# Change the working directory to the project root so that the relative
# locations used by later cells ("raw_data/..." and the `py_scripts` package)
# are reachable from the current directory.
import os

# Guard the move: an unconditional os.chdir("../") climbs one more level every
# time this cell is re-run (or when the kernel already starts at the root),
# which breaks every later relative path. The `py_scripts` folder is used as
# the marker of the root because later cells import from it.
if not os.path.isdir("py_scripts"):
    os.chdir("../")

In [2]:
# imports
import numpy as np
import pandas as pd

In [3]:
# imports for plots
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

# Make "plotly_white" the default template for the figures built in this notebook.
pio.templates.default = "plotly_white"

In [4]:
# Weather history of one arbitrarily chosen region, read from its CSV file.
# The path is relative to the current working directory.
region_csv_path = "raw_data/weather_regions/33.79102828115297;-3.67179828879979;Taza-Al Hoceima-Taounate;Guercif.csv"
data_region = pd.read_csv(region_csv_path)

In [5]:
# Preview the loaded frame (daily rows with a crop_year / day index and
# cumulative_* columns, as shown in the recorded output).
# NOTE(review): ALLSKY_SFC_SW_DWN is -99.00 on the earliest rows — presumably a
# missing-value sentinel; confirm before using that column.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns look like indexes
# written back to the CSV by earlier saves — confirm and drop upstream if so.
data_region

Unnamed: 0.1,Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year,day,GDD,cumulative_GDD,cumulative_PRECTOT,cumulative_WS2M,cumulative_RH2M
0,287,1981-10-15,-99.00,0.04,25.61,22.06,1.17,30.90,15.99,1.25,1982,1,23.445,23.445,0.04,1.25,25.61
1,288,1981-10-16,-99.00,0.02,30.85,22.07,3.88,30.88,15.80,1.93,1982,2,23.340,46.785,0.06,3.18,56.46
2,289,1981-10-17,-99.00,0.08,33.90,21.88,4.96,31.41,14.02,2.24,1982,3,22.715,69.500,0.14,5.42,90.36
3,290,1981-10-18,-99.00,0.67,44.76,20.96,8.43,27.74,16.77,1.73,1982,4,22.255,91.755,0.81,7.15,135.12
4,291,1981-10-19,-99.00,0.03,30.72,21.76,3.49,28.68,17.74,1.62,1982,5,23.210,114.965,0.84,8.77,165.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10486,14431,2020-07-06,28.85,0.00,25.62,29.51,7.72,37.54,22.41,2.58,2020,265,29.975,3980.160,386.79,607.40,14831.60
10487,14432,2020-07-07,29.73,0.00,24.39,29.25,6.79,38.25,20.74,2.11,2020,266,29.495,4009.655,386.79,609.51,14855.99
10488,14433,2020-07-08,28.45,0.00,23.55,29.95,6.87,39.29,20.94,2.34,2020,267,30.115,4039.770,386.79,611.85,14879.54
10489,14434,2020-07-09,25.97,0.00,28.19,29.03,8.70,38.32,21.35,1.70,2020,268,29.835,4069.605,386.79,613.55,14907.73


In [6]:
from py_scripts.clustering import isolate_data_col
from py_scripts.clustering import reduce_merge_multivariate_data, multivariate_clustering
from py_scripts.clustering import get_best_number_clusters

In [7]:
# Columns that feed the clustering, and one isolated dataset per column
# (keyed by column name, in the same order as `selected_cols`).
selected_cols = ["cumulative_GDD", "cumulative_PRECTOT", "cumulative_WS2M", "cumulative_RH2M"]
dict_data = {
    col_name: isolate_data_col(data=data_region, col_name=col_name)
    for col_name in selected_cols
}

In [8]:
# reduce and merge data for multi clustering
# The result is used below as the feature matrix of the clustering model and
# as the input of get_best_number_clusters.
# NOTE(review): `reduce_merge_multivariate_data` is a project helper not shown
# here; presumably it yields one row per crop year — confirm.
reduced_data = reduce_merge_multivariate_data(dict_data, selected_cols)

In [9]:
# Settings shared by the clustering cells of this notebook.
metric = "euclidean"      # distance measure handed to the clustering model
linkage_method = "ward"   # linkage criterion handed to the clustering model
nb_clusters = 3           # number of clusters to extract

In [10]:
# Cluster the reduced multivariate data with agglomerative clustering.
from sklearn.cluster import AgglomerativeClustering

X = reduced_data

# NOTE(review): the `affinity` keyword was renamed to `metric` in
# scikit-learn 1.2 and removed in 1.4. It is kept here so the cell still runs
# on the older environment this notebook was recorded with; switch to
# `metric=metric` (or pin scikit-learn) when upgrading.
model = AgglomerativeClustering(n_clusters=nb_clusters, linkage=linkage_method, affinity=metric)

# Fit exactly once: fit_predict fits the model in place and returns the
# cluster label of every row of X, so a separate model.fit(X) beforehand only
# repeated the same computation.
labels = model.fit_predict(X)

In [11]:
# Run the project's clustering helper on the raw regional frame.
# Per the recorded output it returns a dict mapping "High" / "Medium" / "Low"
# to an index of crop years; the group sizes (3 / 10 / 26) match the label
# counts of the manual AgglomerativeClustering run in this notebook.
multivariate_clustering(data_region)

{'High': Int64Index([2009, 2010, 2013], dtype='int64'),
 'Medium': Int64Index([1991, 1994, 1996, 2001, 2004, 2006, 2011, 2012, 2015, 2018], dtype='int64'),
 'Low': Int64Index([1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1992, 1993,
             1995, 1997, 1998, 1999, 2000, 2002, 2003, 2005, 2007, 2008, 2014,
             2016, 2017, 2019, 2020],
            dtype='int64')}

In [12]:
# Print, for every cluster id, the labels assigned to it: one list per
# cluster, so the length of each printed list is that cluster's size.
for i in range(nb_clusters):
    members = list(filter(lambda label: label == i, labels))
    print(members)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 2, 2]


In [13]:
# Compare cumulative precipitation over the season for three crop years: one
# from each of the High (2009), Medium (2001) and Low (2020) groups listed in
# the multivariate_clustering output of this notebook.
compared_years = [2009, 2001, 2020]
query = data_region["crop_year"].isin(compared_years)

# One line per crop year, day of the crop year on the x axis.
fig = px.line(data_region.loc[query], x="day", y="cumulative_PRECTOT", color="crop_year")
fig.show()

In [14]:
# test C H
# NOTE(review): "C H" presumably stands for the Calinski-Harabasz score —
# confirm against py_scripts.clustering.get_best_number_clusters.
# Scores the cumulative precipitation series on its own; the recorded output
# is a dict keyed 2..9 (presumably candidate cluster counts) whose largest
# value is at 4.
get_best_number_clusters(isolate_data_col(data_region, col_name='cumulative_PRECTOT'))

{2: 40.80195143633821,
 3: 44.12474219719995,
 4: 47.937024437263425,
 5: 45.31210847980031,
 6: 42.644513922508224,
 7: 40.234973360204215,
 8: 39.33989122054714,
 9: 38.87805285467735}

In [15]:
# Same scoring as for precipitation, applied to the cumulative wind speed
# series on its own; in the recorded output the largest value is at key 2.
get_best_number_clusters(isolate_data_col(data_region, col_name='cumulative_WS2M'))

{2: 64.55048982279347,
 3: 51.38742861888922,
 4: 49.232493560303375,
 5: 48.37634970812191,
 6: 45.507805563576596,
 7: 42.89450759831663,
 8: 42.33523945008864,
 9: 42.00308872768002}

In [22]:
# Pick the (key, score) pair with the highest score for the merged
# multivariate data; ties resolve to the first pair in iteration order.
max(get_best_number_clusters(reduced_data).items(), key=lambda pair: pair[1])

(2, 36.133779392862266)