In [1]:
# set root directory
import os

# The data cell further down reads "data_beni_mellal/<var>.csv" relative to the
# working directory. Step up one level only while that folder is not visible,
# so that re-running this cell does not keep climbing the directory tree.
if not os.path.isdir("data_beni_mellal"):
  os.chdir("../")

# displayed as the cell output so the reader can confirm the working directory
os.getcwd()

'c:\\Users\\HP\\Desktop\\clustering-moroccan-weather-data'

In [2]:
# imports
import numpy as np
import pandas as pd

In [3]:
# imports for plots
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

# make "plotly_white" the default template for figures created after this cell
pio.templates.default = "plotly_white"

In [4]:
# load data
# weather variables to use
selected_cols = [
    "cumulative_GDD",
    "cumulative_PRECTOT",
    "cumulative_RH2M",
    "cumulative_WS2M",
]

# one frame per weather variable, keyed by the variable name;
# the first CSV column is used as the frame index
dict_data = {
    col: pd.read_csv(f"data_beni_mellal/{col}.csv", index_col=0)
    for col in selected_cols
}

In [5]:
# array of cluster labels
# this arr is the output of the PCA notebook
# (one label per row of each weather frame, wrapped ten per line for readability)
cluster_labels = [
    0, 1, 1, 1, 1, 0, 2, 2, 2, 0,
    1, 1, 2, 1, 0, 0, 2, 1, 1, 2,
    1, 2, 2, 1, 2, 1, 2, 0, 0, 0,
    2, 0, 1, 2, 1, 1, 1, 1, 1,
]

In [6]:
# add cluster label col for each data frame
# (the frames stored in dict_data are modified in place)
for weather_var in selected_cols:
  frame = dict_data[weather_var]
  frame["cluster_index"] = cluster_labels

In [7]:
# proportion of each cluster
# count the occurrences of every label, order by label, then normalise by the total
cluster_counts = pd.Index(cluster_labels).value_counts().sort_index()
proportions = cluster_counts / len(cluster_labels)

# one pie slice per cluster label
pie_labels = [f"cluster_{i}" for i in proportions.index]
fig = go.Figure(data=[go.Pie(values=proportions.values, labels=pie_labels)])

fig.update_layout(title="Proportion of each cluster", height=500, width=800)

fig.show()

In [8]:
# isolate each cluster
# weather variable: {cluster_1, cluster_2, ...}
# for every weather var and every cluster id, keep the rows whose
# cluster_index equals that id and drop the cluster_index helper column
dict_clusters = {
    col: {
        i: dict_data[col].loc[dict_data[col]["cluster_index"] == i].drop(columns=["cluster_index"])
        for i in range(len(proportions))
    }
    for col in selected_cols
}

In [9]:
# check that the operation was successful
gdd_clusters = dict_clusters["cumulative_GDD"]

# every labelled row must end up in exactly one of the isolated clusters
assert sum(len(frame) for frame in gdd_clusters.values()) == len(cluster_labels)
# the helper column must have been dropped from every isolated frame
assert all("cluster_index" not in frame.columns for frame in gdd_clusters.values())

# displayed as the cell output for a visual inspection
gdd_clusters

{0:            0       1       2       3        4        5        6        7  \
 1982  22.585  43.525  63.170  83.310  105.670  125.730  143.550  159.410   
 1987  15.120  30.360  44.920  57.385   71.615   88.290  105.080  122.190   
 1991  15.810  33.245  49.880  66.350   82.135   98.660  114.550  126.130   
 1996  18.110  37.030  56.930  76.215   94.000  109.830  125.050  140.685   
 1997  13.065  28.160  45.235  64.535   84.060  103.770  123.055  141.410   
 2009  14.525  31.440  48.785  65.625   81.315   95.260  108.810  120.910   
 2010  20.530  41.525  62.565  81.185   94.075  108.835  122.600  137.085   
 2011  15.005  28.495  44.530  62.000   77.630   93.230  108.915  124.885   
 2013  14.940  33.325  52.315  70.075   79.615   88.810  101.440  117.920   
 
             8        9  ...       259       260       261       262       263  \
 1982  173.420  187.760  ...  3427.155  3450.805  3468.390  3484.890  3505.575   
 1987  139.235  154.980  ...  3494.635  3518.785  3544.650  3

In [10]:
# plot the observations
# rows are weather vars
# cols are clusters

# arr of colors to identify cluster by color
arr_colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

n_clusters = len(proportions)

fig = make_subplots(rows=len(selected_cols), cols=n_clusters,
                    row_titles=selected_cols, column_titles=[f"cluster_{i}" for i in range(n_clusters)])

# loop over weather vars (subplot rows and cols are 1-based)
for row_index, col in enumerate(selected_cols, start=1):
  # common upper bound for the whole row so clusters share the same y scale;
  # the cluster_index helper column is excluded so labels never enter the range
  weather_values = dict_data[col].drop(columns=["cluster_index"], errors="ignore")
  y_max = np.max(weather_values.values)

  # loop over clusters within a weather var
  for col_index in range(1, n_clusters + 1):
    cluster_id = col_index - 1

    # select data
    data_to_plot = dict_clusters[col][cluster_id]
    # wrap around the palette if there are more clusters than colors
    cluster_color = arr_colors[cluster_id % len(arr_colors)]

    # one trace per row of the cluster frame, named after its index value
    for y_trace, crop_year in zip(data_to_plot.values, data_to_plot.index):
      fig.add_trace(
          go.Scatter(
              x=data_to_plot.columns, y=y_trace,
              name=str(crop_year), marker_color=cluster_color),
      row=row_index, col=col_index)

    # y - axis range: set once per subplot instead of once per trace
    fig.update_yaxes(range=[0, y_max], row=row_index, col=col_index)

# x - axis title: without row/col this targets every subplot, so one call is enough
fig.update_xaxes(title="days")

fig.update_layout(
    title="Plots for all clusters",
    showlegend=False,
)

fig.show()

In [11]:
# centroid of each cluster
# weather variable -> {cluster id -> column-wise mean over the rows of that cluster}
dict_clusters_centroids = {
    col: {
        i: dict_clusters[col][i].mean(axis=0)
        for i in range(len(proportions))
    }
    for col in selected_cols
}

# displayed as the cell output for a quick inspection
dict_clusters_centroids["cumulative_PRECTOT"]

{0: 0        0.178889
 1        0.278889
 2        0.405556
 3        1.594444
 4        4.631111
           ...    
 264    658.112222
 265    658.156667
 266    658.396667
 267    658.582222
 268    658.723333
 Length: 269, dtype: float64,
 1: 0        0.500000
 1        1.486667
 2        2.082778
 3        3.070556
 4        4.158889
           ...    
 264    346.493333
 265    346.513333
 266    346.518889
 267    346.525000
 268    346.723333
 Length: 269, dtype: float64,
 2: 0        0.689167
 1        0.972500
 2        3.959167
 3        4.948333
 4        5.870000
           ...    
 264    451.999167
 265    452.964167
 266    453.126667
 267    453.423333
 268    454.240833
 Length: 269, dtype: float64}

In [12]:
# plot centroids
# one stacked panel per weather variable, one centroid curve per cluster

panel_titles = [f"cluster centroids of {col}" for col in selected_cols]
fig = make_subplots(rows=len(selected_cols), cols=1, subplot_titles=panel_titles)

# subplot rows are 1-based
for row_index, col in enumerate(selected_cols, start=1):
  for i in range(len(proportions)):
    centroid_trace = go.Scatter(
        y=dict_clusters_centroids[col][i],
        name=f"{col}_cluster_{i}",
        marker_color=arr_colors[i],
    )
    fig.add_trace(centroid_trace, row=row_index, col=1)

fig.update_layout(height=3000)

fig.show()