In [40]:
# mount drive
# Google Colab only: makes the user's Google Drive available under /drive.
# The CSV files read by the data-loading cell live below this mount point.
from google.colab import drive
drive.mount("/drive")

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [41]:
# imports
import numpy as np
import pandas as pd

In [42]:
# imports for plots
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

# use the "plotly_white" template as the default for every figure created below
pio.templates.default = "plotly_white"

In [43]:
# load data
# Directory holding one CSV per weather variable; change this single constant
# if the data is moved.
DATA_DIR = "/drive/My Drive/Colab Notebooks/AgriEdge/data_univariate_clustering"

# weather variables to use
selected_cols = ["cumulative_GDD", "cumulative_PREC", "cumulative_RH2M", "cumulative_WS2M"]

# weather variable name -> data frame read from "<DATA_DIR>/<variable>.csv"
# index_col=0: the first CSV column becomes the row index
# (presumably the crop year -- confirm against the files)
dict_data = {
  col: pd.read_csv(f"{DATA_DIR}/{col}.csv", index_col=0)
  for col in selected_cols
}

In [44]:
# array of cluster labels
# this arr is the output of the PCA notebook
# It is assigned as the "cluster_index" column of every data frame in the next
# cell, so it must contain exactly one label per data-frame row (39 labels here).
# Alternative labelling with three clusters (was commented out; kept for reference):
# [2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 2, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 1, 1, 1, 2, 1, 0, 2, 0, 2, 1, 2, 0]
cluster_labels = [2, 1, 1, 3, 1, 1, 1, 1, 2, 2, 3, 3, 1, 3, 0, 2, 2, 3, 3, 2, 1, 2, 2, 3, 2, 1, 1, 0, 0, 0, 2, 0, 1, 2, 3, 2, 0, 2, 1]

In [45]:
# Tag every row of every weather-variable frame with its cluster label.
# The same label list is written to each frame as the "cluster_index" column,
# so all frames are expected to have their rows in the same order.
for col in selected_cols:
  frame = dict_data[col]
  frame["cluster_index"] = cluster_labels

In [46]:
# Share of observations that falls into each cluster, ordered by cluster label.
cluster_counts = pd.Index(cluster_labels).value_counts().sort_index()
proportions = cluster_counts / len(cluster_labels)

# One pie slice per cluster, labelled "cluster_<label>".
pie_labels = [f"cluster_{i}" for i in proportions.index]
pie_trace = go.Pie(values=proportions.values, labels=pie_labels)

fig = go.Figure(data=[pie_trace])
fig.update_layout(title="Proportion of each cluster", height=500, width=800)
fig.show()

In [47]:
# Split every weather-variable frame by cluster.
# Resulting layout: {weather variable: {cluster number: rows of that cluster}}
# The helper "cluster_index" column is removed from each per-cluster frame.
dict_clusters = {
  col: {
    i: dict_data[col][dict_data[col]["cluster_index"] == i].drop(columns=["cluster_index"])
    for i in range(len(proportions))
  }
  for col in selected_cols
}

In [48]:
# check that the operation was successful
# Displays the full {cluster number: data frame} mapping for one weather variable;
# each frame should hold only the rows of its cluster and no "cluster_index" column.
dict_clusters["cumulative_GDD"]

{0:            0       1       2        3  ...       265       266       267       268
 1996  22.245  44.920  68.435   92.210  ...  4667.375  4694.140  4723.545  4752.040
 2009  20.105  41.610  62.770   81.745  ...  4331.435  4356.105  4381.525  4407.720
 2010  23.195  47.685  72.530   94.315  ...  4700.095  4728.175  4757.275  4789.055
 2011  19.170  38.560  57.130   77.070  ...  4667.030  4691.970  4718.065  4745.025
 2013  17.740  37.305  60.195   80.915  ...  4375.835  4407.670  4439.770  4472.900
 2018  29.080  54.885  80.900  101.905  ...  4271.815  4298.680  4325.505  4351.365
 
 [6 rows x 269 columns],
 1:            0       1       2       3  ...       265       266       267       268
 1983  20.065  39.435  58.740  76.260  ...  4446.760  4470.545  4494.630  4520.785
 1984  23.300  45.420  66.855  90.160  ...  4470.605  4502.525  4532.430  4563.220
 1986  26.565  51.240  75.815  99.570  ...  4489.845  4519.695  4550.915  4581.670
 1987  19.120  39.000  57.275  75.060  ...  468

In [49]:
# plot the observations
# rows are weather vars
# cols are clusters
# every row of a cluster frame is drawn as its own line in that cluster's subplot

# arr of colors to identify cluster by color
arr_colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

n_clusters = len(proportions)

fig = make_subplots(rows=len(selected_cols), cols=n_clusters,
                    row_titles=selected_cols, column_titles=[f"cluster_{i}" for i in range(n_clusters)])

# loop over weather vars (one subplot row each; plotly rows/cols are 1-based)
for row_index, col in enumerate(selected_cols, start=1):
  # Common y-axis upper bound for the whole row so clusters are visually comparable.
  # The "cluster_index" label column is excluded so it cannot influence the maximum;
  # errors="ignore" keeps this working if the column has not been added.
  y_max = np.max(dict_data[col].drop(columns="cluster_index", errors="ignore").values)

  # loop over clusters within a weather var (one subplot column each)
  for col_index, i in enumerate(range(n_clusters), start=1):
    # select data
    data_to_plot = dict_clusters[col][i]
    # wrap around the palette instead of failing when there are more clusters than colors
    cluster_color = arr_colors[(col_index - 1) % len(arr_colors)]

    for y_trace, crop_year in zip(data_to_plot.values, data_to_plot.index):
      fig.add_trace(
          go.Scatter(
              x=data_to_plot.columns, y=y_trace,
              name=str(crop_year), marker_color=cluster_color),
      row=row_index, col=col_index)

    # y - axis range: set once per subplot (also applies to clusters without rows)
    fig.update_yaxes(range=[0, y_max], row=row_index, col=col_index)

# x - axis title: applies to every subplot, so a single call is enough
fig.update_xaxes(title="days")

fig.update_layout(
    title="Plots for all clusters",
    showlegend=False,
)

fig.show()

In [50]:
# Centroid of every cluster: the column-wise mean (axis=0) over the rows that
# belong to the cluster.
# Layout: {weather variable: {cluster number: centroid}}
dict_clusters_centroids = {
  col: {
    i: np.mean(dict_clusters[col][i], axis=0)
    for i in range(len(proportions))
  }
  for col in selected_cols
}

# show the centroids of one weather variable as a sanity check
dict_clusters_centroids["cumulative_PREC"]

{0: 0        0.443333
 1        0.446667
 2        0.546667
 3        3.156667
 4        6.958333
           ...    
 264    829.471667
 265    829.576667
 266    829.591667
 267    829.601667
 268    829.606667
 Length: 269, dtype: float64, 1: 0        0.766667
 1        1.379167
 2        2.187500
 3        3.868333
 4        5.386667
           ...    
 264    435.938333
 265    435.950000
 266    436.011667
 267    436.100833
 268    436.176667
 Length: 269, dtype: float64, 2: 0        1.640000
 1        2.840769
 2        4.640769
 3        7.122308
 4        9.406154
           ...    
 264    535.308462
 265    535.370000
 266    535.408462
 267    535.443846
 268    535.529231
 Length: 269, dtype: float64, 3: 0        1.21750
 1        4.67750
 2        6.58125
 3        8.42875
 4       10.32875
          ...    
 264    309.17250
 265    309.29625
 266    309.31750
 267    309.32625
 268    309.43625
 Length: 269, dtype: float64}

In [60]:
# Draw the cluster centroids: one subplot row per weather variable,
# one line per cluster, colored with the same palette as the cluster plots.
centroid_titles = [f"cluster centroids of {col}" for col in selected_cols]
fig = make_subplots(rows=len(selected_cols), cols=1, subplot_titles=centroid_titles)

# plotly subplot rows are 1-based
for row_index, col in enumerate(selected_cols, start=1):
  for i in range(len(proportions)):
    centroid_trace = go.Scatter(
        y=dict_clusters_centroids[col][i],
        name=f"{col}_cluster_{i}",
        marker_color=arr_colors[i],
    )
    fig.add_trace(centroid_trace, row=row_index, col=1)

# tall figure so the stacked subplots stay readable
fig.update_layout(height=3000)

fig.show()