In [None]:
%%capture
%run 1-setup.ipynb

In [None]:
from tslearn.clustering import TimeSeriesKMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Min-max scale the numeric columns of every province's dataframe.
# A fresh scaler is fitted per dataframe, so each one is rescaled
# using only its own column minima and maxima.
for df in dataframes.values():
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df[numerics])
    df[numerics] = scaled_values

In [None]:
# Sanity check after scaling: preview only the first rows instead of
# rendering the whole dataframe in the notebook output.
dataframes['modena'].head()

TimeSeriesKMeans expects a 3D array of shape (n_samples, n_timestamps, n_features), so let's build one from all the datasets, with one sample per province.

In [None]:
provinces = dataframes.keys()

# One 2D matrix (rows = timestamps, columns = numeric features) per province,
# collected in the iteration order of `provinces`
province_matrices = [dataframes[name][numerics].values for name in provinces]

# Give each matrix a leading axis of length 1 and stack them vertically,
# producing a 3D array shaped (n_samples, n_timestamps, n_features)
X = np.vstack([matrix[np.newaxis, ...] for matrix in province_matrices])

In [None]:
# Check the layout of the stacked array rather than dumping its contents:
# expected to be (n_samples, n_timestamps, n_features)
X.shape

Now we have to choose the optimal number of clusters. To do this, we'll run k-means several times, once for each candidate number of clusters.
Each time we'll save the total inertia, which is a measure of goodness of fit for clustering. It represents the sum of squared distances of all observations from their respective cluster centers. The lower the inertia, the more tightly the clusters are concentrated around their centroids. For time series data, the best choice is to use Dynamic Time Warping (DTW) as the clustering metric, since Euclidean distance is not invariant to time shifts.

In [None]:
# Candidate numbers of clusters to evaluate (2 through 8)
K = [*range(2, 9)]

# Fit one DTW-based k-means per candidate and keep the inertia of each fit;
# the fixed random_state keeps the runs reproducible
inertia = [
    TimeSeriesKMeans(n_clusters=k, n_init=5, metric='dtw', random_state=0)
    .fit(X)
    .inertia_
    for k in K
]

Considering both preprocessing steps (standardization and min-max scaling) and looking at the two elbow-method figures, we can deduce that KMeans is unlikely to have done a good job at clustering, as there is no clear difference between the various values of K. Ideally, we should see a number of clusters at which the inertia drops sharply with respect to the previous one; in this case, instead, the inertia keeps decreasing at almost the same rate.

In [None]:
# Elbow plot: inertia obtained for each candidate number of clusters
fig, ax = plt.subplots()
ax.plot(K, inertia, marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Total sum of squares')
ax.set_title('Elbow method')

# Put one tick on every evaluated k. Using K directly keeps the ticks on the
# tested values even if the candidate range changes (the previous
# hard-coded tick count only matched the current length of K).
ax.set_xticks(K)

plt.show()

In [None]:
# Final clustering with 3 clusters: build the estimator first, then fit it
# on X and obtain one cluster label per sample
final_kmeans = TimeSeriesKMeans(n_clusters=3, n_init=5, metric='dtw', random_state=0)
clusters = final_kmeans.fit_predict(X)

In [None]:
# Show the cluster label next to the province it belongs to. The labels are
# in the same order as the rows of X, which were built by iterating over
# the keys of `dataframes`.
pd.Series(clusters, index=list(dataframes.keys()), name='cluster')

In [None]:
# Write each province's cluster label into a 'cluster' column of its
# dataframe; the i-th dataframe receives the i-th label
for position, df in enumerate(dataframes.values()):
    df['cluster'] = clusters[position]

In [None]:
# Check that the 'cluster' column was added: preview only the first rows
# instead of rendering the whole dataframe.
dataframes['bologna'].head()

In [None]:
# Concatenate all the province dataframes into a single long table;
# ignore_index=True discards the per-province indexes and renumbers the rows
full_df = pd.concat(list(dataframes.values()), ignore_index=True)

# Preview the first rows only, rather than rendering the whole table
full_df.head()

In [None]:
# Set the style of seaborn for better aesthetics
sns.set(style="whitegrid")

for var in pollutants:
    
    # Plot the time series for each province
    plt.figure(figsize=(12, 6))
    sns.lineplot(x='date', y=var, hue='cluster', data=full_df)

    plt.title('Meteorological Information by cluster')
    plt.xlabel('Date')
    plt.ylabel(var)
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

    plt.show()