In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os

In [None]:
# Per-country merged inputs from an earlier iteration; currently disabled.
# df_china = pd.read_csv('output/merged_china.csv')
# df_us = pd.read_csv('output/merged_us.csv')
# df_india = pd.read_csv('output/merged_india.csv')

# Load the raw source tables. All paths are relative to the notebook's
# working directory.
# OWID CO2 data and OWID greenhouse-gas emissions by sector.
df_raw_owid = pd.read_csv('dataset/owid/owid-co2-data.csv')
df_raw_ghg = pd.read_csv('dataset/owid/ghg-emissions-by-sector.csv')
# World Bank indicator table and its accompanying country metadata file.
df_raw_worldbank = pd.read_csv('dataset/worldbank/API.csv')
df_worldbank_meta_country = pd.read_csv('dataset/worldbank/Metadata_Country_API_19_DS2_en_csv_v2_3159902.csv')

## Overall process

### Preprocessing (again)
1. Identify null values
2. Overlapping columns (or indicators)
3. Imputation
    - OWID (imputation, drop columns)
    - World Bank (imputation)
4. Data transformation

reference: [5 stages of data prep for k-means](https://medium.com/@evgen.ryzhkov/5-stages-of-data-preprocessing-for-k-means-clustering-b755426f9932)

### EDA
- Only looking at a few attributes

#### For OWID
- test

#### For Worldbank
- Test


### K-Means
- Euclidean K Means
- DBA K Means
- Soft DTW K Means

## EDA

In [None]:
# Drop the code columns and the 2017-2020 columns from the raw World Bank table.
df_worldbank = df_raw_worldbank.drop(
    columns=['Country Code', 'Indicator Code', '2017', '2018', '2019', '2020']
)

# Keep only the gaseous-fuel CO2 indicator rows, index them by country name,
# and replace missing values with 0.
df_co2 = (
    df_worldbank
    .loc[lambda frame: frame['Indicator Name'] == 'CO2 emissions from gaseous fuel consumption (% of total)']
    .drop(columns=['Indicator Name'])
    .set_index('Country Name')
    .fillna(0)
)
df_co2

In [None]:
# Rescale into a new frame so the values in `df_co2` stay untouched.
# The indicator is expressed as "% of total", so dividing by 100 turns each
# value into a fraction. This is a fixed rescale, not a data-driven max or
# min-max normalization. The division is vectorized over the whole frame
# instead of looping over columns.
df_co2_normalized = df_co2 / 100

# Preview the last rows of the rescaled data.
df_co2_normalized.tail()

## Preprocessing

### Clustering

In [None]:
from tslearn.clustering import TimeSeriesKMeans
from tslearn.utils import to_time_series_dataset

# Matplotlib customization
%matplotlib inline
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.dpi'] = 150.
mpl.rcParams["figure.figsize"] = (20,50)

In [None]:
# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# the value handed to the clustering helpers as their `seed` argument.
seed = 1
np.random.seed(seed)

In [None]:
# Number of clusters to use for k-means.
# NOTE(review): the clustering call visible in this notebook passes 15 directly
# instead of this value -- confirm which one is intended.

cluster_number = 10

In [None]:
# Training set (there is no test set: clustering is unsupervised here).
# Converts the normalized frame into tslearn's time-series dataset format;
# presumably each row (one country) becomes one series -- TODO confirm.

X_train_co2 = to_time_series_dataset(df_co2_normalized.copy())

In [None]:
def euclideanKMeans(cluster, seed, X_train):
    """Cluster time series with Euclidean k-means and plot every cluster.

    Fits a ``TimeSeriesKMeans`` model with its default metric and at most 10
    iterations, then draws one subplot row per cluster: member series in
    translucent black and the cluster centre in red.

    Parameters
    ----------
    cluster : int
        Number of clusters to fit; also the number of subplot rows.
    seed : int
        Passed to the model as ``random_state``.
    X_train : array-like
        Time-series dataset to cluster. It is indexed with a boolean mask and
        each selected series is flattened with ``.ravel()``, so a NumPy array
        is expected (presumably the output of ``to_time_series_dataset``).

    Returns
    -------
    array-like
        The cluster label of each series, as returned by ``fit_predict``.
    """
    print("Euclidean k-means")
    model = TimeSeriesKMeans(
        n_clusters=cluster,
        verbose=True,
        random_state=seed,
        max_iter=10,
    )
    labels = model.fit_predict(X_train)

    plt.figure()
    for cluster_idx in range(cluster):
        # Subplot positions are 1-based while cluster labels are 0-based.
        ax = plt.subplot(cluster, 1, cluster_idx + 1)
        for series in X_train[labels == cluster_idx]:
            ax.plot(series.ravel(), "k-", alpha=.2)
        ax.plot(model.cluster_centers_[cluster_idx].ravel(), "r-")
        # Same fixed y-range on every row so clusters are visually comparable.
        ax.set_ylim(0, 1)
        ax.text(0.01, 0.50, 'Cluster %d' % (cluster_idx + 1),
                transform=ax.transAxes)

    print("Euclidean k-means Chart")
    plt.show()
    return labels

In [None]:
# DBA-k-means
def dbaKMeans(cluster, seed, X_train):
    """Cluster time series with DTW-based (DBA) k-means and plot every cluster.

    Fits a ``TimeSeriesKMeans`` model with ``metric="dtw"``, two
    initialisations (``n_init=2``) and at most 10 barycenter iterations, then
    draws one subplot row per cluster: member series in translucent black and
    the cluster centre in red.

    Parameters
    ----------
    cluster : int
        Number of clusters to fit; also the number of subplot rows.
    seed : int
        Passed to the model as ``random_state``.
    X_train : array-like
        Time-series dataset to cluster. It is indexed with a boolean mask and
        each selected series is flattened with ``.ravel()``, so a NumPy array
        is expected (presumably the output of ``to_time_series_dataset``).

    Returns
    -------
    array-like
        The cluster label of each series, as returned by ``fit_predict``.
    """
    print("DBA k-means")
    dba_km = TimeSeriesKMeans(n_clusters=cluster,
                              n_init=2,
                              metric="dtw",
                              verbose=True,
                              max_iter_barycenter=10,
                              random_state=seed)
    y_pred = dba_km.fit_predict(X_train)

    # Start a fresh figure, as euclideanKMeans does, so the subplots are never
    # drawn on top of whatever figure happens to be current.
    plt.figure()
    for yi in range(cluster):
        # Subplot positions are 1-based while cluster labels are 0-based.
        plt.subplot(cluster, 1, yi + 1)
        for xx in X_train[y_pred == yi]:
            plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
        # Same fixed y-range on every row so clusters are visually comparable.
        plt.ylim(0, 1)
        plt.text(0.01, 0.50, 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)

    print("DBA k-means Chart")
    plt.show()
    return y_pred

In [None]:
# Soft-DTW-k-means
def softDTWKmean(cluster, seed, X_train):
    """Cluster time series with soft-DTW k-means and plot every cluster.

    Fits a ``TimeSeriesKMeans`` model with ``metric="softdtw"`` and
    ``gamma=0.01``, then draws one subplot row per cluster: member series in
    translucent black and the cluster centre in red.

    Parameters
    ----------
    cluster : int
        Number of clusters to fit; also the number of subplot rows.
    seed : int
        Passed to the model as ``random_state``.
    X_train : array-like
        Time-series dataset to cluster. It is indexed with a boolean mask and
        each selected series is flattened with ``.ravel()``, so a NumPy array
        is expected (presumably the output of ``to_time_series_dataset``).

    Returns
    -------
    array-like
        The cluster label of each series, as returned by ``fit_predict``.
    """
    print("Soft-DTW k-means")
    sdtw_km = TimeSeriesKMeans(n_clusters=cluster,
                               metric="softdtw",
                               metric_params={"gamma": .01},
                               verbose=True,
                               random_state=seed)
    y_pred = sdtw_km.fit_predict(X_train)

    # Start a fresh figure, as euclideanKMeans does, so the subplots are never
    # drawn on top of whatever figure happens to be current.
    plt.figure()
    for yi in range(cluster):
        # Subplot positions are 1-based while cluster labels are 0-based.
        plt.subplot(cluster, 1, yi + 1)
        for xx in X_train[y_pred == yi]:
            plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
        # Same fixed y-range on every row so clusters are visually comparable.
        plt.ylim(0, 1)
        # Chart labels are 1-based (label 0 is drawn as "Cluster 1"), matching
        # the other k-means charts and the n-1 lookup in getSingleCluster.
        plt.text(0.01, 0.50, 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)

    print("Soft-DTW k-means Chart")
    plt.show()
    return y_pred

In [None]:
def mergeClusterNames(y_pred, df_index):
    """Attach cluster labels to the row index of an existing frame.

    Parameters
    ----------
    y_pred : array-like
        One cluster label per row of ``df_index``, in the same order.
    df_index : pandas.DataFrame
        Frame whose ``.index`` supplies the row labels; its columns are not used.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``'cluster'`` indexed like ``df_index``.
    """
    labelled = pd.Series(y_pred, index=df_index.index, name='cluster')
    return labelled.to_frame()

def getSingleCluster(df_cluster, n):
    """Display the rows that belong to the cluster numbered ``n`` in the chart.

    Charts number clusters from 1 while the stored labels start at 0, so
    chart cluster ``n`` corresponds to label ``n - 1``.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Frame with a ``'cluster'`` column of 0-based labels.
    n : int
        1-based cluster number as shown in the chart.
    """
    zero_based_label = n - 1
    members = df_cluster.loc[df_cluster['cluster'] == zero_based_label]
    display(members)

In [None]:
# Run Euclidean k-means on the CO2 series.
# NOTE(review): 15 clusters are requested here rather than `cluster_number`
# defined earlier -- confirm which value is intended.
y_pred_X_china_euclideanKM = euclideanKMeans(cluster=15, seed=seed, X_train=X_train_co2)

In [None]:
# Pair every predicted label with its country name. The labels follow the row
# order of `X_train_co2`, which was built from `df_co2_normalized`, so that
# frame supplies the matching index.
cluster_china_euclideanKM = mergeClusterNames(y_pred_X_china_euclideanKM, df_co2_normalized)
# Show the members of the cluster drawn as "Cluster 2" in the chart.
getSingleCluster(cluster_china_euclideanKM, 2)