In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import math
import os

In [None]:
# Loaders for other pipeline inputs, left disabled; none of these frames is
# referenced by the cells visible in this part of the notebook.
# df_china = pd.read_csv('output/merged_china.csv')
# df_us = pd.read_csv('output/merged_us.csv')
# df_india = pd.read_csv('output/merged_india.csv')

# df_raw_owid = pd.read_csv('dataset/owid/owid-co2-data.csv')
# df_raw_ghg = pd.read_csv('dataset/owid/ghg-emissions-by-sector.csv')
# df_raw_worldbank = pd.read_csv('dataset/worldbank/API.csv')
# df_worldbank_meta_country = pd.read_csv('dataset/worldbank/Metadata_Country_API_19_DS2_en_csv_v2_3159902.csv')


# Imputed World Bank table: the input for the EDA and clustering cells below.
df_worldbank_imputed = pd.read_csv('output/dataset_worldbank_imputed.csv')

## Overall process

### EDA
EDA for imputed dataset

### Preprocessing (again)
1. Identify null values
2. Overlapping columns (or indicators)
3. Imputation
    - owid (imputation, drop columns)
    - worldbank (imputation)
4. Data transformation

reference: [5 stages of data prep for k-means](https://medium.com/@evgen.ryzhkov/5-stages-of-data-preprocessing-for-k-means-clustering-b755426f9932)


### K-Means
- Euclidean K Means
- DBA K Means
- Soft DTW K Means

## EDA

In [None]:
# Display the imputed World Bank frame (rich DataFrame repr).
df_worldbank_imputed

In [None]:
# Summary statistics for the columns of the imputed frame.
df_worldbank_imputed.describe()

## Clustering

### Preprocessing

Missing values have already been handled by imputation. This preprocessing step normalizes the indicators and then pivots (transposes) the data into a column-per-year format, so that it fits tslearn's k-means clustering.

In [None]:
# Separate the identifying columns from the indicator columns.
# The split is positional: the first two columns are kept as-is (the pivot
# helper further down expects them to be 'Country Name' followed by 'Year'),
# and everything after them is treated as a numeric indicator.
df_country_and_year = df_worldbank_imputed.iloc[:, :2]
df_indicators = df_worldbank_imputed.iloc[:, 2:]

# Min-max normalization, applied to every indicator column in one vectorized
# pass. pandas aligns the per-column min/max Series with the frame's columns,
# so no explicit loop over columns is needed (repeating the operation would
# not change the result: once scaled, each column already spans 0..1).
# A column whose values are all identical has max == min and becomes NaN.
minValue = df_indicators.min()
maxValue = df_indicators.max()
df_worldbank_normalized = (df_indicators - minValue) / (maxValue - minValue)

# Put the identifying columns back in front of the normalized indicators.
df_worldbank_normalized = pd.concat(
    [df_country_and_year, df_worldbank_normalized], axis=1
)
df_worldbank_normalized

In [None]:
# function to slice out one country from the df
# and transpose its data into column-per-year format
def get_pivot_data_column_year(df, country_name):
    """Return one country's indicators pivoted so that each year is a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with a 'Country Name' column, a 'Year' column and
        one column per indicator (one row per country/year).
    country_name : str
        Value of 'Country Name' to select. It is converted with ``str`` before
        comparison and before being written to the output.

    Returns
    -------
    pandas.DataFrame
        One row per indicator with columns
        ``['Country Name', 'Indicator Name', <year as str>, ...]`` and a fresh
        0..n-1 index. The year columns keep the numeric dtype of the
        indicators because the year labels are moved into the header before
        transposing instead of travelling through the data as strings.
        The input frame is not modified.
    """
    label = str(country_name)

    # Select the country's rows; 'Country Name' is dropped by name so the
    # function does not depend on the column's position.
    country_rows = df.loc[df['Country Name'] == label].drop(columns=['Country Name'])

    pivoted = (
        country_rows
        .assign(Year=country_rows['Year'].astype('str'))  # year labels as strings
        .set_index('Year')                                # years become the header after .T
        .transpose()                                      # rows: indicators, columns: years
        .rename_axis('Indicator Name', axis=0)
        .rename_axis(None, axis=1)                        # drop the leftover 'Year' axis label
        .reset_index()
    )

    # Identify the country on every row, as the first column.
    pivoted.insert(0, 'Country Name', label)
    return pivoted

In [None]:
# define countries, years and the target column layout
countries = df_worldbank_imputed['Country Name'].unique().tolist()
years = [str(year) for year in df_worldbank_imputed['Year'].unique().tolist()]
columns = ['Country Name', 'Indicator Name'] + years

# Pivot every country once and concatenate in a single call; calling
# pd.concat inside the loop would re-copy the accumulated frame each time.
country_frames = [
    get_pivot_data_column_year(df_worldbank_normalized, country)
    for country in countries
]

if country_frames:
    # reindex pins the column order to `columns` regardless of which years
    # the first country happens to have.
    df_worldbank_transposed = (
        pd.concat(country_frames, axis=0, ignore_index=True)
        .reindex(columns=columns)
    )
else:
    # No countries at all: keep the expected (empty) layout.
    df_worldbank_transposed = pd.DataFrame([], columns=columns)

# final result
df_worldbank_transposed

In [None]:
# List the distinct indicator names available for clustering.
df_worldbank_transposed['Indicator Name'].unique()

In [None]:
# The clustering below works on one series per country (it is not
# multivariate), so a single indicator has to be chosen here.
indicatorName = 'CO2 intensity (kg per kg of oil equivalent energy use)'
columnToDrop = ['Indicator Name']

# Keep only the chosen indicator's rows, drop the now-constant indicator
# column, and index the remaining year columns by country.
df_train = (
    df_worldbank_transposed
    .loc[lambda frame: frame['Indicator Name'] == indicatorName]
    .drop(columns=columnToDrop)
    .set_index('Country Name')
)
df_train

In [None]:
# convert training set into the dataset format used by tslearn

# Imported in this cell because it runs before the cell that imports the
# tslearn names; without it a fresh "Restart & Run All" stops here with a
# NameError.
from tslearn.utils import to_time_series_dataset

X_train_co2_intensity = to_time_series_dataset(df_train.copy())

### Time Series K Means

In [None]:
from tslearn.clustering import TimeSeriesKMeans
from tslearn.utils import to_time_series_dataset

# Matplotlib customization
# Render figures inline in the notebook output.
%matplotlib inline
# Reset every rcParam to matplotlib's defaults, then apply the overrides below.
# NOTE(review): this runs after the inline magic, so it may also undo rc
# settings the inline backend applied — confirm the order is intentional.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.dpi'] = 150.
# Tall default canvas; the clustering plots stack one subplot per cluster.
mpl.rcParams["figure.figsize"] = (20,50) #change figure size, (x, y)

In [None]:
# Fixed seed: applied to NumPy's global RNG here and passed as `random_state`
# to the k-means helpers defined below.
seed = 1
np.random.seed(seed)

In [None]:
# Set number of clusters
# NOTE(review): the euclideanKMeans call visible further down passes a
# literal 3 rather than this value — confirm which one is intended.

cluster_number = 10

In [None]:
def euclideanKMeans(cluster, seed, X_train):
    """Cluster time series with TimeSeriesKMeans (default metric) and plot them.

    Parameters
    ----------
    cluster : int
        Number of clusters; also the number of stacked subplots drawn.
    seed : int
        Passed to the model as ``random_state``.
    X_train : array
        Time-series dataset; it is indexed with a boolean mask, so it must
        support NumPy-style mask indexing (e.g. the output of
        ``to_time_series_dataset``).

    Returns
    -------
    The cluster label of every series, as returned by ``fit_predict``.
    Labels are 0-based while the panel titles are 1-based.
    """
    print("Euclidean k-means")
    model = TimeSeriesKMeans(n_clusters=cluster,
                             verbose=True,
                             random_state=seed,
                             max_iter=10)
    y_pred = model.fit_predict(X_train)

    # One panel per cluster: members in translucent black, centroid in red.
    fig = plt.figure()
    for cluster_idx in range(cluster):
        ax = fig.add_subplot(cluster, 1, cluster_idx + 1)
        members = X_train[y_pred == cluster_idx]
        for series in members:
            ax.plot(series.ravel(), "k-", alpha=.2)
        ax.plot(model.cluster_centers_[cluster_idx].ravel(), "r-")
        # Fixed 0..1 y-range — presumably because the inputs are min-max scaled.
        ax.set_ylim(0, 1)
        ax.text(0.01, 0.50, 'Cluster %d' % (cluster_idx + 1),
                transform=ax.transAxes)

    print("Euclidean k-means Chart")
    plt.show()
    return y_pred

In [None]:
# DBA-k-means
def dbaKMeans(cluster, seed, X_train):
    """Cluster time series with DTW-based TimeSeriesKMeans and plot them.

    The model is built with ``metric="dtw"``, ``n_init=2`` and
    ``max_iter_barycenter=10``.

    Parameters
    ----------
    cluster : int
        Number of clusters; also the number of stacked subplots drawn.
    seed : int
        Passed to the model as ``random_state``.
    X_train : array
        Time-series dataset; it is indexed with a boolean mask, so it must
        support NumPy-style mask indexing (e.g. the output of
        ``to_time_series_dataset``).

    Returns
    -------
    The cluster label of every series, as returned by ``fit_predict``.
    Labels are 0-based while the panel titles are 1-based.
    """
    print("DBA k-means")
    dba_km = TimeSeriesKMeans(n_clusters=cluster,
                              n_init=2,
                              metric="dtw",
                              verbose=True,
                              max_iter_barycenter=10,
                              random_state=seed)
    y_pred = dba_km.fit_predict(X_train)

    # Start from a fresh figure (as euclideanKMeans does) so the panels are
    # never drawn on top of whatever figure happens to be current.
    fig = plt.figure()
    for yi in range(cluster):
        ax = fig.add_subplot(cluster, 1, yi + 1)
        # Cluster members in translucent black, barycenter in red.
        for xx in X_train[y_pred == yi]:
            ax.plot(xx.ravel(), "k-", alpha=.2)
        ax.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
        # Fixed 0..1 y-range — presumably because the inputs are min-max scaled.
        ax.set_ylim(0, 1)
        ax.text(0.01, 0.50, 'Cluster %d' % (yi + 1),
                transform=ax.transAxes)

    print("DBA k-means Chart")
    plt.show()
    return y_pred

In [None]:
# Soft-DTW-k-means
def softDTWKmean(cluster, seed, X_train):
    """Cluster time series with soft-DTW TimeSeriesKMeans and plot them.

    The model is built with ``metric="softdtw"`` and ``gamma=0.01``.

    Parameters
    ----------
    cluster : int
        Number of clusters; also the number of stacked subplots drawn.
    seed : int
        Passed to the model as ``random_state``.
    X_train : array
        Time-series dataset; it is indexed with a boolean mask, so it must
        support NumPy-style mask indexing (e.g. the output of
        ``to_time_series_dataset``).

    Returns
    -------
    The cluster label of every series, as returned by ``fit_predict``.
    Labels are 0-based while the panel titles are 1-based.
    """
    print("Soft-DTW k-means")
    sdtw_km = TimeSeriesKMeans(n_clusters=cluster,
                               metric="softdtw",
                               metric_params={"gamma": .01},
                               verbose=True,
                               random_state=seed)
    y_pred = sdtw_km.fit_predict(X_train)

    # Start from a fresh figure (as euclideanKMeans does) so the panels are
    # never drawn on top of whatever figure happens to be current.
    fig = plt.figure()
    for yi in range(cluster):
        ax = fig.add_subplot(cluster, 1, yi + 1)
        # Cluster members in translucent black, barycenter in red.
        for xx in X_train[y_pred == yi]:
            ax.plot(xx.ravel(), "k-", alpha=.2)
        ax.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
        # Fixed 0..1 y-range — presumably because the inputs are min-max scaled.
        ax.set_ylim(0, 1)
        # Panels are titled 1-based, matching the other k-means plots and the
        # "chart number minus one" convention used by getSingleCluster.
        ax.text(0.01, 0.50, 'Cluster %d' % (yi + 1),
                transform=ax.transAxes)

    print("Soft-DTW k-means Chart")
    plt.show()
    return y_pred

In [None]:
def mergeClusterNames(y_pred, df_index):
    """Attach cluster labels to the row labels of ``df_index``.

    Parameters
    ----------
    y_pred : array-like
        One cluster label per row of ``df_index``, in the same order.
    df_index : pandas.DataFrame
        Only its ``.index`` is used; it supplies the row labels.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named 'cluster', indexed like ``df_index``.
    """
    labelled = pd.Series(y_pred, index=df_index.index, name='cluster')
    return labelled.to_frame()

# Use this helper instead of filtering the frame directly: the charts title
# their panels starting at 1, whereas the stored labels start at 0.
def getSingleCluster(df_cluster, n):
    """Return the rows of ``df_cluster`` that belong to chart cluster ``n``.

    ``n`` is the 1-based number shown on the chart; it is matched against the
    0-based value ``n - 1`` in the 'cluster' column.
    """
    stored_label = n - 1
    return df_cluster.loc[df_cluster['cluster'].eq(stored_label)]

In [None]:
# Override the global figure size for this run, then cluster the
# CO2-intensity series into 3 groups with the Euclidean k-means helper.
mpl.rcParams["figure.figsize"] = (15,30) #change figure size, (x, y)
y_pred_X_euclideanKM_co2_intensity = euclideanKMeans(3, seed, X_train_co2_intensity)

In [None]:
# Raw cluster labels returned by the run above (0-based).
y_pred_X_euclideanKM_co2_intensity

In [None]:
# Pair every cluster label with the matching row label of df_train
# (df_train is indexed by 'Country Name').
cluster_result_euclideanKM_co2_intensity = mergeClusterNames(y_pred_X_euclideanKM_co2_intensity, df_train)
cluster_result_euclideanKM_co2_intensity

In [None]:
# Members of chart clusters 2 and 3. getSingleCluster takes the 1-based chart
# number, so these are the rows stored with labels 1 and 2 respectively.
cluster_result_euclideanKM_co2_intensity_2 = getSingleCluster(cluster_result_euclideanKM_co2_intensity, 2)
cluster_result_euclideanKM_co2_intensity_3 = getSingleCluster(cluster_result_euclideanKM_co2_intensity, 3)

In [None]:
# Row labels (country names) of the chart-cluster-2 members.
cluster_name_co2_intensity_2 = cluster_result_euclideanKM_co2_intensity_2.index.tolist()

# Training rows restricted to those countries, with 'Country Name' restored
# as an ordinary column for plotting.
df_viz = (
    df_train
    .reset_index()
    .loc[lambda frame: frame['Country Name'].isin(cluster_name_co2_intensity_2)]
)
df_viz

In [None]:
# visualization