## Missing Value Clustering

This notebook aims to identify patterns of missing values and to cluster indicators whose missing-value patterns are similar.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
from tslearn.clustering import TimeSeriesKMeans
import pickle

mpl.rcParams.update(mpl.rcParamsDefault)
%matplotlib inline
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.dpi'] = 90.

In [None]:
# Main table read from the World Bank folder; the cells below split its columns by dtype.
df = pd.read_csv('dataset/worldbank/API.csv')
# Country-level metadata (judging by the file name); not used by the cells visible here.
meta_country = pd.read_csv('dataset/worldbank/Metadata_Country_API_19_DS2_en_csv_v2_3159902.csv')
# Indicator-level metadata (judging by the file name); not used by the cells visible here.
meta_indicator = pd.read_csv('dataset/worldbank/Metadata_Indicator_API_19_DS2_en_csv_v2_3159902.csv')

## Preprocessing Data

Missing values: replace every null value with 0 and every non-null value with 1 (this encoding is applied only to the year columns).

In [None]:
# Columns whose dtype is float.
# The NumPy alias `np.float` (deprecated in 1.20, removed in 1.24) was simply the
# built-in `float`, so comparing against `float` selects exactly the same columns
# while also working on current NumPy releases.
floatColumns = df.dtypes[df.dtypes == float]

# Names of the float columns, in their original order
listOfFloatColumnNames = list(floatColumns.index)

print(listOfFloatColumnNames)

In [None]:
# Columns whose dtype is object.
# The NumPy alias `np.object` (deprecated in 1.20, removed in 1.24) was simply the
# built-in `object`, so comparing against `object` selects exactly the same columns
# while also working on current NumPy releases.
objectColumns = df.dtypes[df.dtypes == object]

# Names of the object columns, in their original order
listOfObjectColumnNames = list(objectColumns.index)

print(listOfObjectColumnNames)

In [None]:
# Keep every row, but only the float-typed columns collected earlier
df_years = df.loc[:, listOfFloatColumnNames]
df_years.tail(3)

In [None]:
# Encode availability: 1.0 where the original value is present, 0.0 where it is NaN.
#
# notna() is used instead of fillna(0) followed by "> 0" because that combination
# leaves genuine zero observations at 0 (indistinguishable from missing) and leaves
# negative observations at their negative value instead of 1.
# Building the flags from `df` (not from `df_years` itself) keeps this cell
# idempotent: re-running it cannot turn the 0 flags into 1.
df_years = df[listOfFloatColumnNames].notna().astype(float)
df_years.tail(3)

In [None]:
# Keep every row, but only the object-typed columns collected earlier
df_countryAndIndicator = df.loc[:, listOfObjectColumnNames]
df_countryAndIndicator.tail(3)

In [None]:
# Put the object-typed columns and the encoded float columns side by side again;
# concatenating along the column axis aligns the two frames on their row index.
missingValue_df = pd.concat(
    objs=[df_countryAndIndicator, df_years],
    axis='columns',
)

# final preprocessed data

missingValue_df

## Group by Indicator

There are several ways to examine the pattern of missing values. This section starts with missing values grouped by indicator: for each of the 76 indicators, the data of all countries is grouped (summed) together.

In [None]:
# For every indicator, sum the 0/1 availability flags of each year column.
#
# The year columns are selected by name rather than by slicing columns
# positionally (iloc[:, 3:]), so only the flags are summed regardless of how
# many text columns precede them or in which order they appear.
df_groupBy_indicatorCode = (
    missingValue_df
    .groupby(['Indicator Code'])[listOfFloatColumnNames]
    .sum()
)
df_groupBy_indicatorCode

In [None]:
# The indicators at positions 0 and 1 have a similar missing-value pattern.
# Transposed so that each indicator becomes one column (one line in the plot).
df_0_2 = df_groupBy_indicatorCode.iloc[:2].T

sns.lineplot(data=df_0_2, linewidth=2.5, palette="tab10")

In [None]:
# Another missing-value pattern, shared by the indicators at positions 3 to 7.
# Transposed so that each indicator becomes one column (one line in the plot).
df_3_8 = df_groupBy_indicatorCode.iloc[3:8].T

sns.lineplot(data=df_3_8, linewidth=2.5, palette="tab10")

In [None]:
# More patterns: the indicators at positions 9 to 14.
# Transposed so that each indicator becomes one column (one line in the plot).
df_9_15 = df_groupBy_indicatorCode.iloc[9:15].T

sns.lineplot(data=df_9_15, linewidth=2.5, palette="tab10")

## Time series clustering based on indicator

There are 76 different indicators, and each of them may have a different pattern of missing values. This section aims to use time series clustering to group the missing-value patterns into their respective categories.

In [None]:
from tslearn.utils import to_time_series_dataset

# Filled by the (currently commented-out) plotting loop below: one frame per cluster.
df_cluster = []
# Number of clusters to fit.
cluster = 3

# Use the first 10 indicators as the training sample.
# 'Indicator Code' is already the row index of df_groupBy_indicatorCode (it was the
# groupby key), so no set_index call is needed. The previous
# `X_train.set_index['Indicator Code']` subscripted the method object and raised
# TypeError, and the argument-less `.reindex()` before it changed nothing.
X_train = df_groupBy_indicatorCode.iloc[0:10, :]

# NOTE: this list rebinds the name `colors`, hiding the `matplotlib.colors` module
# imported under the same alias in the import cell. The name is kept because the
# commented-out plotting code below indexes it as a list.
colors = ['orange', 'turquoise', 'limegreen']
names = ['orange cluster','turquoise cluster','limegreen cluster']

# Fix the global NumPy seed so repeated runs are reproducible.
seed = 1
np.random.seed(seed)

# Convert to tslearn's dataset format, one series per indicator row.
# A plain NumPy array is passed so the call does not depend on how the installed
# tslearn version handles DataFrame input.
X_train = to_time_series_dataset(X_train.to_numpy())
X_train

# print('Indicators vs time curves')

# km = TimeSeriesKMeans(n_clusters=cluster, verbose=True, random_state=seed, max_iter=10)
# y_pred = km.fit_predict(X_train)
# clusters = pd.Series(data=y_pred, index=df.index)
# clusters

# f, (ax1, ax2, ax3) = plt.subplots(cluster, sharex=True, sharey=True,figsize=(12,8))

# for yi,cl,xs in zip(range(cluster),[0,1,2],[ax1,ax2,ax3]):
#     data = df.rolling(7, axis=1, min_periods=1).mean().fillna(0).loc[clusters[clusters == cl].index]
#     data.T.plot(legend=False, alpha=.2,color='black', ax=xs)
#     data.mean(axis=0).plot(linewidth=3., color=colors[cl], ax=xs)
#     n = len(data)
#     print('{}, N = {}'.format(names[cl], n))
#     df_cluster.append(data) # save df to list

# ax1.spines['top'].set_visible(False)
# ax1.spines['right'].set_visible(False)
# ax2.spines['right'].set_visible(False)
# ax3.spines['right'].set_visible(False)
# ax4.spines['right'].set_visible(False)
# ax5.spines['right'].set_visible(False)

# print("[0:orange, 1:turquoise,2:limegreen]")
# f.subplots_adjust(hspace=0)
# #plt.ylim(-10, 220)
# plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)