In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn_extra.cluster import KMedoids
from scipy.spatial import distance
import operator

## ----------------------------------------------------------------------
## Data loading
## ----------------------------------------------------------------------
pd.set_option('display.max_columns', None)
dc = pd.read_csv('./merged.csv')  # "dc" stands for developing countries
korea_origin = pd.read_csv('./Clean_Korea.csv')

## Convenience look-ups derived from the developing-countries table.
dc_col = dc.columns.tolist()
country_dict = dict(zip(dc['Country_Code'], dc['Country_Name']))  # code -> name
sc = dc['Series_Code'].unique().tolist()   # distinct series codes
cc = dc['Country_Code'].unique().tolist()  # distinct country codes

## ----------------------------------------------------------------------
## Global controller: everything a reader may want to tune lives here
## ----------------------------------------------------------------------
model_num = 3    # 1: k-means, 2: k-medoids, 3: hierarchical clustering
ser_included = [
    'NV.IND.TOTL.ZS',
    'NV.AGR.TOTL.ZS',
    'NV.IND.MANF.ZS',
    'NV.SRV.TOTL.ZS',
]  # order: I, A, M, S
df_colnames = ['Industry', 'Aggriculture', 'Manufacturing', 'Services']
num_series = len(ser_included)
start_year = str(korea_origin['Year'][0])  # first year present in the Korea table
current_year = '2020'
num_years = 5    # number of most recent years averaged per country
k = 4            # number of clusters
labels = [f'step{step}' for step in range(1, k + 1)]  # 'step1' ... 'step<k>'
rs = 10          # random_state handed to the stochastic models

## ----------------------------------------------------------------------
## Trimming datasets
## ----------------------------------------------------------------------
## dc: keep only the year columns from start_year through current_year.
idx_sc = dc.columns.get_loc('Series_Code')
idx_start = dc.columns.get_loc(start_year)
idx_curr = dc.columns.get_loc(current_year)
# Columns strictly between 'Series_Code' and start_year (captured before any drop).
drop_years = dc.iloc[:, idx_sc + 1:idx_start]
# Remove everything after current_year first, then the early years.
dc.drop(columns=dc.columns[idx_curr + 1:], inplace=True)
dc.drop(columns=drop_years.columns, inplace=True)

## korea: feature-only view of korea_origin used for clustering.
korea = korea_origin.drop(['Year', 'GDP'], axis=1)

In [None]:
def build_model(model_num):
    """Create the (unfitted) clustering estimator selected by ``model_num``.

    Reads the module-level settings ``k`` (number of clusters) and ``rs``
    (random state for the stochastic models).

    Parameters
    ----------
    model_num : int
        1 -> KMeans, 2 -> KMedoids, 3 -> AgglomerativeClustering
        (complete linkage).

    Returns
    -------
    The newly constructed estimator.

    Raises
    ------
    ValueError
        If ``model_num`` is not 1, 2 or 3. Previously this case only printed
        a message and then crashed with ``UnboundLocalError`` on ``return``.
    """
    if model_num == 1:
        ## kmeans
        return KMeans(n_clusters=k, random_state=rs)
    if model_num == 2:
        ## kmedoids
        return KMedoids(n_clusters=k, random_state=rs)
    if model_num == 3:
        ## hierarchical clustering (deterministic, so no random_state)
        return AgglomerativeClustering(n_clusters=k, linkage='complete')
    raise ValueError(
        f"parameter 'model_num' error: expected 1, 2 or 3, got {model_num!r}"
    )

def get_centers(model_num):
    """Return one centre per cluster for the fitted module-level ``model``.

    For ``model_num == 3`` the centres are computed here as the per-cluster
    mean of the feature columns of the module-level ``korea`` frame, with
    rows grouped by ``model.labels_``. For any other ``model_num`` the
    estimator's own ``cluster_centers_`` is returned.

    The hierarchical branch does not require a ``'cluster'`` column on
    ``korea``; if one is present it is excluded from the features. Centres
    are ordered by ascending cluster id, so row ``i`` belongs to label ``i``
    when labels are 0..k-1.

    Parameters
    ----------
    model_num : int
        Model selector; 3 triggers the manual computation.

    Returns
    -------
    numpy.ndarray
        Array of cluster centres, one row per cluster.
    """
    if model_num != 3:
        return model.cluster_centers_

    # Work on the features only, whether or not labels were already attached.
    features = korea.drop(columns=['cluster'], errors='ignore')
    assignments = np.asarray(model.labels_)
    return np.array([
        features[assignments == cluster_id].mean(axis=0).to_numpy()
        for cluster_id in np.unique(assignments)  # sorted ids
    ])

## Fit the selected model on the Korean feature table.
model = build_model(model_num)
model.fit(korea)

## Attach the training assignments. `labels_` is used instead of `predict`
## because AgglomerativeClustering does not implement `predict`.
## The column is attached before the centres are computed so that the centre
## computation can rely on the assignments being present on `korea`.
korea.insert(num_series, "cluster", model.labels_)
centers = get_centers(model_num)

## labeling cluster name
## Raw cluster ids are renamed in order of first appearance in `korea`, so
## the cluster of the first row becomes labels[0], and so on.
origin_labels = list(korea["cluster"].unique())
# Assign the mapped column back rather than using a chained in-place replace,
# which is not guaranteed to modify the parent frame.
korea["cluster"] = korea["cluster"].map(dict(zip(origin_labels, labels)))

In [None]:
## Build `dc_clusters`: one row per country (indexed by country code) holding
## the average over the last `num_years` years of each series in
## `ser_included`, stored under the matching name from `df_colnames`.
##
## A country is skipped, and removed from `country_dict`, when any of its
## non-GDP series has a missing value in that window.
curr_pos = dc.columns.get_loc(current_year)
recent_years = list(dc.columns[curr_pos - num_years + 1:curr_pos + 1])

rows = []        # one list of sector averages per kept country
kept_codes = []  # country codes, parallel to `rows`
for ctr in cc:
    # Rows of this country, excluding the GDP series which is not a feature.
    wanted = (dc['Country_Code'] == ctr) & (dc['Series_Code'] != 'NY.GDP.MKTP.CD')
    ctr_recent = dc.loc[wanted, ['Series_Code'] + recent_years]
    if ctr_recent.isnull().to_numpy().any():
        # Default of None keeps the cell re-runnable after a previous pop.
        country_dict.pop(ctr, None)
        continue
    avg_by_series = ctr_recent.set_index('Series_Code').mean(axis='columns')
    rows.append([avg_by_series.loc[ser] for ser in ser_included])
    kept_codes.append(ctr)

# Construct the frame once: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic.
dc_clusters = pd.DataFrame(rows, index=kept_codes, columns=df_colnames)
        
def alloc_clu(arr, centers):
    """Return the position in ``centers`` of the centre nearest to ``arr``.

    Distance is Euclidean (``scipy.spatial.distance.euclidean``). When several
    centres are equally near, the one that comes first wins.
    """
    gaps = [distance.euclidean(arr, candidate) for candidate in centers]
    # min() over positions keeps the earliest index on ties.
    return min(range(len(gaps)), key=gaps.__getitem__)
        
## Assign every country to its nearest cluster centre and store the step label.
# Select the feature columns explicitly so that an existing 'cluster' column
# (e.g. when this cell is run again) never enters the distance computation.
feature_matrix = dc_clusters[df_colnames].to_numpy(dtype=float)
nearest = [alloc_clu(row, centers) for row in feature_matrix]

## labeling cluster name
# Same raw-id -> label mapping that was applied to the Korean table
# (`origin_labels`; the previous `origin_label` was an undefined name).
cluster_to_label = dict(zip(origin_labels, labels))
# Ids without a mapping are kept unchanged, as a replace would do.
dc_clusters['cluster'] = [cluster_to_label.get(cid, cid) for cid in nearest]

def get_key(val):
    """Reverse look-up in ``country_dict``: first key whose value equals ``val``.

    Returns ``None`` when no entry matches.
    """
    matches = (code for code, name in country_dict.items() if val == name)
    return next(matches, None)

def classify_step(country_name):
    """Look up the cluster label stored in ``dc_clusters`` for a country name.

    The name is translated to a country code with ``get_key``. If that code is
    not a key of ``country_dict`` (including the case where no code was
    found), the string "Please check your input." is returned instead.
    """
    code = get_key(country_name)
    if code not in country_dict:
        return "Please check your input."
    return dc_clusters.loc[code, 'cluster']


In [None]:
# Execution: prompt for a country name on standard input and pass it to
# classify_step. The call is left as a bare trailing expression; in a notebook
# the value of a cell's last expression is shown as that cell's output.
classify_step(input('Which country are you curious about?  '))