In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Load the dataset.
country_data = pd.read_csv('Country-data.csv')

# Feature columns: every column apart from the name of the country.
# These are the columns drawn in the per-feature line plots.
columns_ = list(country_data.columns)
columns_.remove('country')

In [None]:
# Line plot of every feature (all columns apart from the country name),
# one subplot per feature on a two-column grid.
# The number of rows is derived from the number of features, so the grid
# never overflows the way a fixed 5x2 layout does with more than 10 columns.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(columns_) // n_grid_cols))  # ceiling division
feature_fig, feature_axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(12, 3 * n_grid_rows), squeeze=False
)
flat_axes = feature_axes.ravel()

for feature_ax, col in zip(flat_axes, columns_):
    sns.lineplot(data=country_data[col], ax=feature_ax)
    feature_ax.grid()
    feature_ax.set_xlabel('Countries')

# Hide the grid slots that did not receive a feature.
for unused_ax in flat_axes[len(columns_):]:
    unused_ax.set_visible(False)

# Keep the axis labels of neighbouring subplots from overlapping.
feature_fig.tight_layout()

In [None]:
# Plot a heatmap to check the correlation between the features.
# Only numeric columns are used: the frame also holds the textual 'country'
# column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
numeric_features = country_data.select_dtypes(include='number')
heat_map = sns.heatmap(numeric_features.corr(), annot=True)

In [None]:
# K-means clustering based on the correlation between child mortality and total fertility.
# (The feature pair was chosen from the heatmap by looking for the highest correlations.)
# (The highest one was between income and gdpp, but when plotted the clustering was not so clear.)
# (The second highest, between child mortality and total fertility, gave a very clear result.)
sns.scatterplot(data=country_data, x='child_mort', y='total_fer')

In [None]:
# Feature matrix for the clustering: one row per country, with columns
# (child_mort, total_fer), as a C-contiguous float32 array.
# A single conversion replaces the former astype + np.vstack round trip, which
# re-stacked the rows of an already two-dimensional array (and raised on an
# empty frame).
child_fer = np.array(country_data[['child_mort', 'total_fer']], dtype=np.float32, order='C')

In [None]:
# Three clusters are used, to separate the countries that need help from the ones
# that partially need help and the ones that do not need help.
# random_state pins the centroid initialisation so the fit is reproducible from
# run to run. The numbering of the clusters is still arbitrary: a label value
# carries no meaning by itself.
kmeans = KMeans(n_clusters=3, n_init=15, random_state=42)
kmeans.fit(child_fer)
clusters = kmeans.labels_

In [None]:
# Plot the clustering result: one colour per cluster, centroids in red.
fig, ax = plt.subplots()
idxs = np.unique(kmeans.labels_)

for i in idxs:
    points = child_fer[kmeans.labels_ == i, :]
    # The label lets the legend tie each colour to its cluster index.
    ax.scatter(points[:, 0], points[:, 1], label=f'cluster {i}')

# Centroids are drawn last, in a single call, so that the points of another
# cluster can never be painted over them.
centroids = kmeans.cluster_centers_[idxs]
ax.scatter(centroids[:, 0], centroids[:, 1], s=100, c='red', label='centroids')

ax.set_title('k-means based on the correlation between child mortality and total fertility')
ax.set_xlabel('child_mort')
ax.set_ylabel('total_fer')
ax.legend()

In [None]:
# Show the list of the countries which need help, based on the result of the clustering.

country_data['cluster'] = clusters

# K-means numbers its clusters arbitrarily, so label 0 is not guaranteed to be the
# group in need. The group in need is taken here to be the cluster with the
# highest mean child mortality.
need_help_cluster = country_data.groupby('cluster')['child_mort'].mean().idxmax()
need_help = country_data[country_data['cluster'] == need_help_cluster]
country_names = need_help['country']

print("Countries that need help, based on the clustering: ")
print(country_names)