In [None]:
import collections

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the tab-separated dataset; the first row holds the column names.
data = pd.read_csv(
    'data/wb_dataset.txt',
    sep='\t',
    header=0,
)
data

Mortalité par cancer en fonction des dépenses en santé du pays, des dépenses directes de chaque citoyen en santé, du PIB et de l'indice de capital humain (World Bank Indice)

Attribute selection
===================

* Remove attributes containing many missing values.

* Remove attributes that seem redundant because of strong correlations.

* Use derived attributes to exhibit variables that could be more pertinent (relative value, ratio, difference, relative variation, indicator combining several attributes, ...).

* Choose a subset of the dimensions to focus on some aspects and/or to simplify the interpretations. For methods that use distances, reduce the number of dimensions between 4 and 6, and standardize the data if necessary.

Remark: It is possible to complete the dataset with other sources.

In [None]:
# Renaming the attributes for easier use (original name = new name)
# Time = removed
# Time Code = removed
# Country Name = country_name
# Country Code = country_code
# Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%) = mortality
# Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS] = health_cost_percent
# Current health expenditure per capita (current US$) [SH.XPD.CHEX.PC.CD] = health_cost_capita_percent
#   (note: despite the "_percent" suffix, this indicator is expressed in current US$)
# Out-of-pocket expenditure (% of current health expenditure) [SH.XPD.OOPC.CH.ZS] = health_oop_cost
# GDP (current US$) [NY.GDP.MKTP.CD] = gdp
# GDP per capita (current US$) [NY.GDP.PCAP.CD] = gdp_capita
# Human capital index (HCI) (scale 0-1) [HD.HCI.OVRL] = hci

In [None]:
# Remove the two leading columns (time and time code).
data.drop(columns=["Time", "Time Code"], inplace=True)
# The last 5 rows are now empty: drop them as well.
trailing_rows = data.tail(5).index
data.drop(index=trailing_rows, inplace=True)

In [None]:
# Short attribute names, in the same order as the remaining columns.
data.columns = [
    'country_name',
    'country_code',
    'mortality',
    'health_cost_percent',
    'health_cost_capita_percent',
    'health_oop_cost',
    'gdp',
    'gdp_capita',
    'hci',
]
data

In [None]:
# Correlation between health_cost_percent and health_cost_capita_percent.
# ".." marks a missing value. Only rows where BOTH attributes are present are
# kept, so that the two series stay paired row by row (filtering each column
# on its own could give series of different lengths or misaligned countries).
both_present = (data["health_cost_percent"] != "..") \
    & (data["health_cost_capita_percent"] != "..")
hcp = pd.to_numeric(data.loc[both_present, "health_cost_percent"],
                    downcast='float')
hccp = pd.to_numeric(data.loc[both_present, "health_cost_capita_percent"],
                     downcast='float')
# There seems to be a correlation between health_cost_percent and
# health_cost_capita_percent, so we consider removing
# health_cost_capita_percent.
# TODO: this choice still has to be confirmed.
np.corrcoef(hcp, hccp)

In [None]:
# Correlation between gdp and gdp_capita.
# ".." marks a missing value. Only rows where BOTH attributes are present are
# kept, so that the two series stay paired row by row.
both_present = (data["gdp"] != "..") & (data["gdp_capita"] != "..")
gdp = pd.to_numeric(data.loc[both_present, "gdp"], downcast='float')
gdpc = pd.to_numeric(data.loc[both_present, "gdp_capita"], downcast='float')
# There does not seem to be a very strong correlation between gdp and
# gdp_capita.
np.corrcoef(gdp, gdpc)

Object selection
================

* Remove objects containing missing values (except if using methods that handle clearly the missing values).

* Identify the outliers (exceptional objects, noise, ...) in 1D, 2D, n-dimensions. Keep track of them and possibly remove them.


In [None]:
# Region rows are aggregates of country data, which we are not interested in.
# Everything located after the "ZWE" row is dropped.
zwe_label = data.index[data['country_code'] == "ZWE"][0]
count_removed_region = len(data) - zwe_label - 1
if count_removed_region > 0:
    region_rows = data.tail(count_removed_region).index
    data.drop(region_rows, inplace=True)

print(count_removed_region)

In [None]:
# Drop the objects for which the mortality value is missing ("..").
no_mortality = data['mortality'] == ".."
count_removed_nodata = int(no_mortality.sum())
if count_removed_nodata > 0:
    data.drop(data.index[no_mortality], inplace=True)

print(count_removed_nodata)

In [None]:
# We choose to remove the objects where at least one of the selected
# attributes is missing (".." is the missing-value marker).
# Counting the markers per row with a boolean mask also works when no ".."
# is left in the table (looking up a ".." column in per-row value counts
# raises a KeyError in that case, e.g. when this cell is run a second time).
missing_counts = (data == "..").sum(axis=1)
missing_attributes = pd.DataFrame({"..": missing_counts})
missing_attributes_index = \
    missing_attributes.index[missing_attributes[".."] > 0]
if len(missing_attributes_index) > 0:
    data.drop(missing_attributes_index, inplace=True)
print(len(missing_attributes_index))
data

In [None]:
# Leftover scratch line, kept for reference only: per-column numeric
# conversion (the conversion is actually done for all columns in the next cell).
#pd.to_numeric(data[data["mortality"] != ".."][nomColonne], downcast='float')

In [None]:
# Convert every indicator column from text to numbers.
numeric_columns = [
    'mortality',
    'health_cost_percent',
    'health_cost_capita_percent',
    'health_oop_cost',
    'gdp',
    'gdp_capita',
    'hci',
]
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

In [None]:
# Looking at the outliers.
# Mortality is discretised into 5 equal-width bins; the objects falling in
# the two highest bins ("high" and "very_high") are treated as outliers and
# removed from `data`.
# The bins are computed directly on `data`, so this cell runs on a fresh
# kernel without needing a table that is only created in a later cell.
# NOTE: the cell is not idempotent - running it again re-bins the remaining
# objects and removes the new two highest bins.
classes = pd.cut(data["mortality"], 5,
                 labels=["low", "medium", "average", "high", "very_high"])
outliers_index = data.index[classes.isin(["high", "very_high"])]
data.drop(outliers_index, inplace=True)

In [None]:
# Independent snapshot of the cleaned table; the clustering cells below read
# their attributes from this copy.
full_clean_dataset = data.copy()


Clustering
==========

* Look for clusters of globular shapes and of arbitrary shapes, using in particular K-means, hierarchical complete/single and DBSCAN.

* Compute dendrograms for hierarchical clustering.

* Determine good candidates for the number of clusters (using SSE, silhouette coefficient and grouping distance curves).

* Study the stability of the K-means convergence.

* Compare (using entropy or mutual entropy, and contingency tables) the content of the clusters to a known labelling or to the result of another clustering.

* Describe the envelope (the borders) of the clusters using a decision tree (on a dataset having at least 4 dimensions).


Remarks:

- Removing outliers can improve the stability and the dispersion.

- Clustering evaluation can be made by comparing the SSE and silhouette coefficient obtained on the data to their values on a random dataset or on partially randomized data.

#### Hierarchical clustering

In [None]:
# Feature matrix X for hierarchical clustering (mortality is not among the
# selected columns). A mortality-based label Y could be built as:
#Y=pd.cut(full_clean_dataset["mortality"],5, labels=["low","medium", "average", "high","very_high"])
# NOTE(review): the attributes are used unscaled here - confirm that this is
# intended, since gdp is on a much larger scale than the other columns.
feature_columns = [
    'health_cost_percent',
    'health_cost_capita_percent',
    'health_oop_cost',
    'gdp',
    'gdp_capita',
    'hci',
]
X = full_clean_dataset[feature_columns]

In [None]:
# Complete-linkage agglomerative clustering on Euclidean distances.
Z = sch.linkage(
    X,
    method='complete',
    metric='euclidean',
    optimal_ordering=True,
)
# Draw the dendrogram, one leaf per country.
country_labels = full_clean_dataset['country_name'].tolist()
fig = plt.figure(figsize=(20, 40))
dendro = sch.dendrogram(
    Z,
    orientation='left',
    leaf_rotation=0,
    leaf_font_size=15,
    labels=country_labels,
)

In [None]:
# Grouping distance of each successive merge (third column of the linkage
# matrix), used to look for a good number of clusters.
fig = plt.figure(figsize=(10, 7))
merge_distances = Z[:, 2]
plt.plot(merge_distances, 'o-')
plt.grid(axis='y')

We used complete-linkage hierarchical clustering. Looking at the grouping distance curve, it seems that 6 groups can be highlighted. We will test this method with 6 clusters.

#### DBScan

In [None]:
objects = full_clean_dataset[['health_cost_percent',
                              'health_cost_capita_percent',
                              'health_oop_cost', 'gdp', 'gdp_capita', 'hci']]
# DBSCAN relies on distances: centre every attribute on its mean and divide
# it by its range so that all attributes are on a comparable scale.
objects = (objects - objects.mean()) / (objects.max() - objects.min())

# Sweep eps from 0.100 to 0.199 and record, for every step, how many objects
# are left unclassified and how many clusters are found.
eps_steps = list(range(0, 100))
unclassified = []
nb_cluster = []
for i in eps_steps:
    dbscan = DBSCAN(eps=(0.10 + (i*0.001)), min_samples=2)
    dbscan.fit(objects)
    labels = dbscan.labels_
    # Label -1 marks the noise (unclassified) objects.
    unclassified.append(np.count_nonzero(labels == -1))
    # Cluster labels start at 0, so the number of clusters is the number of
    # distinct non-noise labels (max(labels) would be one too small, and -1
    # when every object is noise).
    nb_cluster.append(len(np.unique(labels[labels != -1])))
plt.plot(eps_steps, unclassified, label="unclassified objects")
plt.plot(eps_steps, nb_cluster, label="number of clusters")
plt.axhline(y = len(objects)*0.2, color = 'r', linestyle = '-',
            label="20% of value is unclassified")
plt.xlabel("step i (eps = 0.10 + 0.001 * i)")
# Without a legend the labels given above are never displayed.
plt.legend()

If we want at least 80% of our objects to be classified, we must choose eps of at least about 0.136; we choose eps = 0.145.

In [None]:
# Final DBSCAN run with the retained radius; show the label of every object.
dbscan = DBSCAN(min_samples=2, eps=0.145).fit(objects)
dbscan.labels_

In [None]:
# Pair plot of the objects that DBSCAN assigned to a cluster.
cluster = dbscan.labels_
# The labels are attached to a copy: adding the column to `objects` itself
# would turn the label into a feature if the DBSCAN cell were run again.
clustered_objects = objects.assign(cluster=cluster)
# Label -1 marks noise; those objects are left out of the plot.
no_noise_objects = clustered_objects[clustered_objects['cluster'] != -1]
sns.pairplot(data=no_noise_objects, hue='cluster')

DBSCAN seems to have difficulty differentiating clusters. There seems to be a lot of noise in its predictions. This may be caused by the scaling and centering of the data.

#### K-means

In [None]:
# Get the SSE when varying the number of clusters.
objects = full_clean_dataset[['health_cost_percent',
                              'health_cost_capita_percent',
                              'health_oop_cost', 'gdp', 'gdp_capita', 'hci']]
# Candidate numbers of clusters; also used as the x axis of the plot.
k_list = list(range(2, 11))
sse_list = []
for i in k_list:
    # Fixed seed so that the curve is the same on every run.
    km_i_clusters = KMeans(n_clusters=i, random_state=0)
    km_i_clusters.fit(objects)
    # inertia_ is the sum of squared distances to the closest centroid (SSE).
    sse_list.append(km_i_clusters.inertia_)
fig = plt.figure(figsize=(7, 4))
plt.plot(k_list, sse_list, 'bo--')
plt.grid()
plt.xlabel("nb of clusters", fontsize=14)
plt.ylabel("SSE", fontsize=14)
plt.show()

Comparing the SSE values, 4 and 5 clusters both seem to be good choices. We decide to use 5 clusters.

In order to know if our data is noisy or if the clusters are easily drawn, we study the stability of the K-means convergence

In [None]:
def compute_stability(km,df,iterations=100):
    """Measure how stable a clustering is over repeated fits.

    The estimator is fitted `iterations` times on `df`; after each fit the
    average silhouette coefficient (Euclidean metric) of the resulting
    labelling is computed.

    Parameters
    ----------
    km : estimator exposing ``fit(df)`` and ``predict(df)``
        For the result to be meaningful, each fit should be able to start
        from a different initialisation.
    df : array-like of shape (n_samples, n_features)
        The objects to cluster.
    iterations : int, default 100
        Number of times the estimator is fitted.

    Returns
    -------
    float
        Standard deviation of the average silhouette coefficients over the
        runs (a low value means the runs give clusterings of similar quality).
    """
    avg_silhouette_coef = []
    for _ in range(iterations):
        km.fit(df)
        labels = km.predict(df)
        # silhouette_score lives in sklearn.metrics, which the notebook
        # imports as `metrics`.
        avg_silhouette_coef.append(
            metrics.silhouette_score(df, labels, metric='euclidean'))
    avg_silhouette_coef = np.asarray(avg_silhouette_coef)
    return avg_silhouette_coef.std()

In [None]:
# K-means with a single random initialisation per fit, so that successive
# fits are free to start from different centroids.
km = KMeans(init='random', n_init=1, n_clusters=5)
compute_stability(km, df=objects)

The spread (standard deviation of the average silhouette coefficient) is very low: whatever the initial points chosen by K-means, it seems to converge to the same clusters.

In [None]:
# Final K-means model with 5 clusters.
# The seed is fixed so that the cluster ids (which the interpretation below
# refers to) do not change from one run to the next.
km = KMeans(n_clusters=5, random_state=0)
result = km.fit(objects)

#### Method Comparison

1) Hierarchical Clustering Score - 6 clusters

In [None]:
# Cut the dendrogram into at most 6 flat clusters and compare them with the
# mortality discretised into 6 bins (contingency table shown as a heat map).
clusters = sch.fcluster(Z, t=6, criterion='maxclust')
mortality_labels = ["low", "medium", "average",
                    "high", "very_high", "very_very_high"]
classes = pd.cut(full_clean_dataset["mortality"], bins=6,
                 labels=mortality_labels)
crosstab = pd.crosstab(index=clusters, columns=classes)
sns.heatmap(data=crosstab, annot=True)

2) DBScan - 5 clusters

In [None]:
# Contingency table between the DBSCAN labels and the mortality discretised
# into 5 bins, shown as a heat map.
cluster = dbscan.labels_
mortality_labels = ["low", "medium", "average", "high", "very_high"]
classes = pd.cut(full_clean_dataset["mortality"], bins=5,
                 labels=mortality_labels)
crosstab = pd.crosstab(index=cluster, columns=classes)
sns.heatmap(data=crosstab, annot=True)

3) Kmeans - 5 clusters

In [None]:
# Contingency table between the K-means labels and the mortality discretised
# into 5 bins, shown as a heat map.
clusters = result.labels_
mortality_labels = ["low", "medium", "average", "high", "very_high"]
classes = pd.cut(full_clean_dataset["mortality"], bins=5,
                 labels=mortality_labels)
crosstab = pd.crosstab(index=clusters, columns=classes)
sns.heatmap(data=crosstab, annot=True)

In [None]:
# Attach the cluster id to a copy of the objects and colour the pair plot by it.
objects = objects.assign(cluster=clusters)
sns.pairplot(objects, hue='cluster')

In [None]:
# Objects assigned to cluster 3.
objects.loc[objects['cluster'] == 3]

We cannot find any real relationship between the different countries by clustering our dataset.

We can only spot 3 tendencies

* A first group of countries looks globally similar. It contains Australia, France, Italy, Korea and Canada, for instance (cluster 2).

* A second group of countries, containing only Germany and Japan, which seem to be really cost-efficient in decreasing mortality.

* Two specific countries, each alone in its own cluster because of its specificity: China and the USA
    * China has a very high GDP but an average health system
    * The USA has a very expensive health system that is not cost-efficient at all

Classification
==============

* Construct a label by discretisation of an attribute (this label can be built by clustering the values of this attribute). Use this label as class label.

* Compare the results obtained using decision trees and the K nearest neighbors.

* Evaluate the quality of the model using cross validation. Report the score for each subset and the global score.

* Modify the learning parameters to detect possible overfitting.


Remarks:

- A contingency table can be used to analyse the errors by class.

- Removing outliers can reduce error.

- A classification model can be used to predict labels of a targeted attribute for objects where this attribute value is missing.


In [None]:
from sklearn.model_selection import train_test_split

+ split the dataset into a train and test set

In [None]:
# Features: the explanatory indicators only. The identifiers (country name
# and code) are left out, and so is mortality, since the class label is
# derived from it.
X = data[['health_cost_percent', 'health_cost_capita_percent',
          'health_oop_cost', 'gdp', 'gdp_capita', 'hci']].copy()
# Class label: mortality discretised into 5 equal-width bins, with the same
# bin names as in the clustering comparison.
y = pd.cut(data["mortality"], 5,
           labels=["low", "medium", "average", "high", "very_high"])
# One third of the objects is held out for testing; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0)

+ create a Decision Tree instance

In [None]:
# Decision tree with default learning parameters; the seed is fixed so that
# the tree built from a given training set is the same on every run.
dt = DecisionTreeClassifier(random_state=0)

Report content and format
=========================

* Keep track of the choices made and justify them.

* Give commands and parameters so that the results can be reproduced.

* Show in tables some views of subsets of the data, but not the complete view of all objects in the data. Give the attributes and their units, as well as the number of objects used.

* Give results (values, graphics,...).

* Try to interpret the results in the domain.


Remarks:

- It is possible to study several subsets of attributes (e.g., one for the clustering part, another for the classification tasks).

- The report can suggest directions for future works, e.g., directions that have not been explored due to time constraints.

- The document can be split in two reports: one for the clustering and one for the classification tasks.


How to edit the report
======================

A way to prepare the report is to add "Markdown" cells in the Jupyter notebook to insert text, and use #, ##, ###, #### for headings (section, subsection, subsubsection, paragraph). Next, to get a latex version of the notebook use: "File -> Download as -> LaTeX" (this requires Pandoc to be installed https://pandoc.org/installing.html). And then edit the .tex if needed (to add a title page, to clean some parts, ...), before compiling it.

IMPORTANT 1: the end of long lines in cells containing python commands can be suppressed (due to the latex verbatim mode), to avoid this use lines of at most 80 characters and use "\" to continue the command on the next line.

IMPORTANT 2: if using "File -> print preview" to generate a pdf of the notebook, then the sections will not be numbered, and check also that there is no missing part in long lines.

IMPORTANT 3: "File -> Download as -> LaTeX" may not work for the graphical representation of decision trees, depending on the version of sklearn and on external installed softwares. A workaround is to generate the pdf of the tree and then to include the pdf by a latex command.
Example, with a cell containing the code:
graph = graphviz.Source(dot_data)
graph # to display the tree

replace the cell content by:
graph = graphviz.Source(dot_data)
graph.render("out_iris_decision_tree") # to generate a pdf file

then just below this cell add a Markdown cell containing the following three lines:
\begin{center}
\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{out_iris_decision_tree.pdf}
\end{center}

then generate the latex file with "File -> Download as -> LaTeX", put the file out_iris_decision_tree.pdf in the folder of the latex file, and compile the latex file.

---

Zip file to be sent by mail
===========================
(mail to Christophe.Rigotti@insa-lyon.fr and Sergio.Peignier@insa-lyon.fr)

Prepare a single folder named after the authors (in lexicographic order): NAME1_NAME2. Zip this folder and send the file NAME1_NAME2.zip

The folder must contain:

- zip file(s) of the data file(s), (txt format to reproduce the work). 

- file(s), (txt format or pdf) containing the definitions of the variables given by the data provider.

- the Jupyter notebook(s) (format .ipynb to reproduce the work).

- the report in one or two pdf files.

