In [1]:
import numpy as np
import pandas as pd
import graphviz
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# Formatting data

In [2]:
#Observations:  200 patients, 4 clinical variables:  
#age of patients(continuous), 
#HbA1C (continuous), 
#insuline taken (categorical, yes or not), 
#other anti-diabetic drugs are taken (categorical, yes or not)

data_diabetes = pd.read_table('data/patients_data.txt',sep='\t', header=None)


#Classes:  0 (Diabetes Remission) and 1 (Non-Remission) for 200 patients
classes_diabetes = pd.read_table('data/patients_classes.txt', header=None)

data_diabetes, classes_diabetes

(             0          1  2  3
 0    51.650893   9.679493  0  0
 1    64.627752   7.970522  0  1
 2    69.550752   6.368363  1  0
 3    65.270695   8.812844  0  1
 4    62.681176  10.322206  1  1
 ..         ...        ... .. ..
 195  45.200346   7.315751  0  1
 196  46.261624   6.520131  0  0
 197  41.424692   5.772244  0  1
 198  43.681741   7.993931  0  1
 199  62.970023   5.466263  1  0
 
 [200 rows x 4 columns],      0
 0    1
 1    1
 2    1
 3    1
 4    1
 ..  ..
 195  0
 196  0
 197  0
 198  0
 199  0
 
 [200 rows x 1 columns])

In [3]:
data_diabetes.isnull().values.any()

False

# Analysis

## Decision Tree

In [4]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data_diabetes, classes_diabetes)

feature_names = ['age', 'hba1c', 'insuline taken', 'other drugs taken']
classes = ['DR','NDR']
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=feature_names,
                                class_names=classes,
                                filled=True,
                                rounded=True,special_characters=True)

graph = graphviz.Source(dot_data)
graph.render("diabetes remission")

'diabetes remission.pdf'

## Random Forest

In [5]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(data_diabetes, classes_diabetes)

clf.predict(data_diabetes)

  


array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1])

In [6]:
clf.feature_importances_

array([0.57112592, 0.14837838, 0.18705705, 0.09343866])

## DiaRem

In [7]:
#formatting dataframe to diarem format

ages = data_diabetes[0]
HbA1C = data_diabetes[1]
insul = data_diabetes[2]
other = data_diabetes[3]

formated_data = []
for a, h, i, o in zip(ages, HbA1C, insul, other) : 
    formated_row = [None]*4
    
    if(a < 40):
        n_a = 0
    elif(a >= 40 and a < 50):
        n_a = 1
    elif(a >= 50 and a < 60):
        n_a = 2
    else:
        n_a = 3
    formated_row[0] = n_a
    
    if(h < 6.5):
        n_h = 0
    elif(h >= 6.5 and h < 7):
        n_h = 2
    elif(h >= 7 and h < 9):
        n_h = 4
    else:
        n_h = 6
    formated_row[1] = n_h
    
    if(i):
        n_i = 10
    else:
        n_i = 0
    formated_row[2] = n_i
        
    if(o):
        n_o = 3
    else:
        n_o = 0
    formated_row[3] = n_o
    
    formated_data.append(formated_row)

diarem_df = pd.DataFrame.from_records(formated_data)
diarem_df

Unnamed: 0,0,1,2,3
0,2,6,0,0
1,3,4,0,3
2,3,0,10,0
3,3,4,0,3
4,3,6,10,3
...,...,...,...,...
195,1,4,0,3
196,1,2,0,0
197,1,0,0,3
198,1,4,0,3


In [8]:
#classify dataframe by criteria: 
#  True(diabete remission) if sum(row) < 7

diarem_predict = []
for index, row in diarem_df.iterrows():
    if( sum(row) < 7 ):
        diarem_predict.append(0) #remission
    else:
        diarem_predict.append(1) #non-remission

diarem_predict = np.array(diarem_predict)
diarem_predict

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1])

# Comparison

In [9]:
#decision trees

data_diabetes["class"] = classes_diabetes[0].values.tolist()
data_diabetes = data_diabetes.sample(frac=1) #shuffle

chunks = []
for grp, df in data_diabetes.groupby(np.arange(len(data_diabetes)) // (len(data_diabetes)/5) ):
    chunks.append(df)

In [10]:
clf = tree.DecisionTreeClassifier()

scores = []
for i in range(len(chunks)):
    test_X = chunks[i].copy()
    test_y = test_X["class"]
    test_X.drop(columns=['class'], inplace=True)
    
    train_chunks = [chunks[c].copy() for c in range(len(chunks)) if(c != i)]
    train_X = pd.concat(train_chunks)
    
    train_y = train_X["class"]
    train_X.drop(columns=['class'], inplace = True)
    
    clf.fit(train_X, train_y)
    prediction = clf.predict(test_X)
    
    score = sum( [1 if(y == y_est) else 0 for y,y_est in zip(test_y, prediction)] ) / len(test_y)
    scores.append(score)
    
np.array(scores).mean()

0.6699999999999999

In [11]:
true_labels = np.array(classes_diabetes[0])

In [12]:
#random forest

clf = RandomForestClassifier(max_depth=2, random_state=0)

#clf.fit(data_diabetes, classes_diabetes)
#prediction = clf.predict(data_diabetes)
#score = sum( [1 if(y == y_est) else 0 for y,y_est in zip(np.array(true_labels), prediction)] ) / len(true_labels)

scores = []
for i in range(len(chunks)):
    test_X = chunks[i].copy()
    test_y = test_X["class"]
    test_X.drop(columns=['class'], inplace=True)
    
    train_chunks = [chunks[c].copy() for c in range(len(chunks)) if(c != i)]
    train_X = pd.concat(train_chunks)
    
    train_y = train_X["class"]
    train_X.drop(columns=['class'], inplace = True)
    
    clf.fit(train_X, train_y)
    prediction = clf.predict(test_X)
    
    score = sum( [1 if(y == y_est) else 0 for y,y_est in zip(test_y, prediction)] ) / len(test_y)
    scores.append(score)
    
np.array(scores).mean()



0.7150000000000001

In [13]:
#diarem

score = sum( [1 if(y == y_est) else 0 for y,y_est in zip(np.array(true_labels), diarem_predict)] ) / len(true_labels)
score

0.725

Nous avons un résultat pour **decision tree** tournant autour de 67% de précision.  
**random forest** et **Diarem** sont plus précis autour de 72%.  

Les méthodes machines learning peuvent égaler l'expertise dans le domaine.  
De ce fait les méthodes de machine learning seront très utiles pour des problèmes auquels nous n'avons pas encore de solution précise.  