In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder

from keras import layers
from keras import models

In [2]:
dateColumnNames = ['contact_date','Glycemie_der_date','HbA1c_der_date','der_date_poids','der_date_taille','first_contact_date']

dfView = pd.read_csv('PatientsHTA.zip',nrows=1)
df = pd.read_csv('PatientsHTA.zip',engine='c',parse_dates=dateColumnNames)

# Suppression des lignes trop peu nombreuses

Nous souhaitons faire un apprentissage en utilisant la dimension temporelle comme filtre pour le CNN. Pour ça il faut donc que nous ayons plusieurs entrée. Avant de commencer à traîter les données nous supprimons donc toutes les lignes qui n'ont pas plusieurs entrées de ```person_id```. Nous choisissons arbitrairement que pour être utile à l'apprentissage, il faut au moins 10 entrées dans cette colonne

In [3]:
valueCounts = df.person_id.value_counts()
dfEnought = df[df.person_id.isin(valueCounts[valueCounts.values >= 4].index)]

# Suppression des colonnes innutiles

## Suppression de ```Age_now```

Nous pouvons supprimer la colonne ```Age_now``` car les données qu'elle contient sont identique à celles de la colonne ```year_of_birth```

In [4]:
dfWithoutAgeNow = dfEnought.drop('Age_now', axis='columns')

## Suppression de ```contact_id```

In [5]:
dfWithoutContactID = dfWithoutAgeNow.drop('contact_id',axis='columns')

## Suppression des noms de médicaments

In [6]:
dfGroupedByMoleculeLabel = dfWithoutContactID.groupby('product_atc_code')[['molecule_label','short_name','long_name','Classe','product_atc']].count()
dfGroupedByMoleculeLabel

Unnamed: 0_level_0,molecule_label,short_name,long_name,Classe,product_atc
product_atc_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C02AC06,4288,4288,4288,4288,4288
C03BX03,932,932,932,932,932
C03CA01,877,877,877,877,877
C03DA01,187,187,187,187,187
C03DA04,27,27,27,27,27
C03EA04,75,75,75,75,75
C03EB01,14,14,14,14,14
C07AA05,2,2,2,2,2
C07AB03,6466,6466,6466,6466,6466
C07AB04,39,39,39,39,39


Nous voyons que les différentes colonnes de noms de médicaments sont identiques, nous pouvons donc n'en garder qu'une seule. Nous choisirons de garder ```product_atc_code```

In [7]:
dropColumnNames = dfGroupedByMoleculeLabel.columns.to_list()
dfWithATCCode = dfWithoutContactID.drop(dropColumnNames, axis='columns')

In [8]:
dfWithATCCode

Unnamed: 0,person_id,specialty_label,contact_date,cip,dosage_1,dose_1,dose_2,product_atc_code,box,quantity,...,HbA1c_der_date,HbA1c_der_mesure,gender_code,Age_presc,year_of_birth,Poids,der_date_poids,Taille,der_date_taille,first_contact_date
1,263659.0,Médecin généraliste,2014-09-24,3.400960e+12,20.0,240.0,24.0,C03CA01,1.0,4.00,...,NaT,,M,96.0,1918.0,,NaT,,NaT,1998-04-02
2,263659.0,Médecin généraliste,2014-09-12,3.400960e+12,20.0,240.0,24.0,C03CA01,1.0,4.00,...,NaT,,M,96.0,1918.0,,NaT,,NaT,1998-04-02
3,263659.0,Médecin généraliste,2015-03-26,3.400960e+12,20.0,240.0,24.0,C03CA01,1.0,4.00,...,NaT,,M,97.0,1918.0,,NaT,,NaT,1998-04-02
4,263659.0,Médecin généraliste,2015-06-05,3.400960e+12,20.0,240.0,24.0,C03CA01,1.0,4.00,...,NaT,,M,97.0,1918.0,,NaT,,NaT,1998-04-02
6,25182917.0,Médecin généraliste,2013-03-27,3.400960e+12,300.0,16800.0,56.0,C09XA02,2.0,1.00,...,NaT,,M,63.0,1950.0,,NaT,,NaT,2011-04-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50625,26636453.0,Médecin généraliste,2016-11-16,3.400940e+12,500.0,15000.0,30.0,C03CA01,1.0,0.75,...,2017-10-03,5.60,M,79.0,1937.0,83.9,2018-02-21,,NaT,2012-02-14
50627,18889430.0,Médecin généraliste,2013-02-01,3.400940e+12,500.0,15000.0,30.0,C03CA01,1.0,0.50,...,2017-05-15,8.01,M,85.0,1928.0,,NaT,,NaT,2007-12-02
50629,2222336.0,Médecin généraliste,2013-07-01,3.400940e+12,500.0,15000.0,30.0,C03CA01,2.0,0.25,...,NaT,,F,78.0,1935.0,,NaT,,NaT,2006-02-23
50631,11363518.0,Médecin généraliste,2013-01-24,3.400940e+12,500.0,15000.0,30.0,C03CA01,1.0,0.25,...,NaT,,M,80.0,1933.0,,NaT,,NaT,1998-01-27


## Suppression des colonnes ```'*der*'```

Les colonnes ```*der*``` contiennent la dernière données. Cette donnée peut être récupérée grâce à la date de la visite et à aux valeurs mesurées. Par exemple, il n'est pas nécessaire d'avoir une colonne ```der_date``` ou ```der_mesure```. Les données de ces deux types de colonnes peuvent être récupéré grâce à la ligne qui correspond à la dernière date de la mesure, que l'on peut trouver grâce à la colonne ```contact_date```

In [9]:
derColumnNames = []

for c in dfWithATCCode.columns:
    if ('der_date' in c) or ('der_mesure' in c):
        derColumnNames.append(c)

dfWithoutDer = dfWithATCCode.drop(derColumnNames,axis='columns')

# Traîtement des données

## Conversion des données

### Ajout du temps entre chaque visite (ce que l'on veut prédire)

In [10]:
wait_time = dfWithoutDer.contact_date - dfWithoutDer.first_contact_date
wait_time = wait_time.dt.total_seconds() / (24 * 3600)
dfWithoutDer = dfWithoutDer.drop('first_contact_date',axis='columns')
dfWithoutDer['wait_time'] = wait_time

### Encodage des valeurs non numériques

In [11]:
specialtyEncoder = LabelEncoder()
ATCCodeEncoder = LabelEncoder()
frequencyLabelEncoder = LabelEncoder()
traitementAutresLabelEncoder = LabelEncoder()
traitementInsulineLabelEncoder = LabelEncoder()
genderEncoder = LabelEncoder()

dfWithoutDer.specialty_label = specialtyEncoder.fit_transform(dfWithoutDer.specialty_label)
dfWithoutDer.product_atc_code = ATCCodeEncoder.fit_transform(dfWithoutDer.product_atc_code)
dfWithoutDer.frequency_label = frequencyLabelEncoder.fit_transform(dfWithoutDer.frequency_label.astype(str))
dfWithoutDer.Traitement_Autres_A10_dep_201701 = traitementAutresLabelEncoder.fit_transform(dfWithoutDer.Traitement_Autres_A10_dep_201701.astype(str))
dfWithoutDer.Traitement_Insulines_dep_201701 = traitementInsulineLabelEncoder.fit_transform(dfWithoutDer.Traitement_Insulines_dep_201701.astype(str))
dfWithoutDer.gender_code = ATCCodeEncoder.fit_transform(dfWithoutDer.gender_code)


### Conversion en ```TimeSeries```

on définit simplement le nouvel index comme la colonne donnant l'intervalle de temps entre chaque visite

In [12]:
dfWithoutDer

Unnamed: 0,person_id,specialty_label,contact_date,cip,dosage_1,dose_1,dose_2,product_atc_code,box,quantity,...,Tension Diastolique,Tension Systolique,Glycemie_prescription,HbA1c_prescription,gender_code,Age_presc,year_of_birth,Poids,Taille,wait_time
1,263659.0,2,2014-09-24,3.400960e+12,20.0,240.0,24.0,2,1.0,4.00,...,110.0,60.0,,,1,96.0,1918.0,,,6019.0
2,263659.0,2,2014-09-12,3.400960e+12,20.0,240.0,24.0,2,1.0,4.00,...,118.0,70.0,,,1,96.0,1918.0,,,6007.0
3,263659.0,2,2015-03-26,3.400960e+12,20.0,240.0,24.0,2,1.0,4.00,...,120.0,70.0,,,1,97.0,1918.0,,,6202.0
4,263659.0,2,2015-06-05,3.400960e+12,20.0,240.0,24.0,2,1.0,4.00,...,120.0,70.0,,,1,97.0,1918.0,,,6273.0
6,25182917.0,2,2013-03-27,3.400960e+12,300.0,16800.0,56.0,31,2.0,1.00,...,145.0,85.0,,,1,63.0,1950.0,,,723.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50625,26636453.0,2,2016-11-16,3.400940e+12,500.0,15000.0,30.0,2,1.0,0.75,...,120.0,60.0,,,1,79.0,1937.0,83.9,,1737.0
50627,18889430.0,2,2013-02-01,3.400940e+12,500.0,15000.0,30.0,2,1.0,0.50,...,120.0,70.0,,,1,85.0,1928.0,,,1888.0
50629,2222336.0,2,2013-07-01,3.400940e+12,500.0,15000.0,30.0,2,2.0,0.25,...,130.0,70.0,,,0,78.0,1935.0,,,2685.0
50631,11363518.0,2,2013-01-24,3.400940e+12,500.0,15000.0,30.0,2,1.0,0.25,...,139.0,70.0,,,1,80.0,1933.0,,,5476.0


In [13]:
ts = dfWithoutDer.set_index(['person_id','contact_date']).sort_index()
ts

Unnamed: 0_level_0,Unnamed: 1_level_0,specialty_label,cip,dosage_1,dose_1,dose_2,product_atc_code,box,quantity,frequency_label,duration,...,Tension Diastolique,Tension Systolique,Glycemie_prescription,HbA1c_prescription,gender_code,Age_presc,year_of_birth,Poids,Taille,wait_time
person_id,contact_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
291.0,2016-10-28,2,3.400950e+12,2.5,75.0,30.0,10,6.0,2.0,0,84.0,...,130.0,60.0,,,1,86.0,1930.0,81.0,179.0,6603.0
291.0,2016-11-04,2,3.400950e+12,2.5,75.0,30.0,10,3.0,1.0,0,84.0,...,142.0,65.0,,,1,86.0,1930.0,81.0,179.0,6610.0
291.0,2016-11-21,2,3.400950e+12,2.5,75.0,30.0,10,3.0,1.0,0,84.0,...,140.0,75.0,,,1,86.0,1930.0,81.0,179.0,6627.0
291.0,2017-01-16,2,3.400950e+12,2.5,75.0,30.0,10,3.0,1.0,0,84.0,...,125.0,70.0,,,1,87.0,1930.0,81.0,179.0,6683.0
291.0,2017-11-20,2,3.400950e+12,2.5,75.0,30.0,10,3.0,1.0,0,84.0,...,115.0,60.0,,,1,87.0,1930.0,81.0,179.0,6991.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34823672.0,2017-04-15,2,3.400950e+12,100.0,9000.0,90.0,8,1.0,1.0,0,84.0,...,140.0,60.0,,,0,73.0,1944.0,72.2,148.0,0.0
34823672.0,2017-06-26,2,3.400950e+12,100.0,9000.0,90.0,8,1.0,1.0,0,56.0,...,130.0,70.0,,,0,73.0,1944.0,72.2,148.0,72.0
34823672.0,2017-08-30,2,3.400950e+12,100.0,9000.0,90.0,8,1.0,1.0,0,56.0,...,130.0,70.0,,,0,73.0,1944.0,72.2,148.0,137.0
34823672.0,2017-09-11,2,3.400950e+12,100.0,9000.0,90.0,8,1.0,1.0,0,56.0,...,140.0,80.0,,,0,73.0,1944.0,72.2,148.0,149.0


# Prédiction

## Création des données d'entraînement/test

In [270]:
yColumNames = ['product_atc_code','wait_time']

xDf = ts.drop(yColumNames,axis='columns')
yDf = ts.loc[:,yColumNames]

xList,yList = [],[]
sliceNumber = 4

for i,_ in ts.groupby('person_id'):
    xList.append(xDf.loc[i].to_numpy().astype('float32')[:sliceNumber].transpose())
    yList.append(yDf.loc[i].to_numpy().astype('float32')[:sliceNumber])

xData = np.array(xList).reshape((len(xList),xList[0].shape[0],sliceNumber))
yData = np.array(yList).reshape((sliceNumber,len(yList),yList[0].shape[1]))

In [271]:
print(f"{xData.shape}, {yData.shape}")

(4227, 21, 4), (4, 4227, 2)


In [272]:
trainUse = int(xData.shape[0] * 80 / 100)
testUse = xData.shape[0] - trainUse

xTrain, xTest, yTrain, yTest = xData[:trainUse],xData[-testUse:],yData[:,:trainUse],yData[:,-testUse:]

#xTrain = xTrain.reshape(xTrain.shape[0],1,xTrain.shape[1],xTrain.shape[2])
#xTest = xTest.reshape(xTest.shape[0],1,xTest.shape[1],xTest.shape[2])

## Création du modèle

In [273]:
print(f"{xTrain.shape}, {yTrain.shape}")

(3381, 21, 4), (4, 3381, 2)


In [286]:
model = models.Sequential()
kernelNumber = 30
model.add(layers.Conv1D(kernelNumber,sliceNumber,activation='relu',input_shape=(xTrain.shape[1],sliceNumber)))
model.add(layers.MaxPool1D(1))
model.add(layers.Flatten())
model.add(layers.Dense(2,activation='softmax'))

In [287]:
model.summary()

Model: "sequential_95"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_22 (Conv1D)           (None, 18, 30)            510       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 18, 30)            0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 540)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 1082      
Total params: 1,592
Trainable params: 1,592
Non-trainable params: 0
_________________________________________________________________


In [292]:
model.compile(optimizer='rmsprop',loss='kullback_leiber_divergence',metrics=['accuracy'])

In [293]:
for i in range(yTrain.shape[0]):
    print(f"{i}/{yTrain.shape[0]}")
    model.fit(xTrain,yTrain[i],validation_data=(xTest,yTest[i]))

0/4


ValueError: in user code:

    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:748 train_step
        loss = self.compiled_loss(
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:187 __call__
        self.build(y_pred)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:140 build
        self._losses = nest.map_structure(self._get_loss_object, self._losses)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/util/nest.py:635 map_structure
        structure[0], [func(*x) for x in entries],
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/util/nest.py:635 <listcomp>
        structure[0], [func(*x) for x in entries],
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:263 _get_loss_object
        loss = losses_mod.get(loss)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1895 get
        return deserialize(identifier)
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1850 deserialize
        return deserialize_keras_object(
    /home/adrien/.local/lib/python3.8/site-packages/tensorflow/python/keras/utils/generic_utils.py:377 deserialize_keras_object
        raise ValueError(

    ValueError: Unknown loss function: kullback_leiber_divergence


In [268]:
model.fit(xTrain,yTrain[i])



<tensorflow.python.keras.callbacks.History at 0x7f6bc03c38e0>