In [195]:
import os
import numpy as np

import pandas as pd
from sklearn import metrics

from datetime import datetime

In [196]:
df = pd.read_csv("./avatars_echOvid_train_X.csv")

In [197]:
print(list(df.head()))

['id', 'Nom du centre', 'age', 'Sexe', 'Lieu de provenance du patient', 'Echographiste', 'Tabagisme actif', 'BPCO', 'Asthme', 'Autre antecedent respiratoire', 'Hypertension arterielle', 'Cardiopathie ischemique', 'Cardiopathie rythmique', 'Diabete de type 1', 'Diabete de type 2', 'Diabetes', 'Cancer  hemopathie maligne', 'Demence', 'Statut immunodeprime', "AINS au long cours (dans le cadre d'une pathologie suivie)", 'AINS ponctuel recent (cadre des symptomatologies COVID-19 suspect avere)', 'Tension arterielle systolique (mmHg)', 'Tension arterielle diastolique (mmHg)', 'Frequence cardiaque (puls. min)', 'Frequence respiratoire (resp. min)', 'Temperature (Celsius)', 'Confusion', 'Saturation O2', 'Date de debut de la symptomatologie', 'Zone anterieure droite haut', 'Zone anterieure droite bas', 'Zone anterieure gauche haut', 'Zone anterieure gauche bas', 'Zone posterieure droite haut', 'Zone posterieure droite bas', 'Zone posterieure gauche haut', 'Zone posterieure gauche bas', 'Oxygeno

In [198]:
#feature_cols = []
stade_cols = [
    'Zone anterieure droite bas',
    'Zone anterieure droite haut',
    'Zone anterieure gauche bas',
    'Zone anterieure gauche haut',
    'Zone posterieure droite bas',
    'Zone posterieure droite haut',
    'Zone posterieure gauche bas',
    'Zone posterieure gauche haut',
]

boolean_cols = [
    'Tabagisme actif',
    'BPCO',
    'Asthme',
    'Autre antecedent respiratoire',
    'Hypertension arterielle',
    'Cardiopathie ischemique',
    'Cardiopathie rythmique',
    'Diabete de type 1',
    'Diabete de type 2',
    'Diabetes',
    'Cancer  hemopathie maligne',
    'Demence',
    'Statut immunodeprime',
    "AINS au long cours (dans le cadre d'une pathologie suivie)",
    'AINS ponctuel recent (cadre des symptomatologies COVID-19 suspect avere)',
    'Confusion',
]

int_cols = [
    'Tension arterielle systolique (mmHg)',
    'Tension arterielle diastolique (mmHg)',
    'Frequence cardiaque (puls. min)',
    'Frequence respiratoire (resp. min)',
    'Temperature (Celsius)',
    'Saturation O2',
]

unique_cols = [
    # 'Nom du centre',
    'age',
    'Sexe',
    'Lieu de provenance du patient',
    'Echographiste',
    'Date de debut de la symptomatologie',
    'Oxygenotherapie'
]

feature_cols = stade_cols + boolean_cols + unique_cols + int_cols


In [199]:


def dt64_to_float(dt64):
  if type(dt64) is float and np.isnan(dt64):
    return .0 # np.nan
  try:
    time = datetime.strptime(dt64, '%Y-%m-%d')
    return time.timestamp()
  except Exception as e:
    print(dt64)
    print(e)
  return dt64

def convert(x):
  if x == 'Oui':
    return 1
  return 0

def pre_processing(df):
  dataFrame = df.copy()
  # Change date from object to float
  dataFrame['Date de debut de la symptomatologie'] = dataFrame['Date de debut de la symptomatologie'].map(dt64_to_float).to_numpy()

  # Change option to int
  dataFrame['Oxygenotherapie'] = dataFrame['Oxygenotherapie'].map({"Air ambiant": 1, "Moderee": 2, "Assistance respiratoire": 3, np.nan:0}).to_numpy()

  # Change string to int
  dataFrame['Sexe'] = dataFrame['Sexe'].map({"Masculin": 0, "Feminin": 1}).to_numpy()

  # Change string to int
  dataFrame['Lieu de provenance du patient'] = dataFrame['Lieu de provenance du patient'].map({"Domicile": 0, "EHPAD": 1, "Hopital": 2, "Autre": 3, np.nan:4}).to_numpy()

  # Change option to int
  dataFrame['Echographiste'] = dataFrame['Echographiste'].map({"Forme pour l'epidemie": 1, "Experience d'echographie": 2, "Expert": 3, np.nan:0}).to_numpy()


  dataFrame[boolean_cols] = dataFrame[boolean_cols].apply(lambda value: value.map(lambda x: 1 if x == 'Oui' else 0))

  # Change data from string "Stade X" to int X
  dataFrame[stade_cols] = dataFrame[stade_cols].apply(lambda value: value.map(lambda x: int(x[-1]) if type(x) is str else x))

  return dataFrame

data = pre_processing(df)


In [201]:
X =data[feature_cols]
display(X)


y = data['Outcome J28']

Unnamed: 0,Zone anterieure droite bas,Zone anterieure droite haut,Zone anterieure gauche bas,Zone anterieure gauche haut,Zone posterieure droite bas,Zone posterieure droite haut,Zone posterieure gauche bas,Zone posterieure gauche haut,Tabagisme actif,BPCO,...,Lieu de provenance du patient,Echographiste,Date de debut de la symptomatologie,Oxygenotherapie,Tension arterielle systolique (mmHg),Tension arterielle diastolique (mmHg),Frequence cardiaque (puls. min),Frequence respiratoire (resp. min),Temperature (Celsius),Saturation O2
0,2,2,2,1,2,2,2,2,0,0,...,0,1,1.587161e+09,0,164.0,64.0,80.0,35.0,37.7,79.0
1,1,0,0,0,0,0,0,0,0,0,...,0,2,1.585951e+09,2,147.0,91.0,97.0,15.0,37.0,99.0
2,1,0,0,1,2,1,2,1,0,0,...,1,2,0.000000e+00,3,119.0,72.0,89.0,27.0,37.2,97.0
3,0,0,0,0,0,0,0,0,1,0,...,2,2,0.000000e+00,0,145.0,78.0,93.0,,35.8,100.0
4,0,0,0,0,0,0,0,0,0,0,...,0,2,0.000000e+00,0,126.0,80.0,89.0,16.0,37.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0,0,0,0,0,0,0,0,1,0,...,0,1,1.584313e+09,0,108.0,73.0,75.0,20.0,38.2,99.0
258,1,1,0,0,1,1,1,0,0,0,...,0,2,1.585264e+09,3,138.0,84.0,105.0,35.0,37.2,97.0
259,0,0,0,0,0,0,0,0,0,0,...,0,3,1.584745e+09,0,119.0,71.0,75.0,21.0,37.0,99.0
260,0,0,0,0,0,0,0,0,0,0,...,0,3,1.585606e+09,0,114.0,68.0,73.0,22.0,36.9,99.0


In [202]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [203]:
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.impute import SimpleImputer, MissingIndicator

transformer = FeatureUnion(
     transformer_list=[
         ('features', SimpleImputer(strategy='mean')),
         ('indicators', MissingIndicator())])

transformer = transformer.fit(X_train, y_train)

In [204]:
# train a logistic regression model on the training set
from sklearn.ensemble import RandomForestClassifier

clf = make_pipeline(transformer, RandomForestClassifier())

# fit model
clf.fit(X_train, y_train)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('features', SimpleImputer()),
                                                ('indicators',
                                                 MissingIndicator())])),
                ('randomforestclassifier', RandomForestClassifier())])

In [205]:
# make class predictions for the testing set
y_pred_class = clf.predict(X_test)

In [206]:
print(metrics.accuracy_score(y_test, y_pred_class))

0.8787878787878788


In [207]:
y_test.value_counts()


Back home              39
Hospitalization        23
Death                   2
Intensive care unit     2
Name: Outcome J28, dtype: int64

In [208]:
print(metrics.confusion_matrix(y_test, y_pred_class))


[[35  0  4  0]
 [ 0  0  2  0]
 [ 1  0 22  0]
 [ 0  0  1  1]]


In [209]:
y_test

194          Back home
153    Hospitalization
145          Back home
8            Back home
111          Back home
            ...       
184          Back home
15     Hospitalization
178          Back home
96           Back home
18               Death
Name: Outcome J28, Length: 66, dtype: object

In [211]:
test_data = pd.read_csv("./avatars_echOvid_test_X.csv")

pre_processing(test_data)

y_pred_class = clf.predict(X_test)

result_dataFrame =pd.DataFrame({'id':test_data['id'],'prediction':y_pred_class})
result_dataFrame = result_dataFrame.set_index('id')

display(result_dataFrame)
result_dataFrame.to_csv("./result.csv", sep =';')


Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
237,Back home
236,Hospitalization
253,Back home
92,Back home
53,Back home
...,...
151,Back home
39,Hospitalization
313,Back home
134,Back home
