In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show up to 111 rows / columns when a frame is displayed.
# The full option names are spelled out: abbreviated names only work through
# pandas' pattern matching and raise an error as soon as they become ambiguous.
pd.set_option('display.max_rows', 111)
pd.set_option('display.max_columns', 111)
# Load the raw dataset from the Excel file next to the notebook.
data = pd.read_excel('dataset.xlsx')

In [3]:
# Work on a copy so the frame loaded from disk stays available, unchanged, in `data`.
df = data.copy()

In [4]:
# Fraction of missing values per column (mean of the boolean NaN mask).
missing_rate = df.isna().mean()

In [5]:
# Columns are grouped by how often they are missing (strict bounds):
#   88% < rate < 90%  -> blood_columns
#   75% < rate < 80%  -> viral_columns
# NOTE(review): presumably these bands match the blood-panel and viral-test
# measurements of this dataset -- confirm against the column names.
in_blood_band = (missing_rate > 0.88) & (missing_rate < 0.9)
in_viral_band = (missing_rate > 0.75) & (missing_rate < 0.80)
blood_columns = df.columns[in_blood_band].tolist()
viral_columns = df.columns[in_viral_band].tolist()

In [6]:
# Columns that are always kept: the patient age quantile and the
# SARS-Cov-2 exam result (used as the target in `preprocessing`).
key_columns = ['Patient age quantile','SARS-Cov-2 exam result']

In [7]:
# Keep only the key columns plus the blood and viral column groups.
df = df[key_columns + blood_columns + viral_columns]

In [8]:
# Rows and columns remaining after the column selection.
df.shape

(5644, 33)

# Découpage train/test, nettoyage et encodage


In [9]:
from  sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)

In [11]:
# Size of the training partition.
trainset.shape

(4515, 33)

In [12]:
def encodage(df):
    """Encode the textual test results of a frame as 0/1 numbers.

    Every object-dtype column is mapped with
    ``'negative'``/``'not_detected'`` -> 0 and ``'positive'``/``'detected'`` -> 1.
    Values absent from that mapping (including missing values) become NaN.

    The input frame is left untouched: the encoding is applied to a copy, so
    calling this function several times on the same frame always gives the
    same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns hold the labels listed above.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``; non-object columns are returned unchanged.
    """
    code = {'negative': 0,
            'positive': 1,
            'not_detected': 0,
            'detected': 1}

    encoded = df.copy()
    for col in encoded.select_dtypes('object').columns:
        # Plain column assignment (rather than ``.loc[:, col] = ...``) replaces
        # the column, so it takes the numeric dtype of the mapped values
        # instead of staying an object column.
        encoded[col] = encoded[col].map(code)

    return encoded

In [13]:
def feature_engineering(df, viral_cols=None):
    """Replace the individual viral columns by a single ``'est malade'`` flag.

    ``'est malade'`` is True for a row when the sum of its viral columns is at
    least 1. Missing values are skipped by the sum, so a row whose viral
    columns are all missing gets False.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the viral columns, already encoded as numbers.
    viral_cols : list of str, optional
        Columns to aggregate and drop. Defaults to the notebook-level
        ``viral_columns`` list.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the viral columns and with the boolean
        ``'est malade'`` column.
    """
    if viral_cols is None:
        viral_cols = viral_columns
    viral_cols = list(viral_cols)

    is_sick = df[viral_cols].sum(axis=1) >= 1
    # drop() and assign() both return new frames, so the caller's frame never
    # receives the extra column.
    return df.drop(columns=viral_cols).assign(**{'est malade': is_sick})

In [14]:
def imputation(df):
    """Discard every row that still contains a missing value.

    Nothing is filled in: incomplete rows are simply removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain NaN values.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows of ``df`` without any NaN.
    """
    complete_rows = df.dropna(axis=0)
    return complete_rows

In [15]:
def preprocessing(df):
    """Turn a raw partition into model-ready features and target.

    Steps, in order: ``encodage`` -> ``feature_engineering`` -> ``imputation``.
    The ``'SARS-Cov-2 exam result'`` column is then split off as the target.
    The class counts of the target are printed as a quick balance check.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw partition (e.g. ``trainset`` or ``testset``). It is not modified.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except the target.
    y : pandas.Series
        The ``'SARS-Cov-2 exam result'`` column.
    """
    target = 'SARS-Cov-2 exam result'

    # Work on a private copy so the caller's frame is never modified, whatever
    # the individual steps do internally; this keeps the function safe to call
    # again on the same partition.
    df = df.copy()

    df = encodage(df)
    df = feature_engineering(df)
    df = imputation(df)

    X = df.drop(target, axis=1)
    y = df[target]

    print(y.value_counts())

    return X, y

In [16]:
# Apply the same preprocessing to both partitions; each call prints its class counts.
X_train, y_train = preprocessing(trainset)
X_test, y_test = preprocessing(testset)

0    422
1     65
Name: SARS-Cov-2 exam result, dtype: int64
0    95
1    16
Name: SARS-Cov-2 exam result, dtype: int64


In [17]:
# Display the preprocessed training features.
X_train

Unnamed: 0,Patient age quantile,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),est malade
543,18,1.358055,1.356092,-0.228491,-0.438097,1.142196,-0.517481,0.244149,0.275501,-0.223767,0.178175,-0.793368,0.066045,0.987864,-0.005877,False
203,9,1.174947,0.854844,-1.019885,-1.784415,0.824849,-0.193377,-0.851210,-0.317234,-0.223767,-0.083183,1.608576,0.366488,0.042388,-0.448160,True
1624,10,1.037616,1.481403,-1.823841,0.347255,1.142196,0.966573,1.837398,-1.135374,-0.223767,0.387261,-0.835508,-0.454724,-0.377823,-0.978899,False
500,6,-0.038148,0.165628,0.914633,0.122869,-0.515058,1.316264,0.841616,0.044529,0.998070,1.275878,0.133698,1.007433,-0.456613,-0.801985,False
34,17,0.808730,1.042812,-0.278739,1.581381,0.701437,-0.261609,1.040773,0.000005,0.387152,0.439533,0.049419,-0.014074,-0.509139,-0.094334,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3299,19,0.785841,0.666876,1.366858,1.693575,0.119635,2.638266,-0.253742,1.486016,0.081693,0.909977,0.133698,1.147641,-1.008140,-0.801985,False
5251,0,-2.533002,-2.465924,3.037579,-0.774677,-2.348615,3.167067,-0.452899,1.764295,-1.140144,-0.030911,-0.835508,0.206251,-1.008140,-0.005877,True
1071,4,0.694287,0.729532,-0.140559,-1.447836,0.419350,1.145683,0.343725,-0.553771,1.914447,0.439533,0.217977,0.346459,1.303022,-0.713529,True
4484,4,1.541164,1.669372,1.090499,-0.213711,1.829779,-1.182746,0.742040,1.792123,-1.140144,-0.396813,-0.835508,-0.835285,0.042388,0.878687,True


In [18]:
# Put features and target back side by side (column-wise) for each partition.
DatasetTrain = pd.concat(objs=[X_train, y_train], axis='columns')
DatasetTest = pd.concat(objs=[X_test, y_test], axis='columns')

In [19]:
# Write the training set to an Excel file (the row index is not saved).
DatasetTrain.to_excel('DatasetTrain.xlsx', index=False)

In [20]:
# Write the test set to an Excel file (the row index is not saved).
DatasetTest.to_excel('DatasetTest.xlsx', index=False)