In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# I - Consolidations of datasets

In [2]:
patients = [103001, 111001, 113001, 124001]

df_ml_conso = pd.read_csv('../../datasets/2_dataset_creation_2s/df_ml_{}_norm.csv'.format(patients[0]))
df_ml_conso['patient'] = patients[0]

for patient in patients[1:]:
    df_temp = pd.read_csv('../../datasets/2_dataset_creation_2s/df_ml_{}_norm.csv'.format(patient))
    df_temp['patient'] = patient
    df_ml_conso = pd.concat([df_ml_conso,df_temp], axis=0)

display(df_ml_conso.head())

print('\nPatient count by classification:')
display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient',aggfunc='count'))

print('\nPatient average by classification:')
display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg'], index='patient',aggfunc=np.mean))

Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classif,classif_avg,patient
0,28800001,28802000,0.5,0.77,3.13,11.74,0.51,0.98,1,1.0,103001
1,28802001,28804000,1.0,0.67,3.32,13.03,0.52,0.97,1,1.0,103001
2,28804001,28806000,1.0,0.59,3.77,16.35,0.51,0.97,1,1.0,103001
3,28806001,28808000,0.5,0.0,3.12,12.99,0.53,0.97,1,1.0,103001
4,28808001,28810000,0.67,1.16,3.11,11.62,0.52,0.95,1,1.0,103001



Patient count by classification:


Unnamed: 0_level_0,classif
patient,Unnamed: 1_level_1
103001,1200
111001,45321
113001,1800
124001,2400



Patient average by classification:


Unnamed: 0_level_0,classif,classif_avg
patient,Unnamed: 1_level_1,Unnamed: 2_level_1
103001,0.554167,0.723268
111001,0.348668,0.419445
113001,0.525556,0.661498
124001,0.115,0.286503


# II - Appying threshold of quality

In [3]:
# Setting a threshold : proportion of optimal by observation
classif_threshold = 0.95

df_ml_conso['classif_threshold'] = df_ml_conso['classif_avg'].apply(lambda x: 1 if x >= classif_threshold else 0)

print('\nPatient count:')
display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient',aggfunc='count'))

print('\nPatient average by classification:')
display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg', 'classif_threshold'], index='patient',aggfunc=np.mean))


Patient count:


Unnamed: 0_level_0,classif
patient,Unnamed: 1_level_1
103001,1200
111001,45321
113001,1800
124001,2400



Patient average by classification:


Unnamed: 0_level_0,classif,classif_avg,classif_threshold
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
103001,0.554167,0.723268,0.56
111001,0.348668,0.419445,0.353214
113001,0.525556,0.661498,0.535556
124001,0.115,0.286503,0.127917


# III - Creation of a validation dataset

In [4]:
df_ml_conso_for_model, df_ml_conso_validation = train_test_split(df_ml_conso, test_size=0.2, random_state=42)

In [5]:
df_ml_conso_validation = df_ml_conso_for_model.drop(['patient', 'classif', 'classif_avg'], axis=1)
df_ml_conso_validation.rename(columns={df_ml_conso_validation.columns[-1]: "classification"}, inplace=True)

for element in  [df_ml_conso, df_ml_conso_for_model, df_ml_conso_validation]:
    display(element.head())
    print('Shape : {}'.format(element.shape))

df_ml_conso_validation.to_csv('../../datasets/3_ml_patients_consolidation/df_ml_conso_validation_norm_2s.csv', index=False)

Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classif,classif_avg,patient,classif_threshold
0,28800001,28802000,0.5,0.77,3.13,11.74,0.51,0.98,1,1.0,103001,1
1,28802001,28804000,1.0,0.67,3.32,13.03,0.52,0.97,1,1.0,103001,1
2,28804001,28806000,1.0,0.59,3.77,16.35,0.51,0.97,1,1.0,103001,1
3,28806001,28808000,0.5,0.0,3.12,12.99,0.53,0.97,1,1.0,103001,1
4,28808001,28810000,0.67,1.16,3.11,11.62,0.52,0.95,1,1.0,103001,1


Shape : (50721, 12)


Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classif,classif_avg,patient,classif_threshold
33380,66762076,66764075,1.0,0.0,4.88,28.97,0.43,0.9,1,1.0,111001,1
9669,19338286,19340285,0.5,0.71,3.92,19.29,0.55,0.93,1,1.0,111001,1
7697,15394265,15396264,0.33,0.83,0.07,-1.41,0.47,0.52,0,0.0,111001,0
8111,16222273,16224272,0.57,0.73,-0.11,-0.11,0.48,0.6,0,0.0,111001,0
16857,33714942,33716941,0.33,0.3,-0.6,19.35,0.46,0.84,1,1.0,111001,1


Shape : (40576, 12)


Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classification
33380,66762076,66764075,1.0,0.0,4.88,28.97,0.43,0.9,1
9669,19338286,19340285,0.5,0.71,3.92,19.29,0.55,0.93,1
7697,15394265,15396264,0.33,0.83,0.07,-1.41,0.47,0.52,0
8111,16222273,16224272,0.57,0.73,-0.11,-0.11,0.48,0.6,0
16857,33714942,33716941,0.33,0.3,-0.6,19.35,0.46,0.84,1


Shape : (40576, 9)


# IV - Equalisation of repartition by patient

In [6]:
df_ml_conso_balanced = pd.DataFrame()

for patient in patients:
    df_class1 = df_ml_conso_for_model[(df_ml_conso_for_model['patient'] == patient) & (df_ml_conso_for_model['classif_threshold'] ==1)]
    df_class0 = df_ml_conso_for_model[(df_ml_conso_for_model['patient'] == patient) & (df_ml_conso_for_model['classif_threshold'] ==0)]

    if df_class1.shape[0] >= df_class0.shape[0]:
        df_ml_conso_balanced = pd.concat([df_ml_conso_balanced,
                              df_class0,
                              df_class1.sample(df_class0.shape[0])]
                            )
    else:
        df_ml_conso_balanced = pd.concat([df_ml_conso_balanced,
                        df_class0.sample(df_class1.shape[0]),
                        df_class1]
                    )

print('\nPatient count:')
display(pd.pivot_table(df_ml_conso_balanced, values=['classif'], index='patient',aggfunc='count'))

print('\nPatient average by classification:')
display(pd.pivot_table(df_ml_conso_balanced, values=['classif', 'classif_avg', 'classif_threshold'], index='patient',aggfunc=np.mean))

print('\nProportion of each class')
print(df_ml_conso_balanced['classif_threshold'].value_counts())


Patient count:


Unnamed: 0_level_0,classif
patient,Unnamed: 1_level_1
103001,820
111001,25680
113001,1342
124001,472



Patient average by classification:


Unnamed: 0_level_0,classif,classif_avg,classif_threshold
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
103001,0.493902,0.689179,0.5
111001,0.493341,0.549923,0.5
113001,0.488077,0.636614,0.5
124001,0.45339,0.598689,0.5



Proportion of each class
1    14157
0    14157
Name: classif_threshold, dtype: int64


# V - Export

In [7]:
df_ml_conso_balanced = df_ml_conso_balanced.drop(['patient', 'classif', 'classif_avg'], axis=1)
df_ml_conso_balanced.rename(columns={df_ml_conso_balanced.columns[-1]: "classification"}, inplace=True)
display(df_ml_conso_balanced.head())

Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classification
979,58358143,58360142,0.67,0.59,2.38,10.16,0.52,0.95,0
961,58322141,58324140,0.67,1.09,3.79,17.6,0.5,0.9,0
1185,58770181,58772180,0.33,0.8,1.72,6.64,0.54,0.88,0
90,28980002,28982001,0.5,0.67,2.78,9.1,0.5,0.95,0
682,57764045,57766044,0.8,0.56,4.05,19.19,0.52,0.98,0


In [8]:
df_ml_conso_balanced.to_csv('../../datasets/3_ml_patients_consolidation/df_ml_conso_balanced_norm_2s.csv', index=False)