In [1]:
import pandas as pd
import numpy as np

# I - Consolidation of datasets

In [2]:
# Patients whose per-patient ML datasets are consolidated into a single frame.
patients = [103001, 111001, 113001, 124001]

# Read every per-patient file, tag its rows with the patient id, and
# concatenate once (instead of growing the frame inside the loop).
# Each file keeps its own row index, so index labels repeat across patients.
df_ml_conso = pd.concat(
    [
        pd.read_csv('../../datasets/2_dataset_creation/df_ml_{}.csv'.format(patient))
          .assign(patient=patient)
        for patient in patients
    ],
    axis=0,
)

display(df_ml_conso.head())

# Number of rows per patient.
print('\nPatient count by classification:')
display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient', aggfunc='count'))

# Per-patient mean of the labels; the string 'mean' replaces np.mean, which
# recent pandas versions warn about when passed as aggfunc.
print('\nPatient average by classification:')
display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg'], index='patient', aggfunc='mean'))

Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classif,classif_avg,patient
0,28800001,28809000,0.88,0.63,3.34,13.47,0.52,0.93,1,1.0,103001
1,28809001,28818000,0.93,0.66,3.3,13.21,0.52,0.93,1,1.0,103001
2,28818001,28827000,0.93,0.7,3.3,13.18,0.53,0.94,1,1.0,103001
3,28827001,28836000,1.0,0.54,3.18,12.14,0.52,0.92,1,1.0,103001
4,28836001,28845000,1.0,0.57,3.24,12.76,0.51,0.92,1,1.0,103001



Patient count by classification:


Unnamed: 0_level_0,classif
patient,Unnamed: 1_level_1
103001,267
111001,10071
113001,400
124001,533



Patient average by classification:


Unnamed: 0_level_0,classif,classif_avg
patient,Unnamed: 1_level_1,Unnamed: 2_level_1
103001,0.561798,0.724159
111001,0.348128,0.419475
113001,0.5225,0.661358
124001,0.110694,0.286481


# II - Applying the quality threshold

In [3]:
# Threshold on classif_avg (described by the original author as the
# proportion of "optimal" labels per observation): rows at or above it get 1.
classif_threshold = 0.95

# Vectorised comparison instead of a row-wise apply; the boolean result is
# cast to 0/1 integers.
df_ml_conso['classif_threshold'] = (
    df_ml_conso['classif_avg'] >= classif_threshold
).astype('int64')

# Number of rows per patient.
print('\nPatient count:')
display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient', aggfunc='count'))

# Per-patient mean of each label column; for the 0/1 threshold column this is
# the share of rows in class 1. The string 'mean' replaces np.mean, which
# recent pandas versions warn about when passed as aggfunc.
print('\nPatient average by classification:')
display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg', 'classif_threshold'], index='patient', aggfunc='mean'))


Patient count:


Unnamed: 0_level_0,classif
patient,Unnamed: 1_level_1
103001,267
111001,10071
113001,400
124001,533



Patient average by classification:


Unnamed: 0_level_0,classif,classif_avg,classif_threshold
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
103001,0.561798,0.724159,0.565543
111001,0.348128,0.419475,0.35349
113001,0.5225,0.661358,0.535
124001,0.110694,0.286481,0.129456


# III - Balancing the class distribution by patient

In [4]:
# Fixed seed so the down-sampling, and therefore the exported dataset, is
# reproducible from one run to the next.
RANDOM_STATE = 42

# For every patient, keep as many class-0 rows as class-1 rows by randomly
# down-sampling whichever class is larger. The pieces are collected in a list
# and concatenated once at the end.
balanced_parts = []
for patient in patients:
    df_patient = df_ml_conso[df_ml_conso['patient'] == patient]
    df_class0 = df_patient[df_patient['classif_threshold'] == 0]
    df_class1 = df_patient[df_patient['classif_threshold'] == 1]

    n_per_class = min(len(df_class0), len(df_class1))
    for df_class in (df_class0, df_class1):
        # Only the larger class is sampled; the smaller one is kept whole.
        if len(df_class) > n_per_class:
            df_class = df_class.sample(n_per_class, random_state=RANDOM_STATE)
        balanced_parts.append(df_class)

df_ml_conso_balanced = pd.concat(balanced_parts)

# Number of rows kept per patient.
print('\nPatient count:')
display(pd.pivot_table(df_ml_conso_balanced, values=['classif'], index='patient', aggfunc='count'))

# Per-patient means; classif_threshold should now average 0.5 for each patient.
print('\nPatient average by classification:')
display(pd.pivot_table(df_ml_conso_balanced, values=['classif', 'classif_avg', 'classif_threshold'], index='patient', aggfunc='mean'))

print('\nProportion of each class')
print(df_ml_conso_balanced['classif_threshold'].value_counts())


Patient count:


Unnamed: 0_level_0,classif
patient,Unnamed: 1_level_1
103001,232
111001,7120
113001,372
124001,138



Patient average by classification:


Unnamed: 0_level_0,classif,classif_avg,classif_threshold
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
103001,0.49569,0.682545,0.5
111001,0.492416,0.549158,0.5
113001,0.486559,0.635869,0.5
124001,0.427536,0.585219,0.5



Proportion of each class
1    3931
0    3931
Name: classif_threshold, dtype: int64


# IV - Export

In [5]:
# Drop the columns that are not exported and expose the thresholded label as
# "classification". The label is renamed by its explicit name rather than by
# position, so a change in column order cannot rename the wrong column.
# Note: re-running this cell without re-running the balancing cell raises a
# KeyError, because the dropped columns are already gone.
df_ml_conso_balanced = (
    df_ml_conso_balanced
    .drop(columns=['patient', 'classif', 'classif_avg'])
    .rename(columns={'classif_threshold': 'classification'})
)
display(df_ml_conso_balanced.head())

Unnamed: 0,timestamp_start,timestamp_end,qSQI_score,cSQI_score,sSQI_score,kSQI_score,pSQI_score,basSQI_score,classification
19,28971001,28980001,0.57,0.6,-0.35,6.17,0.5,0.83,0
20,28980002,28989002,0.85,0.59,2.97,10.79,0.51,0.94,0
67,29403003,29412003,0.8,0.53,2.5,9.13,0.53,0.91,0
68,29412004,29421004,0.79,0.62,0.93,1.96,0.51,0.79,0
134,57606009,57615009,0.96,0.57,4.13,18.88,0.52,0.98,0


In [6]:
# Write the balanced dataset to disk; the row index is not written.
export_path = '../../datasets/3_ml_patients_consolidation/df_ml_conso_balanced.csv'
df_ml_conso_balanced.to_csv(export_path, index=False)