In [80]:
import pandas as pd
import os

In [81]:
df_bio = pd.read_pickle('../data_raw/df_bio.pkl')
df_condition = pd.read_pickle('../data_raw/df_condition.pkl')
df_dedup_deterministic = pd.read_pickle('../data_raw/df_dedup_deterministic.pkl')
df_dedup_proba = pd.read_pickle('../data_raw/df_dedup_proba.pkl')
df_note = pd.read_pickle('../data_raw/df_note.pkl')
df_person = pd.read_pickle('../data_raw/df_person.pkl')
df_visit = pd.read_pickle('../data_raw/df_visit.pkl')

**df_person**

In [82]:
# Le CDM est identique pour tous les patients, on le supprime
assert(len(df_person['cdm_source'].unique()) == 1)
df_person_clean = df_person.drop(columns=['cdm_source'])

# On supprime les patients sans date de naissance, ou avec une date de naissance trop ancienne (incoherent)
df_person_clean = df_person_clean[df_person_clean['birth_datetime'] > '1920-01-01']

# On adopte une unique convention pour le genre
df_person_clean['gender_source_value'] = df_person_clean['gender_source_value'].replace({'f': 'female', 'm': 'male'})
assert(len(df_person_clean.gender_source_value.unique()) == 2)

# On supprime les hommes de l'étude
df_person_clean = df_person_clean[df_person_clean['gender_source_value'] == 'female']
assert(len(df_person_clean.gender_source_value.unique()) == 1)

# On sauvegarde les données nettoyées dans un répertoire spécifique
output_dir = '../data_clean'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'df_person.pkl')
df_person_clean.to_pickle(output_path)

**df_bio**

In [83]:
df_bio_clean = df_bio

output_path = os.path.join(output_dir, 'df_bio.pkl')
df_bio_clean.to_pickle(output_path)

**df_visit**

In [84]:
df_visit_clean = df_visit[df_visit['person_id'].isin(df_person_clean['person_id'])]

output_path = os.path.join(output_dir, 'df_visit.pkl')
df_visit_clean.to_pickle(output_path)

**df_note**

In [85]:
df_note_clean = df_note.drop(columns=['cdm_source'])
df_note_clean = df_note_clean[df_note_clean['visit_occurrence_id'].isin(df_visit_clean['visit_occurrence_id'])]

output_path = os.path.join(output_dir, 'df_note.pkl')
df_note_clean.to_pickle(output_path)


In [86]:
df_condition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   visit_occurrence_id      2152 non-null   float64
 1   person_id                2152 non-null   float64
 2   condition_occurrence_id  2152 non-null   float64
 3   condition_source_value   2152 non-null   object 
dtypes: float64(3), object(1)
memory usage: 67.4+ KB


In [87]:
df_condition_clean = df_condition[df_condition['person_id'].isin(df_person_clean['person_id'])]

output_path = os.path.join(output_dir, 'df_condition.pkl')
df_condition_clean.to_pickle(output_path)

In [88]:
df_person_dedup_det = pd.merge(
    df_person_clean,
    df_dedup_deterministic,
    on='person_id',
    how='outer'
)
df_person_dedup_det['unique_person_id'] = df_person_dedup_det['unique_person_id'].fillna(df_person_dedup_det['person_id'])
df_person_dedup_det = df_person_dedup_det.drop_duplicates(['unique_person_id'], keep='first')

def deduplicate_proba(df_person, df_dedup_proba, score):
    df_dedup_proba_score = df_dedup_proba[df_dedup_proba['prob'] > score]
    df_person_dedup_proba = pd.merge(df_person, df_dedup_proba_score, on='person_id', how='outer')
    df_person_dedup_proba['unique_person_id'] = df_person_dedup_proba['unique_person_id'].fillna(df_person_dedup_proba['person_id'])
    df_person_dedup_proba = df_person_dedup_proba.drop_duplicates(['unique_person_id'], keep='first')
    return df_person_dedup_proba

df_person_dedup_proba_90 = deduplicate_proba(df_person_clean, df_dedup_proba, score=0.90)
df_person_dedup_proba_20 = deduplicate_proba(df_person_clean, df_dedup_proba, score=0.20)

In [89]:
import os

print("Original patient data:")
print(f"Unique patients: {df_person_clean['person_id'].nunique()}")

print('\nDeterministic deduplication:')
print(f"Unique patients: {df_person_dedup_det['unique_person_id'].nunique()}")

print('\nProbabilistic deduplication (threshold 0.90):')
print(f"Unique patients: {df_person_dedup_proba_90['unique_person_id'].nunique()}")

print('\nProbabilistic deduplication (threshold 0.20):')
print(f"Unique patients: {df_person_dedup_proba_20['unique_person_id'].nunique()}")

Original patient data:
Unique patients: 975

Deterministic deduplication:
Unique patients: 967

Probabilistic deduplication (threshold 0.90):
Unique patients: 959

Probabilistic deduplication (threshold 0.20):
Unique patients: 953


In [101]:
df_person_dedup_proba_90.to_pickle(os.path.join(output_dir, 'df_person.pkl'))

df_person_dedup = df_person_dedup_proba_90

df_bio_clean.info()
df_person_dedup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6188 entries, 0 to 6187
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   measurement_id        6188 non-null   float64       
 1   visit_occurrence_id   6188 non-null   float64       
 2   measurement_datetime  6188 non-null   datetime64[ns]
 3   concept_source_value  6188 non-null   object        
 4   transformed_value     6188 non-null   float64       
 5   transformed_unit      6188 non-null   object        
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 290.2+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 959 entries, 0 to 993
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   birth_datetime       940 non-null    datetime64[ns]
 1   death_datetime       549 non-null    datetime64[ns]
 2   gender_source_value  940 

In [None]:
def update_with_unique_id(df_visit, df_person_dedup):
    df_updated = pd.merge(
        df_visit,
        df_person_dedup[['person_id', 'unique_person_id']],
        on='person_id',
        how='left'
    )

    df_updated['unique_person_id'] = df_updated['unique_person_id'].fillna(df_updated['person_id'])
    return df_updated

df_visit_dedup = update_with_unique_id(df_visit_clean, df_person_dedup)
df_condition_dedup = update_with_unique_id(df_condition_clean, df_person_dedup)


In [None]:
def finalize_person_id(df):
    df = df.drop(columns=['person_id'], errors='ignore')
    df = df.rename(columns={'unique_person_id': 'person_id'})
    return df

df_person_final = finalize_person_id(df_person_dedup)
df_visit_final = finalize_person_id(df_visit_dedup)
df_condition_final = finalize_person_id(df_condition_dedup)

df_person_final.to_pickle(os.path.join(output_dir, 'df_person.pkl'))
df_visit_final.to_pickle(os.path.join(output_dir, 'df_visit.pkl'))
df_condition_final.to_pickle(os.path.join(output_dir, 'df_condition.pkl'))