In [39]:
import pandas as pd
import os

In [40]:
# Load every raw table from ../data_raw, in the same order as the names on the
# left-hand side. Each name is bound to whatever object its pickle contains.
(
    df_bio,
    df_condition,
    df_dedup_deterministic,
    df_dedup_proba,
    df_note,
    df_person,
    df_visit,
) = (
    pd.read_pickle(f'../data_raw/{table_name}.pkl')
    for table_name in (
        'df_bio',
        'df_condition',
        'df_dedup_deterministic',
        'df_dedup_proba',
        'df_note',
        'df_person',
        'df_visit',
    )
)

In [41]:
# The CDM source is identical for every patient, so the column carries no
# information: check that assumption, then drop it.
assert df_person['cdm_source'].nunique(dropna=False) == 1, "expected a single cdm_source value"
df_person_clean = df_person.drop(columns=['cdm_source'])

# Drop patients without a birth date, or with an implausibly old one
# (inconsistent data). A missing date compares as False and is therefore
# removed by the same mask.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
has_plausible_birth = df_person_clean['birth_datetime'] > '1920-01-01'
df_person_clean = df_person_clean[has_plausible_birth].copy()

# Use a single convention for the gender labels.
df_person_clean['gender_source_value'] = df_person_clean['gender_source_value'].replace({'f': 'female', 'm': 'male'})
assert df_person_clean['gender_source_value'].nunique(dropna=False) == 2, "expected exactly two gender labels"

# Men are excluded from the study.
df_person_clean = df_person_clean[df_person_clean['gender_source_value'] == 'female']
assert df_person_clean['gender_source_value'].nunique(dropna=False) == 1, "expected only 'female' to remain"

# Save the cleaned data in a dedicated directory.
output_dir = '../data_clean'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'df_person.pkl')
df_person_clean.to_pickle(output_path)

In [42]:
# Deterministic deduplication: attach the unique identifier from the
# deterministic matching table to each cleaned patient.
# A LEFT merge is used on purpose: an outer merge would re-add every person_id
# that exists only in the matching table (including patients removed by the
# cleaning step) as rows whose patient attributes are all NaN.
df_person_dedup_det = pd.merge(
    df_person_clean,
    df_dedup_deterministic,
    on='person_id',
    how='left'
)
# Patients with no match are their own unique patient.
df_person_dedup_det['unique_person_id'] = df_person_dedup_det['unique_person_id'].fillna(df_person_dedup_det['person_id'])
# Keep a single row (the first encountered) per unique patient.
df_person_dedup_det = df_person_dedup_det.drop_duplicates(['unique_person_id'], keep='first')

def deduplicate_proba(df_person, df_dedup_proba, score):
    """Deduplicate patients using probabilistic matches above a threshold.

    Parameters
    ----------
    df_person : pandas.DataFrame
        Patient table; must contain a ``person_id`` column.
    df_dedup_proba : pandas.DataFrame
        Matching table; must contain ``person_id``, ``unique_person_id`` and
        ``prob`` columns.
    score : float
        Only matches with ``prob`` strictly greater than this value are used.

    Returns
    -------
    pandas.DataFrame
        One row per ``unique_person_id`` (the first row encountered is kept),
        with the columns of ``df_person`` plus those of the matching table.
        Patients without a retained match get their own ``person_id`` as
        ``unique_person_id``.
    """
    confident_matches = df_dedup_proba[df_dedup_proba['prob'] > score]
    # LEFT merge: an outer merge would re-add person_ids that exist only in the
    # matching table (e.g. patients already filtered out of df_person) as rows
    # whose patient attributes are all NaN.
    df_person_dedup_proba = pd.merge(df_person, confident_matches, on='person_id', how='left')
    # Unmatched patients are their own unique patient.
    df_person_dedup_proba['unique_person_id'] = df_person_dedup_proba['unique_person_id'].fillna(df_person_dedup_proba['person_id'])
    df_person_dedup_proba = df_person_dedup_proba.drop_duplicates(['unique_person_id'], keep='first')
    return df_person_dedup_proba

# Run the probabilistic deduplication at two thresholds (0.90 and 0.20)
# so the resulting patient counts can be compared.
df_person_dedup_proba_90 = deduplicate_proba(
    df_person=df_person_clean,
    df_dedup_proba=df_dedup_proba,
    score=0.90,
)
df_person_dedup_proba_20 = deduplicate_proba(
    df_person=df_person_clean,
    df_dedup_proba=df_dedup_proba,
    score=0.20,
)

In [None]:
# Report the number of distinct patients before deduplication and after each
# deduplication strategy. Each entry pairs a heading with the id column to count.
dedup_summary = (
    ("Original patient data:", df_person_clean['person_id']),
    ('\nDeterministic deduplication:', df_person_dedup_det['unique_person_id']),
    ('\nProbabilistic deduplication (threshold 0.90):', df_person_dedup_proba_90['unique_person_id']),
    ('\nProbabilistic deduplication (threshold 0.20):', df_person_dedup_proba_20['unique_person_id']),
)
for heading, patient_ids in dedup_summary:
    print(heading)
    print(f"Unique patients: {patient_ids.nunique()}")

Original patient data:
Unique patients: 975

Deterministic deduplication:
Unique patients: 967

Probabilistic deduplication (threshold 0.90):
Unique patients: 959

Probabilistic deduplication (threshold 0.20):
Unique patients: 953


In [44]:
# The 0.90-threshold probabilistic result is the deduplicated patient table
# used from here on; it is also written to the clean-data directory.
df_person_dedup = df_person_dedup_proba_90

dedup_person_path = os.path.join(output_dir, 'df_person.pkl')
df_person_dedup.to_pickle(dedup_person_path)

In [45]:
def update_with_unique_id(df_visit, df_person_dedup):
    """Add a ``unique_person_id`` column to a table keyed by ``person_id``.

    Parameters
    ----------
    df_visit : pandas.DataFrame
        Table to annotate; must contain a ``person_id`` column.
    df_person_dedup : pandas.DataFrame
        Deduplicated patient table providing the ``person_id`` ->
        ``unique_person_id`` mapping.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``df_visit`` plus ``unique_person_id``.
        Rows whose ``person_id`` is not in the mapping fall back to their own
        ``person_id``.
    """
    id_mapping = df_person_dedup[['person_id', 'unique_person_id']]
    annotated = df_visit.merge(id_mapping, on='person_id', how='left')
    # No mapping found: the row keeps its own identifier.
    resolved_ids = annotated['unique_person_id'].fillna(annotated['person_id'])
    annotated['unique_person_id'] = resolved_ids
    return annotated

# Attach the deduplicated patient identifier to each patient-level table.
df_visit_dedup = update_with_unique_id(df_visit, df_person_dedup)
# Fix: the condition table must be built from df_condition. It was previously
# built from df_visit, so the saved "condition" table was a copy of the visits.
# Assumes df_condition has a person_id column -- TODO confirm.
df_condition_dedup = update_with_unique_id(df_condition, df_person_dedup)
# NOTE(review): df_person_dedup has already been reduced to one row per
# unique_person_id, so the person_id of a dropped duplicate is absent from the
# mapping. Rows belonging to such a duplicate keep their own person_id instead
# of being re-assigned to the retained patient -- confirm this is intended.

In [46]:
def finalize_person_id(df):
    """Make ``unique_person_id`` the table's ``person_id``.

    The existing ``person_id`` column is removed if there is one, then
    ``unique_person_id`` is renamed to ``person_id``. The input frame is not
    modified; a new frame is returned.
    """
    without_original_id = df.drop(columns='person_id', errors='ignore')
    return without_original_id.rename(columns={'unique_person_id': 'person_id'})

# Replace person_id with the deduplicated identifier in each table...
df_person_final, df_visit_final, df_condition_final = (
    finalize_person_id(dedup_frame)
    for dedup_frame in (df_person_dedup, df_visit_dedup, df_condition_dedup)
)

# ...then write the finalized tables to the clean-data directory.
finalized_tables = (
    ('df_person.pkl', df_person_final),
    ('df_visit.pkl', df_visit_final),
    ('df_condition.pkl', df_condition_final),
)
for pickle_name, final_frame in finalized_tables:
    final_frame.to_pickle(os.path.join(output_dir, pickle_name))

In [47]:
# Restrict every table to the final (cleaned and deduplicated) cohort and save it.
#
# Inputs are the frames already in memory rather than pickles re-read from
# output_dir: df_note.pkl and df_bio.pkl are never written to output_dir earlier
# in this notebook, so reading them from there raises FileNotFoundError on a
# fresh run. The visit and condition frames are the ones just saved above.

# Visits: keep those attached to a retained patient.
df_visit = df_visit_final[df_visit_final['person_id'].isin(df_person_final['person_id'])]
df_visit.to_pickle(os.path.join(output_dir, 'df_visit.pkl'))

# Notes: keep those attached to a retained visit.
df_note = df_note[df_note['visit_occurrence_id'].isin(df_visit['visit_occurrence_id'])]
df_note.to_pickle(os.path.join(output_dir, 'df_note.pkl'))

# Conditions: keep those attached to a retained patient.
df_condition = df_condition_final[df_condition_final['person_id'].isin(df_person_final['person_id'])]
df_condition.to_pickle(os.path.join(output_dir, 'df_condition.pkl'))

# Biology results: keep those attached to a retained visit.
df_bio = df_bio[df_bio['visit_occurrence_id'].isin(df_visit['visit_occurrence_id'])]
df_bio.to_pickle(os.path.join(output_dir, 'df_bio.pkl'))
