In [1]:
# Import libraries
import pandas as pd
import json
import ast
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
train_patients_path = 'data/DDXPlus/release_train_patients.csv'
test_patients_path = 'data/DDXPlus/release_test_patients.csv'
validate_patients_path = 'data/DDXPlus/release_validate_patients.csv'

condition_info_path = 'data/DDXPlus/release_conditions.json'
evidence_info_path = 'data/DDXPlus/release_evidences.json'

In [3]:
train_df = pd.read_csv(train_patients_path)
train_df.shape

(1025602, 6)

In [4]:
with open(condition_info_path, 'r') as file:
    condition_info = json.load(file)

print(condition_info)

{'Pneumothorax spontané': {'condition_name': 'Pneumothorax spontané', 'cond-name-fr': 'Pneumothorax spontané', 'cond-name-eng': 'Spontaneous pneumothorax', 'icd10-id': 'J93', 'symptoms': {'douleurxx_endroitducorps': {}, 'douleurxx': {}, 'douleurxx_irrad': {}, 'douleurxx_carac': {}, 'douleurxx_soudain': {}, 'douleurxx_intens': {}, 'douleurxx_precis': {}, 'dyspn': {}, 'ww_respi': {}, 'ww_effort': {}, 'angor_repos': {}, 'oedeme': {}}, 'antecedents': {'f17.210': {}, 'pneumothorax': {}, 'ap_pneumothorax': {}, 'j44_j42': {}, 'trav1': {}}, 'severity': 2}, 'Céphalée en grappe': {'condition_name': 'Céphalée en grappe', 'cond-name-fr': 'Céphalée en grappe', 'cond-name-eng': 'Cluster headache', 'icd10-id': 'g44.009', 'symptoms': {'douleurxx_endroitducorps': {}, 'douleurxx': {}, 'douleurxx_irrad': {}, 'douleurxx_carac': {}, 'douleurxx_soudain': {}, 'douleurxx_intens': {}, 'douleurxx_precis': {}, 'larmes': {}, 'rhino_clair': {}, 'rds_paralys_gen': {}}, 'antecedents': {'atcd_cluster': {}, 'f10.129':

In [5]:
# Encode SEX
train_df['SEX'] = train_df['SEX'].map({'M': 0, 'F': 1})

# Transform the DIFFERENTIAL_DIAGNOSIS into a multi-output format
# Extract all possible pathologies from the DIFFERENTIAL_DIAGNOSIS column
train_df['DIFFERENTIAL_DIAGNOSIS'] = train_df['DIFFERENTIAL_DIAGNOSIS'].apply(ast.literal_eval)
all_pathologies = list(set(patho for diag in train_df['DIFFERENTIAL_DIAGNOSIS'] for patho, _ in diag))

# Create a DataFrame for the target with one column per pathology, initialized to 0
target_df = pd.DataFrame(0.0, index=train_df.index, columns=all_pathologies)

# Populate the target DataFrame with the probabilities from DIFFERENTIAL_DIAGNOSIS
for idx, diag in enumerate(train_df['DIFFERENTIAL_DIAGNOSIS']):
    for patho, proba in diag:
        target_df.at[idx, patho] = proba

# Drop the original DIFFERENTIAL_DIAGNOSIS column as it's now encoded in target_df
train_df = train_df.drop(['DIFFERENTIAL_DIAGNOSIS'], axis=1)

# Turn string into list
train_df['EVIDENCES_LIST'] = train_df['EVIDENCES'].str.strip('[]').replace("'", "").str.split(', ')

# Get all unique evidence in table
all_evidences = list(set(evidence.replace("'", "") for sublist in train_df['EVIDENCES_LIST'] for evidence in sublist))

# Create a DataFrame with all evidence columns initialized to 0
evidence_df = pd.DataFrame(0, index=train_df.index, columns=all_evidences)

# Populate the DataFrame by setting appropriate evidence to 1 where they exist in the row's list
for evidence in all_evidences:
    evidence_df[evidence] = train_df['EVIDENCES_LIST'].apply(lambda x: 1 if evidence in x else 0)

# Concatenate the evidence DataFrame with the original DataFrame
train_df = pd.concat([train_df.drop(['EVIDENCES_LIST'], axis=1), evidence_df], axis=1)

# Split the data into features (X) and targets (y)
X = train_df.drop(columns=['PATHOLOGY', 'EVIDENCES', 'INITIAL_EVIDENCE'])  # Drop PATHOLOGY as it's the ground truth label, not a feature
y = target_df

In [6]:
train_df.head()

Unnamed: 0,AGE,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,fievre,douleurxx_endroitducorps_@_côté_du_thorax_D_,bw_bending,rx_vasodil,lesions_peau_prurit_@_1,...,oedeme_endroitducorps_@_cheville_D_,wheez,douleurxx_endroitducorps_@_haut_du_thorax,ballon_abdo,c00-d48,trav1_@_AsieSSE,oedeme_endroitducorps_@_joue_D_,lymphoedème,lesions_peau_intens_@_0,dysarthrie
0,18,0,IVRS ou virémie,"['crowd', 'diaph', 'douleurxx', 'douleurxx_car...",fievre,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,VIH (Primo-infection),"['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...",diaph,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,1,Pneumonie,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",expecto,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,34,1,IVRS ou virémie,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36,0,IVRS ou virémie,"['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...",toux,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X.head()
X.to_csv('train_sample.csv')

In [8]:
y.head()
y.to_csv('train_target.csv')