In [1]:
# Import libraries
import pandas as pd
import json
import ast
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
train_patients_path = 'data/DDXPlus/release_train_patients.csv'
test_patients_path = 'data/DDXPlus/release_test_patients.csv'
validate_patients_path = 'data/DDXPlus/release_validate_patients.csv'

condition_info_path = 'data/DDXPlus/release_conditions.json'
evidence_info_path = 'data/DDXPlus/release_evidences.json'

train_processed_sample = 'data/processed/train_processed_sample1.csv'
train_processed_target = 'data/processed/train_processed_target.csv'
train_processed_diff = 'data/processed/train_processed_differential_diagnosis.csv'

test_processed_sample = 'data/processed/test_processed_sample1.csv'
test_processed_target = 'data/processed/test_processed_target.csv'
test_processed_diff = 'data/processed/test_processed_differential_diagnosis.csv'

validate_processed_sample = 'data/processed/validate_processed_sample1.csv'
validate_processed_target = 'data/processed/validate_processed_target.csv'
validate_processed_diff = 'data/processed/validate_processed_differential_diagnosis.csv'

In [3]:
with open(evidence_info_path, 'r') as json_file:
    evidence_info = json.load(json_file)

with open(condition_info_path, 'r') as json_file:
    condition_info = json.load(json_file)

with open('data/encoding/pathology_encoding.json', 'r') as json_file:
    pathology_encoding = json.load(json_file)
pathology_inverse_encoding = {v: int(k) for k, v in pathology_encoding.items()}

In [4]:
list_of_evidence = []
for evidence, info in evidence_info.items():
    if len(info["possible-values"]) == 0:
        evidence_name = evidence
        list_of_evidence.append(evidence_name)
    else:
        for value in info["possible-values"]:
            print(evidence, value)
            evidence_name = evidence + "_@_" + str(value)
            list_of_evidence.append(evidence_name)

douleurxx_endroitducorps nulle_part
douleurxx_endroitducorps aile_iliaque_D_
douleurxx_endroitducorps aile_iliaque_G_
douleurxx_endroitducorps aine_D_
douleurxx_endroitducorps aine_G_
douleurxx_endroitducorps aisselle_D_
douleurxx_endroitducorps aisselle_G_
douleurxx_endroitducorps amygdale_D_
douleurxx_endroitducorps amygdale_G_
douleurxx_endroitducorps anus
douleurxx_endroitducorps arrière_de_la_cheville_D_
douleurxx_endroitducorps arrière_de_la_cheville_G_
douleurxx_endroitducorps arrière_de_tête
douleurxx_endroitducorps arrière_du_cou
douleurxx_endroitducorps avant-bras_D_
douleurxx_endroitducorps avant-bras_G_
douleurxx_endroitducorps bas_du_thorax
douleurxx_endroitducorps biceps_D_
douleurxx_endroitducorps biceps_G_
douleurxx_endroitducorps bouche
douleurxx_endroitducorps cartilage_thyroidien
douleurxx_endroitducorps cheville_D_
douleurxx_endroitducorps cheville_G_
douleurxx_endroitducorps clitoris
douleurxx_endroitducorps coccyx
douleurxx_endroitducorps colonne_cervicale
douleur

In [5]:
def extract_diff(df_path, diff_ouput):
    df = pd.read_csv(df_path)
    df = df.drop(['AGE', 'SEX', 'EVIDENCES', 'PATHOLOGY', 'INITIAL_EVIDENCE'], axis=1)

    # Transform the DIFFERENTIAL_DIAGNOSIS into a multi-output format
    # Extract all possible pathologies from the DIFFERENTIAL_DIAGNOSIS column
    df['DIFFERENTIAL_DIAGNOSIS'] = df['DIFFERENTIAL_DIAGNOSIS'].apply(ast.literal_eval)
    all_pathologies = list(condition_info.keys())

    # Create a DataFrame for the target with one column per pathology, initialized to 0
    diffrential_diagnosis = pd.DataFrame(0.0, index=df.index, columns=range(49))

    # Populate the target DataFrame with the probabilities from DIFFERENTIAL_DIAGNOSIS
    for idx, diag in enumerate(df['DIFFERENTIAL_DIAGNOSIS']):
        for patho, proba in diag:
            diffrential_diagnosis.at[idx, pathology_inverse_encoding[patho]] = proba

    return diffrential_diagnosis

def extract_sample(df_path, sample_output):
    df = pd.read_csv(df_path)
    df = df.drop(['PATHOLOGY', 'INITIAL_EVIDENCE', 'DIFFERENTIAL_DIAGNOSIS'], axis=1)

    # Make SEX binary
    df['SEX'] = df['SEX'].map({'M': 0, 'F': 1})
    
    # Create a DataFrame with all evidence columns initialized to 0
    X = pd.DataFrame(0, index=df.index, columns=list_of_evidence)

    # Populate the DataFrame by setting appropriate evidence to 1 where they exist in the row's list
    for evidence in list_of_evidence:
        X[evidence] = df['EVIDENCES'].apply(lambda x: 1 if evidence in x else 0)

    # Concatenate the evidence DataFrame with the original DataFrame
    X = pd.concat([df.drop(['EVIDENCES'], axis=1), X], axis=1)

    return X

def extract_target(df_path, target_output, encoding={}):
    df = pd.read_csv(df_path)
    if encoding == {}:
        y = pd.DataFrame(0, index=df.index, columns=['PATHOLOGY'])
        label_encoder = LabelEncoder()
        y['PATHOLOGY'] = label_encoder.fit_transform(df['PATHOLOGY'])
        y.to_csv(target_output, index=False)

        number_of_classes = len(label_encoder.classes_)
        for i in range(number_of_classes):
            encoding[i] = label_encoder.inverse_transform([i])[0]
        with open('data/encoding/pathology_encoding.json', 'w') as file:
            json.dump(encoding, file)
    else:
        y = pd.DataFrame(0, index=df.index, columns=['PATHOLOGY'])
        print(y)
        y['PATHOLOGY'] = df['PATHOLOGY'].map(encoding)
        return y

def 

In [6]:
# extract_target(train_patients_path, train_processed_target, pathology_inverse_encoding)
# extract_sample(train_patients_path, train_processed_sample)
extract_diff(train_patients_path, train_processed_diff)

In [7]:
# extract_target(test_patients_path, test_processed_target, pathology_inverse_encoding)
# extract_sample(test_patients_path, test_processed_sample)
extract_diff(test_patients_path, test_processed_diff)

In [8]:
# extract_target(validate_patients_path, validate_processed_target, pathology_inverse_encoding)
# extract_sample(validate_patients_path, validate_processed_sample)
extract_diff(validate_patients_path, validate_processed_diff)