In [11]:
# Import libraries
import pandas as pd
import json
import ast
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [12]:
# Reference metadata files (evidence and condition descriptions).
evidence_info_path = 'data/DDXPlus/release_evidences.json'
condition_info_path = 'data/DDXPlus/release_conditions.json'

# Raw patient tables, one CSV per split.
(
    train_patients_path,
    validate_patients_path,
    test_patients_path,
) = (
    'data/DDXPlus/release_train_patients.csv',
    'data/DDXPlus/release_validate_patients.csv',
    'data/DDXPlus/release_test_patients.csv',
)

# Processed outputs, three CSVs per split: sample, target, differential diagnosis.
train_processed_sample, train_processed_target, train_processed_diff = (
    'data/processed/train_processed_sample.csv',
    'data/processed/train_processed_target.csv',
    'data/processed/train_processed_differential_diagnosis.csv',
)

validate_processed_sample, validate_processed_target, validate_processed_diff = (
    'data/processed/validate_processed_sample.csv',
    'data/processed/validate_processed_target.csv',
    'data/processed/validate_processed_differential_diagnosis.csv',
)

test_processed_sample, test_processed_target, test_processed_diff = (
    'data/processed/test_processed_sample.csv',
    'data/processed/test_processed_target.csv',
    'data/processed/test_processed_differential_diagnosis.csv',
)

In [13]:
# Load the evidence and condition metadata. JSON text is UTF-8, so the encoding
# is passed explicitly instead of relying on the platform's locale default
# (which is not UTF-8 on every system).
with open(evidence_info_path, 'r', encoding='utf-8') as json_file:
    evidence_info = json.load(json_file)

with open(condition_info_path, 'r', encoding='utf-8') as json_file:
    condition_info = json.load(json_file)

# NOTE(review): within this notebook the file below is only written by
# extract_target() when it is called with an empty encoding. On a fresh
# checkout where the file does not exist yet this cell raises
# FileNotFoundError - confirm how the encoding is meant to be bootstrapped.
with open('data/encoding/pathology_encoding.json', 'r', encoding='utf-8') as json_file:
    pathology_encoding = json.load(json_file)

# Invert the stored {key: value} mapping into {value: int(key)}; the int()
# turns the JSON object keys (always strings) back into integers.
pathology_inverse_encoding = {v: int(k) for k, v in pathology_encoding.items()}

In [14]:
def extract_diff(df, diff_ouput, pathologies=None):
    """Write each row's differential diagnosis as one probability column per pathology.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'DIFFERENTIAL_DIAGNOSIS' column whose cells are the
        string form of a list of ``[pathology, probability]`` pairs
        (parsed with ``ast.literal_eval``).
    diff_ouput : str or path-like
        Destination CSV file. It is written without the index column.
    pathologies : iterable of str, optional
        Column names (and their order) of the output. Defaults to the keys of
        the module-level ``condition_info`` mapping.

    Notes
    -----
    * Pathologies absent from a row's list get probability 0.0.
    * Rows are aligned by position with ``df``, so the result is correct for
      any index (the previous implementation addressed rows with a positional
      counter and therefore wrote to wrong/new rows on a non-default index).
    * A pathology that appears in the data but not in ``pathologies`` is kept
      as an extra column appended after the expected ones.
    """
    if pathologies is None:
        pathologies = condition_info.keys()
    all_pathologies = list(pathologies)
    known = set(all_pathologies)

    # One {pathology: probability} dict per row; if a pathology is listed
    # twice in a row, the last value wins (same as sequential assignment).
    parsed = df['DIFFERENTIAL_DIAGNOSIS'].apply(ast.literal_eval)
    records = [dict(pairs) for pairs in parsed]

    # Building the frame from records in one go avoids one scalar `.at`
    # assignment per (row, pathology) pair.
    diffrential_diagnosis = pd.DataFrame(records, index=df.index)
    unexpected = [name for name in diffrential_diagnosis.columns if name not in known]
    diffrential_diagnosis = (
        diffrential_diagnosis
        .reindex(columns=all_pathologies + unexpected)
        .fillna(0.0)
        .astype(float)
    )

    diffrential_diagnosis.to_csv(diff_ouput, index=False)

def extract_sample(df, sample_output, evidences=None):
    """Write the model input features: binary SEX plus one 0/1 column per evidence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PATHOLOGY', 'INITIAL_EVIDENCE',
        'DIFFERENTIAL_DIAGNOSIS' (dropped), 'SEX' and 'EVIDENCES'. Each
        'EVIDENCES' cell is the string form of a list of quoted tokens,
        e.g. ``"['a', 'b']"``.
    sample_output : str or path-like
        Destination CSV file. It is written without the index column.
    evidences : iterable of str, optional
        Names (and order) of the evidence indicator columns. Defaults to the
        keys of the module-level ``evidence_info`` mapping.

    Notes
    -----
    The output holds the remaining columns of ``df`` (everything except the
    dropped ones and 'EVIDENCES') followed by the evidence indicators.
    """
    if evidences is None:
        evidences = evidence_info.keys()
    all_evidences = list(evidences)

    features = df.drop(['PATHOLOGY', 'INITIAL_EVIDENCE', 'DIFFERENTIAL_DIAGNOSIS'], axis=1)

    # Make SEX binary. Both the spelled-out and the single-letter forms are
    # accepted; any other value maps to NaN.
    features['SEX'] = features['SEX'].map({'Male': 0, 'Female': 1, 'M': 0, 'F': 1})

    # Turn "['a', 'b']" into {'a', 'b'}. The quotes must be removed with the
    # element-wise `.str.replace`: `Series.replace` only swaps cells whose
    # WHOLE value equals "'", which left every token quoted and therefore
    # every indicator at 0. Missing cells are treated as "no evidence".
    reported = (
        features['EVIDENCES']
        .fillna('')
        .str.strip('[]')
        .str.replace("'", "", regex=False)
        .str.split(', ')
        .map(set)
    )

    # Create a DataFrame with all evidence columns initialized to 0
    X = pd.DataFrame(0, index=features.index, columns=all_evidences)

    # Membership is exact: a column is set only when a token equals the
    # evidence name.
    # NOTE(review): if tokens can carry a value suffix after the evidence
    # name, they will not match here - confirm whether they should.
    for evidence in all_evidences:
        X[evidence] = reported.map(lambda tokens: int(evidence in tokens)).astype(int)

    # Concatenate the evidence DataFrame with the remaining original columns
    X = pd.concat([features.drop(['EVIDENCES'], axis=1), X], axis=1)

    X.to_csv(sample_output, index=False)

def extract_target(df, target_output, encoding=None,
                   encoding_path='data/encoding/pathology_encoding.json'):
    """Write the integer-encoded 'PATHOLOGY' column of ``df`` to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'PATHOLOGY' column.
    target_output : str or path-like
        Destination CSV file (single 'PATHOLOGY' column, no index).
    encoding : dict, optional
        Mapping ``{pathology name: class index}`` used to encode the column.
        When omitted or empty, a ``LabelEncoder`` is fitted on ``df`` instead
        and the fitted ``{class index: pathology name}`` mapping is saved as
        JSON to ``encoding_path``.
    encoding_path : str or path-like, optional
        Where the fitted mapping is written. Only used when fitting.

    Notes
    -----
    * The default used to be a shared mutable ``{}`` that the fitting branch
      filled in place, so a second call relying on the default silently
      switched to the lookup branch with an index->name dict and produced
      only NaN. The default is now ``None``.
    * An empty dict passed explicitly by the caller is still filled in place
      with ``{class index: pathology name}``, as before.
    * With a non-empty ``encoding``, pathologies missing from it become NaN.
    """
    if encoding is None or encoding == {}:
        label_encoder = LabelEncoder()
        codes = pd.Series(label_encoder.fit_transform(df['PATHOLOGY']), index=df.index)

        # classes_[i] is the label encoded as i; tolist() yields plain Python
        # objects for the JSON dump.
        fitted = dict(enumerate(label_encoder.classes_.tolist()))
        if encoding is not None:
            encoding.update(fitted)
        with open(encoding_path, 'w') as file:
            json.dump(fitted, file)
    else:
        codes = df['PATHOLOGY'].map(encoding)

    codes.to_frame('PATHOLOGY').to_csv(target_output, index=False)


In [15]:
# Training split: load the raw patients once, write the three processed CSVs,
# then drop the frame so its memory is released before the next split.
train_df = pd.read_csv(train_patients_path)
extract_target(
    df=train_df,
    target_output=train_processed_target,
    encoding=pathology_inverse_encoding,
)
extract_sample(df=train_df, sample_output=train_processed_sample)
extract_diff(df=train_df, diff_ouput=train_processed_diff)
del train_df

In [16]:
# Test split: load the raw patients once, write the three processed CSVs,
# then drop the frame so its memory is released before the next split.
test_df = pd.read_csv(test_patients_path)
extract_target(
    df=test_df,
    target_output=test_processed_target,
    encoding=pathology_inverse_encoding,
)
extract_sample(df=test_df, sample_output=test_processed_sample)
extract_diff(df=test_df, diff_ouput=test_processed_diff)
del test_df

In [17]:
# Validation split: load the raw patients once, write the three processed
# CSVs, then drop the frame so its memory is released.
validate_df = pd.read_csv(validate_patients_path)
extract_target(
    df=validate_df,
    target_output=validate_processed_target,
    encoding=pathology_inverse_encoding,
)
extract_sample(df=validate_df, sample_output=validate_processed_sample)
extract_diff(df=validate_df, diff_ouput=validate_processed_diff)
del validate_df