## Transform Mini-MUSIC data to the same shape as the DNN input

In [1]:
import pandas as pd

from utils.data import load_and_preprocess_data

In [2]:
# Load the raw Mini-MUSIC CSV export; the path is relative to the working directory.
df = pd.read_csv("raw_data/mini_music_main_2025-02-06.csv")

In [3]:
# Sanity check: number of distinct study_id values in the raw export.
print(f"Number of patients: {df['study_id'].nunique()}")


Number of patients: 53


In [4]:
# Columns retained from the raw export; every other column is discarded by the
# selection at the end of this cell.
columns_to_keep = [
    "study_id",
    "redcap_event_name",
    "study_group",
    "date_of_diagnosis",
    "age",
    "sex",
    "height",
    "weight",
    "bmi",
    "sampling_asa",
    "sampling_steroids_oral",
    "sampling_steroids_iv",
    "sampling_steroids_topical",
    "sampling_imm",
    "sampling_mtx",
    "sampling_ifx",
    "sampling_ada",
    "sampling_vedo",
    "sampling_uste",
    "albumin",
    "crp",
    "haemoglobin",
    "white_cell_count",
    "neutrophils",
    "lymphocytes",
    "monocytes",
    "eosinophils",
    "basophils",
    "platelets",
    "urea",
    "creatinine",
    "sodium",
    "potassium",
    "calprotectin",
    "cdparis_upper_gi",
    "cdparis_behaviour",
    "cdparis_location",
    "cdparis_perianal",
    "ucparis_severity",
    "ucparis_extent",
    "sample_date",
    "fatigue_tscore",  # fatigue outcome = 1 if tscore >50
]

# Selecting with a list raises KeyError if any listed column is absent from the export.
df = df[columns_to_keep]

In [5]:
# Attributes that are read from each patient's timepoint_1 row and copied onto
# all of that patient's other rows.
columns_to_fill = [
    "study_group",
    "date_of_diagnosis",
    "age",
    "sex",
    "cdparis_upper_gi",
    "cdparis_behaviour",
    "cdparis_location",
    "cdparis_perianal",
    "ucparis_severity",
    "ucparis_extent",
]

# Lookup table of timepoint_1 values: study_id plus one "tp1_"-prefixed column
# per attribute.
is_tp1 = df["redcap_event_name"] == "timepoint_1"
tp1_info = df.loc[is_tp1, ["study_id", *columns_to_fill]].rename(
    columns={col: f"tp1_{col}" for col in columns_to_fill}
)

# Left join on study_id so every row carries its patient's timepoint_1 values.
# Patients without a timepoint_1 row get NaN in the tp1_ columns.
df = df.merge(tp1_info, how="left", on="study_id")

# Overwrite the attributes on every row that is not itself timepoint_1.
mask = df["redcap_event_name"] != "timepoint_1"
tp1_columns = [f"tp1_{col}" for col in columns_to_fill]
for col, tp1_col in zip(columns_to_fill, tp1_columns):
    df.loc[mask, col] = df.loc[mask, tp1_col]

# The helper tp1_ columns have served their purpose.
df.drop(columns=tp1_columns, inplace=True)


In [6]:
# Calendar year of diagnosis; missing or unparseable dates are coerced to NaT and yield NaN.
df["diagnosis_year"] = pd.to_datetime(df["date_of_diagnosis"], errors="coerce").dt.year

In [7]:
# Age at diagnosis = age minus the years elapsed between diagnosis and sampling.
# Both dates are parsed with errors="coerce", matching how date_of_diagnosis is
# parsed for diagnosis_year and disease_duration_weeks, so an unparseable date
# produces NaN here instead of raising.
# NOTE(review): on non-timepoint_1 rows "age" was overwritten with the
# timepoint_1 value while sample_date is the row's own date — confirm that the
# resulting offset for follow-up visits is intended.
sample_dates = pd.to_datetime(df["sample_date"], errors="coerce")
diagnosis_dates = pd.to_datetime(df["date_of_diagnosis"], errors="coerce")
years_since_diagnosis = (sample_dates - diagnosis_dates).dt.days / 365.25
df["age_at_diagnosis"] = df["age"] - years_since_diagnosis

In [8]:
# Disease duration at sampling time, expressed in weeks.
sampled_on = pd.to_datetime(df["sample_date"])
diagnosed_on = pd.to_datetime(df["date_of_diagnosis"], errors="coerce")
delta_days = (sampled_on - diagnosed_on).dt.days
# A sample dated before the recorded diagnosis gets a duration of 0, not a negative value.
delta_days = delta_days.clip(lower=0)
df["disease_duration_weeks"] = delta_days / 7


In [9]:
# Smoking-status one-hot columns are written as constants: every row is
# flagged as non-smoker.
# NOTE(review): presumably smoking status is not recorded in this export — confirm.
smoking_flags = {
    "is_smoker_Ex-smoker": 0,
    "is_smoker_Smoker": 0,
    "is_smoker_Non-smoker": 1,
}
for flag_name, flag_value in smoking_flags.items():
    df[flag_name] = flag_value

In [10]:
# Remove rows without a fatigue score BEFORE thresholding. A missing score
# compares as False against the threshold (NaN > 50 is False), so thresholding
# first would silently label those rows as 0 and leave nothing to drop.
df.dropna(subset=["fatigue_tscore"], inplace=True)

# Binary outcome: 1 if tscore > 50, otherwise 0.
df["fatigue_outcome"] = (df["fatigue_tscore"] > 50).astype(int)

# The raw score is not a model input.
df.drop(columns=["fatigue_tscore"], inplace=True)

In [11]:
# Derive the binary montreal_* flags from the Paris columns using vectorised
# comparisons (the same style as the other one-hot cells in this notebook)
# rather than a row-wise apply. Missing values map to 0.
# montreal_upper_gi is 1 if cdparis_upper_gi is "L4a" or "L4b", otherwise 0.
df["montreal_upper_gi"] = df["cdparis_upper_gi"].isin(["L4a", "L4b"]).astype(int)
# montreal_perianal is 1 if cdparis_perianal is exactly "yes", otherwise 0.
df["montreal_perianal"] = (df["cdparis_perianal"] == "yes").astype(int)
df.drop(columns=["cdparis_upper_gi", "cdparis_perianal"], inplace=True)

In [12]:
# Translate the Paris classification columns into montreal_* indicator columns.
# Each entry is (new column, source column, source codes that produce a 1);
# any other source value, including a missing one, produces 0.
paris_to_montreal = [
    ("montreal_cd_location_L1 Ileal", "cdparis_location", ["L1"]),
    ("montreal_cd_location_L2 Colonic", "cdparis_location", ["L2"]),
    ("montreal_cd_location_L3 Ileocolonic", "cdparis_location", ["L3"]),
    (
        "montreal_cd_behaviour_B1 Non-stricturing, non-penetrating",
        "cdparis_behaviour",
        ["B1"],
    ),
    ("montreal_cd_behaviour_B2 Stricturing", "cdparis_behaviour", ["B2"]),
    ("montreal_cd_behaviour_B3 Penetrating", "cdparis_behaviour", ["B3"]),
    ("montreal_uc_extent_E1 Proctitis", "ucparis_extent", ["E1"]),
    ("montreal_uc_extent_E2 Left-sided", "ucparis_extent", ["E2"]),
    # E3 and E4 both feed the single "Extensive" column.
    ("montreal_uc_extent_E3 Extensive", "ucparis_extent", ["E3", "E4"]),
    # NOTE(review): source code S0 feeds "S1 Mild" and S1 feeds "S3 Severe";
    # this looks like a deliberate mapping between the two schemes — confirm.
    ("montreal_uc_severity_S1 Mild", "ucparis_severity", ["S0"]),
    ("montreal_uc_severity_S3 Severe", "ucparis_severity", ["S1"]),
]
for montreal_col, paris_col, paris_codes in paris_to_montreal:
    df[montreal_col] = df[paris_col].isin(paris_codes).astype(int)

# These two severity levels are never produced by the mapping above, so they
# are constant 0.
for constant_col in (
    "montreal_uc_severity_S2 Moderate",
    "montreal_uc_severity_S0 Remission",
):
    df[constant_col] = 0

# The source Paris columns are no longer needed.
df = df.drop(
    columns=[
        "cdparis_location",
        "cdparis_behaviour",
        "ucparis_extent",
        "ucparis_severity",
    ]
)


In [13]:
# Medication flags that are set to a constant 0 for every row.
# NOTE(review): presumably these are not recorded in this export — confirm.
columns_to_fill_with_zero = [
    "sampling_abx",
    "sampling_ciclosporin",
    "sampling_filgo",
    "sampling_mp",
    "sampling_risa",
    "sampling_tofa",
    "sampling_upa",
]

# Add (or overwrite) all of them in one step.
df = df.assign(**dict.fromkeys(columns_to_fill_with_zero, 0))

In [14]:
# Drop rows whose study_group is "non_ibd". Rows with a missing study_group
# are kept, because NaN != "non_ibd" evaluates to True.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df["study_group"] != "non_ibd"].copy()

In [15]:
# One indicator column per study group code; rows matching none of the codes
# get 0 in all three columns.
study_group_indicators = {
    "study_group_name_CD": "cd",
    "study_group_name_UC": "uc",
    "study_group_name_IBDU": "ibdu",
}
for indicator_col, group_code in study_group_indicators.items():
    df[indicator_col] = (df["study_group"] == group_code).astype(int)

# The original categorical column is replaced by the indicators.
df.drop(columns=["study_group"], inplace=True)

In [16]:
# The sampling_imm column is exposed under the name sampling_aza from here on.
df = df.rename(columns={"sampling_imm": "sampling_aza"})

In [17]:
# Collapse the three steroid route flags into one indicator: 1 if any of them
# equals 1 on that row, otherwise 0 (missing values count as "not 1").
steroid_route_columns = [
    "sampling_steroids_oral",
    "sampling_steroids_iv",
    "sampling_steroids_topical",
]
df["sampling_steroids"] = df[steroid_route_columns].eq(1).any(axis=1).astype(int)

# The per-route flags are replaced by the combined indicator.
df.drop(columns=steroid_route_columns, inplace=True)

In [18]:
# Convert sample_date to datetime if not already; unparseable values become NaT
df["sample_date"] = pd.to_datetime(df["sample_date"], errors="coerce")


# Map a month number to the name of its season
def get_season(month):
    """Return the season name for a month number.

    Months 12, 1, 2 are "winter"; 3-5 "spring"; 6-8 "summer"; 9-11 "autumn".
    Any other value (including NaN) returns None.
    """
    seasons = (
        ("winter", (12, 1, 2)),
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
        ("autumn", (9, 10, 11)),
    )
    for season_name, season_months in seasons:
        if month in season_months:
            return season_name
    return None


# One-hot encode the sampling season against the full, fixed set of seasons.
# Previously drop_first=True removed whichever season sorted first among those
# present in this dataset, and season_autumn was then hard-coded to 0. That
# made the output columns depend on the data and encoded real autumn samples
# as 0. With an explicit categorical all four season_* columns always exist
# and each holds 1 exactly for the rows sampled in that season.
# Rows without a valid sample_date get 0 in all four columns.
# Columns absent from the reference frame are removed later when df is
# reordered to its column order.
season_labels = ["autumn", "spring", "summer", "winter"]
df["season"] = pd.Categorical(
    df["sample_date"].dt.month.apply(get_season), categories=season_labels
)

# Categorical input makes get_dummies emit one 0/1 column per declared category.
df = pd.get_dummies(df, columns=["season"], dtype=int)

In [19]:
# Identifiers and raw dates are not model inputs; remove them now that the
# derived features exist.
non_feature_columns = [
    "sample_date",
    "study_id",
    "date_of_diagnosis",
    "redcap_event_name",
]
df = df.drop(columns=non_feature_columns)

In [20]:
# load_and_preprocess_data comes from utils.data; presumably it returns the
# preprocessed training DataFrame — TODO confirm.
training_df = load_and_preprocess_data()
# red_cell_count is not among the columns kept from the Mini-MUSIC export, so
# this creates the column and sets every row to the same value.
df["red_cell_count"] = training_df[
    "red_cell_count"
].median()  # constant for all rows: median of training_df's red_cell_count

In [21]:
# Reference frame: its columns define which features are kept, and in what order, below.
target_df = pd.read_csv("output/dnn/export/X_test.csv")

In [22]:
# Reorder df columns to follow the same order as target_df,
# then append 'fatigue_outcome' at the end if it exists in df.
# Columns of df that target_df does not have are dropped here; columns of
# target_df that df lacks are silently skipped.

# Get the common columns in the order of target_df
common_order = [col for col in target_df.columns if col in df.columns]

# Append 'fatigue_outcome' if it's not already in the common_order
if "fatigue_outcome" in df.columns and "fatigue_outcome" not in common_order:
    common_order.append("fatigue_outcome")

# .copy() yields an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
df = df[common_order].copy()

In [23]:
# Force calprotectin to a numeric dtype; entries that cannot be parsed become NaN.
df["calprotectin"] = pd.to_numeric(df["calprotectin"], errors="coerce")

In [24]:
# Numeric features whose missing values are replaced by the column median.
# The median is taken from this frame itself, not from the training data.
columns_to_fill_with_median = [
    "age",
    "height",
    "weight",
    "bmi",
    "age_at_diagnosis",
    "albumin",
    "crp",
    "haemoglobin",
    "white_cell_count",
    "neutrophils",
    "lymphocytes",
    "monocytes",
    "eosinophils",
    "basophils",
    "platelets",
    "urea",
    "creatinine",
    "sodium",
    "potassium",
    "calprotectin",
    "disease_duration_weeks",
    "diagnosis_year",
]

# Build every imputed column first, then write them all back in one assign.
median_filled = {
    name: df[name].fillna(df[name].median()) for name in columns_to_fill_with_median
}
df = df.assign(**median_filled)

In [25]:
# Encode sex as 1 for "male" and 0 for "female"; any other value, including a missing one, becomes NaN.
df["sex"] = df["sex"].map({"male": 1, "female": 0})

In [26]:
# Compare the column sets of the transformed frame and the reference frame.
df_columns = set(df.columns)
target_columns = set(target_df.columns)

common_columns = sorted(df_columns & target_columns)
only_in_filtered = sorted(df_columns - target_columns)
only_in_target = sorted(target_columns - df_columns)

# Each report is a heading line followed by the sorted list of column names.
print("Columns only in filtered_df:", only_in_filtered, sep="\n")
print("\nColumns only in target_df:", only_in_target, sep="\n")

Columns only in filtered_df:
['fatigue_outcome']

Columns only in target_df:
[]


In [27]:
# Write the transformed frame to CSV without the index column.
df.to_csv("data/minimusic_cleaned_validation.csv", index=False)

In [28]:
# Final row count of the exported frame.
print(f"Number of rows: {df.shape[0]}")

Number of rows: 79
