# Data Harmonisation for Cohort Comparison Table

This notebook merges and concatenates data from the 3 cohorts for export into a unified dataframe. Subsequently processed in R's gtsummary package. The first table output provides the table structure. The second table provides the p-values for direct comparison between cohorts 1 and 2 (MUSIC and GIDAMPs). Montreal chi square testing is computed manually as the denominator is not the same for each subgroup (UC and CD).

In [1]:
import pandas as pd
from src.data.loading import load_fatigue_dataset, load_online_survey_dataset
from src.config.paths import DEMOGRAPHICS_DIR, ensure_output_dirs


In [2]:
df = load_fatigue_dataset()

# remove redcap_event_name where values are timepoint_2, timepoint_3, timepoint_4, timepoint_5
df = df[
    ~df["redcap_event_name"].isin(
        ["timepoint_2", "timepoint_3", "timepoint_4", "timepoint_5"]
    )
]

df_music_demographics = pd.read_csv("data/music_demographics_111224.csv")
df_gidamps_demographics = pd.read_csv("data/gidamps_demographics_111224.csv")


In [3]:
df_gidamps_demographics["ifx"] = df_gidamps_demographics["ifx"].map({1: 1, 2: 0})
df_gidamps_demographics["ciclo"] = df_gidamps_demographics["ciclo"].map({1: 1, 2: 0})

In [4]:
# Get previous drug therapy from demographics and merge onto fatigue dataset

cols_from_music = [
    "study_id",
    "baseline_aza",
    "baseline_mp",
    "baseline_mtx",
    "baseline_asa",
    "baseline_ifx",
    "baseline_ada",
    "baseline_goli",
    "baseline_vedo",
    "baseline_uste",
    "baseline_risa",
    "baseline_tofa",
    "baseline_filgo",
]

cols_from_gidamps = [
    "study_id",
    "aza",
    "mp",
    "mtx",
    "ifx",
    "ada",
    "golim",
    "vedo",
    "uste",
    "risa",
    "tofa",
    "filgo",
    "upa",
]

df_music = df_music_demographics[cols_from_music]
df_gidamps = df_gidamps_demographics[cols_from_gidamps]

df_gidamps.rename(
    columns={
        "aza": "baseline_aza",
        "mp": "baseline_mp",
        "mtx": "baseline_mtx",
        "ifx": "baseline_ifx",
        "ada": "baseline_ada",
        "golim": "baseline_goli",
        "vedo": "baseline_vedo",
        "uste": "baseline_uste",
        "risa": "baseline_risa",
        "tofa": "baseline_tofa",
        "filgo": "baseline_filgo",
        "upa": "baseline_upa",
    },
    inplace=True,
)

df_music["baseline_upa"] = 0

df_previous_drug_therapy = pd.concat([df_music, df_gidamps], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gidamps.rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_music["baseline_upa"] = 0


In [5]:
# Merge df_previous_drug_therapy onto df
df = df.merge(df_previous_drug_therapy, on="study_id", how="left")

In [6]:
# Calculate year of diagnosis
df["date_of_diagnosis"] = pd.to_datetime(df["date_of_diagnosis"], errors="coerce")
df["cucq_date"] = pd.to_datetime(df["cucq_date"], errors="coerce")

df["year_of_diagnosis"] = df["date_of_diagnosis"].dt.year

# Calculate disease_duration_weeks
df["disease_duration_weeks"] = (df["cucq_date"] - df["date_of_diagnosis"]).dt.days // 7

### Add in Cohort 3 data

In [7]:
# Cohort 3 Online Survey
df_online = load_online_survey_dataset()

cols_to_keep = [
    "age",
    "participant_location",
    "study_group",
    "self_reported_disease_activity",
    "cucq_5",
    "cucq_total",
]

df_online = df_online[cols_to_keep]
# Rename columns to match df
df_online.rename(
    columns={
        "study_group": "study_group_name",
    },
    inplace=True,
)
df_online["study"] = "ONLINE"

# Make study_group_name values uppercase
df_online["study_group_name"] = df_online["study_group_name"].str.upper()

# Concatenate df and df_online
df_combined = pd.concat([df, df_online], ignore_index=True)

In [8]:
# Remove unneeded columns from df_combined

cols_to_remove = [
    "study_id",
    "redcap_event_name",
    "date_of_diagnosis",
    "cucq_date",
    "baseline_eims_arthralgia_arthritis",
    "baseline_eims_ankylosing_spondylitis",
    "baseline_eims_erythema_nodosum",
    "baseline_eims_pyoderma_gangrenosum",
    "baseline_eims_uveitis",
    "baseline_eims_scleritis_episclerities",
    "urea",
    "creatinine",
    "sodium",
    "potassium",
    "haematocrit",
    "neutrophils",
    "lymphocytes",
    "monocytes",
    "eosinophils",
    "basophils",
    "ada_drug_level",
    "ifx_drug_level",
    "ifx_antibody_present",
    "ada_antibody_present",
    "cucq_1",
    "cucq_2",
    "cucq_3",
    "cucq_4",
    "cucq_6",
    "cucq_7",
    "cucq_8",
    "cucq_9",
    "cucq_10",
    "cucq_11",
    "cucq_12",
    "cucq_13",
    "cucq_14",
    "cucq_15",
    "cucq_16",
    "cucq_17",
    "cucq_18",
    "cucq_19",
    "cucq_20",
    "cucq_21",
    "cucq_22",
    "cucq_23",
    "cucq_24",
    "cucq_25",
    "cucq_26",
    "cucq_27",
    "cucq_28",
    "cucq_29",
    "cucq_30",
    "cucq_31",
    "cucq_32",
]

df_combined.drop(columns=cols_to_remove, inplace=True, errors="ignore")

In [9]:
# Merge L3 Ileocoloni and L3 Ileocolonic in montreal_cd_location
df_combined["montreal_cd_location"] = df_combined["montreal_cd_location"].replace(
    {"L3 Ileocoloni": "L3 Ileocolonic"}
)

In [10]:
# Fatigue outcome = 1 if cucq_5 >= 10

df_combined["fatigue_outcome"] = df_combined["cucq_5"].apply(
    lambda x: 1 if x >= 10 else 0
)

In [11]:
# Fill NA with 0 for these columns
cols_to_fill_na = [
    "baseline_aza",
    "baseline_mp",
    "baseline_mtx",
    "baseline_asa",
    "baseline_ifx",
    "baseline_ada",
    "baseline_goli",
    "baseline_vedo",
    "baseline_uste",
    "baseline_risa",
    "baseline_tofa",
    "baseline_filgo",
    "baseline_upa",
    "sampling_steroids",
    "sampling_abx",
    "sampling_asa",
    "sampling_aza",
    "sampling_mp",
    "sampling_ifx",
    "sampling_ada",
    "sampling_vedo",
    "sampling_uste",
    "sampling_tofa",
    "sampling_mtx",
    "sampling_ciclosporin",
    "sampling_filgo",
    "sampling_upa",
    "sampling_risa",
    "has_active_symptoms",
]

df_combined[cols_to_fill_na] = df_combined[cols_to_fill_na].fillna(0)

### Save to csv for R analysis

In [12]:
ensure_output_dirs()
output_path = DEMOGRAPHICS_DIR / "all_cohorts_demographics.csv"
df_combined.to_csv(output_path, index=False)