In [1]:
import pandas as pd

# Load the two raw CSV exports; both file names carry a 2025-04-30 date stamp.
df_music, df_gidamps = (
    pd.read_csv(csv_path)
    for csv_path in (
        "raw_data/music_main_2025-04-30.csv",
        "raw_data/gidamps_sampling_2025-04-30.csv",
    )
)

In [2]:
# Parse the MUSIC cucq_date column so it can be compared against a Timestamp.
df_music['cucq_date'] = pd.to_datetime(df_music['cucq_date'])

# Parse the GI-DAMPs sampling_date column the same way.
df_gidamps['sampling_date'] = pd.to_datetime(df_gidamps['sampling_date'])

# Single cutoff shared by both filters. The comparisons below are strict (>),
# so rows dated exactly 2025-01-01 are excluded.
cutoff_date = pd.to_datetime('2025-01-01')

# Number of MUSIC rows dated after the cutoff (an integer row count here).
# NOTE(review): unlike the GI-DAMPs filter below, this count does not require
# a non-NA cucq_5 - confirm that is intended.
music_future_dates = df_music[df_music['cucq_date'] > cutoff_date].shape[0]

# GI-DAMPs rows that have a cucq_5 value and a sampling_date after the cutoff.
filtered_gidamps = df_gidamps[
    df_gidamps['cucq_5'].notna()
    & (df_gidamps['sampling_date'] > cutoff_date)
]

print(f"Number of GIDAMPs rows with non-NA cucq_5 after Jan 1st 2025: {len(filtered_gidamps)}")

# Show the first few matching rows as a visual sanity check.
print("\nFirst few rows of filtered data:")
print(filtered_gidamps[['sampling_date', 'cucq_5']].head())

print(f"Number of MUSIC rows after Jan 1st 2025: {music_future_dates}")

Number of GIDAMPs rows with non-NA cucq_5 after Jan 1st 2025: 1

First few rows of filtered data:
     sampling_date  cucq_5
1091    2025-02-03    14.0
Number of MUSIC rows after Jan 1st 2025: 38


In [3]:
# Drop the literal "MID-" marker from MUSIC study IDs. Note that a literal
# (regex=False) str.replace removes every occurrence of the text, not only a
# leading one.
df_music['study_id'] = df_music['study_id'].str.replace(pat='MID-', repl='', regex=False)

# Load the previously combined GI-DAMPs + MUSIC fatigue dataset.
df_music_existing = pd.read_csv("raw_data/gidamps_music_combined_fatigue_df_111224_with_cmh.csv")

In [4]:
# MUSIC rows dated strictly after cutoff_date. This rebinds music_future_dates
# to a DataFrame of the matching rows.
music_future_dates = df_music.loc[df_music['cucq_date'] > cutoff_date]

# Distinct study IDs on each side of the comparison.
music_future_study_ids = music_future_dates['study_id'].unique()
existing_study_ids = df_music_existing['study_id'].unique()

# IDs seen after the cutoff that the existing combined dataset does not contain.
missing_study_ids = set(music_future_study_ids) - set(existing_study_ids)

print(
    f"Number of unique study_ids in music_future_dates not in df_music_existing: {len(missing_study_ids)}",
    f"Missing study_ids: {missing_study_ids}",
    sep="\n",
)

Number of unique study_ids in music_future_dates not in df_music_existing: 9
Missing study_ids: {'191-2', '91-54', '92-28', '90-121', '91-52', '92-35', '92-33', '92-26', '92-34'}


In [5]:
# Keep only the post-cutoff MUSIC rows whose study_id is in missing_study_ids,
# i.e. absent from the existing combined dataset.
music_future_dates_missing = music_future_dates.loc[
    music_future_dates['study_id'].isin(missing_study_ids)
]

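We could potentially add 10 more data points from GI-DAMPs and MUSIC participants recruited after 2025-01-01 (1 GI-DAMPs row with a non-NA cucq_5, plus 9 MUSIC study IDs not yet in the existing dataset).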

In [None]:
# Columns to pull from both filtered_gidamps and music_future_dates_missing.
# Entries are grouped by apparent theme for readability; the list order is
# what the extraction will use.
cols_to_extract = [
    # Column the GI-DAMPs filter requires to be non-NA
    "cucq_5",
    # Participant-level fields
    "age", "sex", "height", "weight", "smoking_status",
    "study_group", "date_of_diagnosis",
    # montreal_* fields
    "montreal_cd_location", "montreal_cd_behaviour",
    "montreal_upper_gi", "montreal_perianal",
    "montreal_uc_extent", "montreal_uc_severity",
    # Symptom flag
    "has_active_symptoms",
    # Blood count fields
    "haemoglobin", "red_cell_count", "white_cell_count",
    "neutrophils", "lymphocytes", "monocytes",
    "eosinophils", "basophils", "platelets",
    # Other laboratory fields
    "urea", "creatinine", "sodium", "potassium",
    "crp", "albumin", "calprotectin",
    # sampling_* fields (presumably treatments recorded at sampling - TODO confirm)
    "sampling_steroids", "sampling_abx", "sampling_asa",
    "sampling_aza", "sampling_mp", "sampling_ifx",
    "sampling_ada", "sampling_vedo", "sampling_uste",
    "sampling_tofa", "sampling_mtx", "sampling_ciclosporin",
    "sampling_filgo", "sampling_upa", "sampling_risa",
]

In [9]:
# Confirm every requested column exists in filtered_gidamps before extracting.
missing_cols = [
    name for name in cols_to_extract
    if name not in filtered_gidamps.columns
]
print(
    f"Missing columns in filtered_gidamps: {missing_cols}"
    if missing_cols
    else "All columns are present in filtered_gidamps."
)

All columns are present in filtered_gidamps.


In [10]:
# Confirm every requested column exists in music_future_dates_missing before extracting.
missing_cols = [
    name for name in cols_to_extract
    if name not in music_future_dates_missing.columns
]
print(
    f"Missing columns in music_future_dates_missing: {missing_cols}"
    if missing_cols
    else "All columns are present in music_future_dates_missing."
)

All columns are present in music_future_dates_missing.


Study IDs 90-121 and 191-2: data to be recollected.