In [1]:
import pandas as pd

# Load the two CSV exports dated 2025-04-22: the MUSIC main table and the
# GI-DAMPs sampling table. Both are read with pandas defaults.
df_music, df_gidamps = map(
    pd.read_csv,
    (
        "data/music_main_2025-04-22.csv",
        "data/gidamps_sampling_2025-04-22.csv",
    ),
)

In [2]:
# Count how many rows in each cohort are dated after the cutoff.

# Parse the date columns up front so they can be compared against a Timestamp.
df_music['cucq_date'] = pd.to_datetime(df_music['cucq_date'])
df_gidamps['sampling_date'] = pd.to_datetime(df_gidamps['sampling_date'])

# Single cutoff shared by both cohorts. The comparisons below are strict (>),
# so rows dated exactly 2025-01-01 are excluded.
cutoff_date = pd.to_datetime('2025-01-01')

# MUSIC: number of rows whose cucq_date falls after the cutoff.
# NOTE: this name holds a row count (int) here; a later cell rebinds it to the
# filtered DataFrame itself.
music_future_dates = int((df_music['cucq_date'] > cutoff_date).sum())

# GI-DAMPs: rows with a recorded (non-NA) cucq_5 that were sampled after the cutoff.
filtered_gidamps = df_gidamps[
    df_gidamps['cucq_5'].notna()
    & (df_gidamps['sampling_date'] > cutoff_date)
]

print(f"Number of GIDAMPs rows with non-NA cucq_5 after Jan 1st 2025: {len(filtered_gidamps)}")

# Show the first few matching rows as a sanity check on the filter.
print("\nFirst few rows of filtered data:")
print(filtered_gidamps[['sampling_date', 'cucq_5']].head())

print(f"Number of MUSIC rows after Jan 1st 2025: {music_future_dates}")

Number of GIDAMPs rows with non-NA cucq_5 after Jan 1st 2025: 1

First few rows of filtered data:
     sampling_date  cucq_5
1091    2025-02-03    14.0
Number of MUSIC rows after Jan 1st 2025: 35


In [3]:
# Load the previously combined GI-DAMPs + MUSIC fatigue dataset.
df_music_existing = pd.read_csv(
    "raw_data/gidamps_music_combined_fatigue_df_111224_with_cmh.csv"
)

# Remove the literal text 'MID-' from MUSIC study IDs (plain-string match, not
# a regex). Presumably this makes the IDs comparable with those in
# df_music_existing -- TODO confirm the ID format of the existing dataset.
df_music['study_id'] = df_music['study_id'].str.replace(
    pat='MID-', repl='', regex=False
)

In [5]:
# MUSIC rows dated strictly after cutoff_date (cutoff_date comes from an earlier cell).
music_future_dates = df_music.loc[df_music['cucq_date'] > cutoff_date]

# Distinct study IDs on each side of the comparison.
music_future_study_ids = music_future_dates['study_id'].unique()
existing_study_ids = df_music_existing['study_id'].unique()

# Study IDs that appear in the post-cutoff MUSIC rows but not in the existing
# combined dataset.
missing_study_ids = set(music_future_study_ids) - set(existing_study_ids)

print(f"Number of unique study_ids in music_future_dates not in df_music_existing: {len(missing_study_ids)}")
print(f"Missing study_ids: {missing_study_ids}")

Number of unique study_ids in music_future_dates not in df_music_existing: 9
Missing study_ids: {'92-34', '90-121', '92-26', '92-35', '91-52', '92-33', '191-2', '92-28', '91-54'}


In [6]:
# Keep only the post-cutoff MUSIC rows (music_future_dates) whose study_id is
# one of the IDs absent from df_music_existing.
music_future_dates_missing = music_future_dates.loc[
    lambda frame: frame['study_id'].isin(missing_study_ids)
]

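We could potentially add 10 more data points from GI-DAMPs and MUSIC dated after 2025-01-01: 1 GI-DAMPs row with a non-NA `cucq_5`, plus 9 MUSIC study IDs that are not yet in the existing combined dataset.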