In [45]:
import numpy as np
import os
import pandas as pd

pd.set_option('future.no_silent_downcasting', True)
pd.set_option('display.max_columns', 300) 


In [46]:
# Load the ADNIMERGE table. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids the
# mixed-types DtypeWarning and gives columns a single consistent dtype.
df = pd.read_csv('../tadpole_challenge 3/ADNIMERGE.csv', low_memory=False)

  df = pd.read_csv('../tadpole_challenge 3/ADNIMERGE.csv')


In [47]:
# Parse each date-like column into a proper datetime dtype
date_columns = ['EXAMDATE', 'EXAMDATE_bl', 'update_stamp']
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column])

In [48]:
# Convert 'ABETA', 'TAU' and 'PTAU' to float, resolving entries that carry a '<' or '>' symbol.
# A value containing '<' has the symbol removed and 1 subtracted from the number.
# A value containing '>' has the symbol removed and 1 added to the number.
# (Presumably these symbols mark readings outside the measurable range -- confirm
# against the data dictionary.)

def _resolve_bound_marker(value):
    """Return the numeric stand-in for one raw cell.

    Strings containing '<' become (number - 1) and strings containing '>'
    become (number + 1). Anything else (plain numbers, numeric strings, NaN)
    is returned untouched and left to the final float cast.
    """
    if isinstance(value, str):
        if '<' in value:
            return float(value.replace('<', '')) - 1
        if '>' in value:
            return float(value.replace('>', '')) + 1
    return value


for col in ['ABETA', 'TAU', 'PTAU']:
    # Element-wise mapping works whether pandas read the column as object
    # (strings mixed with numbers) or as float64, so this cell can be re-run
    # safely; the .str accessor would raise AttributeError on a numeric column.
    df[col] = df[col].map(_resolve_bound_marker).astype(float)


In [49]:
# Order rows by patient (RID) and, within each patient, by exam date
df = df.sort_values(['RID', 'EXAMDATE'], ascending=True)

In [50]:
# Step 1: keep only the visits that have a diagnosis (DX is not NaN).
# Reason: the model needs a diagnosis at every visit to track disease progression;
# a visit without one cannot contribute to the analysis.

has_diagnosis = df['DX'].notna()
df = df.loc[has_diagnosis].reset_index(drop=True)

# Step 2: keep only patients with more than one visit.
# Reason: progression can only be observed across follow-up visits, so a patient
# seen a single time carries no information about how the disease evolves.

visit_counts = df.groupby('RID').size()
multiple_visits_rid = visit_counts.index[visit_counts.gt(1)]
df = df.loc[df['RID'].isin(multiple_visits_rid)]

In [51]:
# Remove every patient whose baseline visit (VISCODE == 'bl') is already diagnosed as "Dementia".
# Reason: the focus might be on tracking disease progression from a non-demented state, so patients
# who start with a dementia diagnosis may not be relevant for this specific analysis.

# One boolean mask over all rows replaces the per-patient loop. It also avoids
# using an array comparison as an `if` condition, which is ambiguous when a
# patient has several baseline rows and deprecated (an error on recent NumPy)
# when a patient has no baseline row at all.
is_dementia_baseline = (df['VISCODE'] == 'bl') & (df['DX'] == 'Dementia')
rids_to_drop = sorted(df.loc[is_dementia_baseline, 'RID'].unique().tolist())

# Drop patients whose baseline diagnosis was "Dementia"
df = df.loc[~df['RID'].isin(rids_to_drop)].reset_index(drop=True)


In [52]:
# Drop patients whose diagnosis sequence (ordered by the 'M' column) ever moves to a
# milder category: CN after MCI, or CN/MCI after Dementia. Such reversals may be
# problematic for the analysis of disease progression.

def _dx_reverts(dx_sequence):
    """Return True if any visit is followed, at any later visit, by a milder diagnosis.

    dx_sequence is a list of DX labels in visit order. Only 'MCI' (milder: 'CN')
    and 'Dementia' (milder: 'CN', 'MCI') are checked; other labels never trigger.
    """
    milder_than = {'MCI': ('CN',), 'Dementia': ('CN', 'MCI')}
    for position, dx_now in enumerate(dx_sequence[:-1]):
        remaining = dx_sequence[position + 1:]
        if any(milder in remaining for milder in milder_than.get(dx_now, ())):
            return True
    return False


rids_to_drop = [
    rid
    for rid, visits in df.groupby('RID')
    if _dx_reverts(visits.sort_values(by=['M'])['DX'].tolist())
]

# Keep only the patients without a reversal/improvement in their history
keep_row = ~df['RID'].isin(rids_to_drop)
df = df.loc[keep_row].reset_index(drop=True)

In [53]:
# For each look-ahead interval (6, 12, 24, 48, 60 months) add a column DX_<interval>M that holds
# the DX recorded for the same RID at Month + interval.
# If the patient has no row at that month, the column is filled with the string 'DidNotAttend'.

intervals = [6, 12, 24, 48, 60]

# Index every visit once by (RID, Month) so each look-up is a dictionary access
# instead of a scan of the whole DataFrame per row. setdefault keeps the first
# row in DataFrame order when a patient has several rows for the same month.
dx_by_visit = {}
for rid, month, dx in zip(df['RID'], df['Month'], df['DX']):
    dx_by_visit.setdefault((rid, month), dx)

for interval in intervals:
    col_name = f'DX_{interval}M'
    df[col_name] = [
        dx_by_visit.get((rid, month + interval), 'DidNotAttend')
        for rid, month in zip(df['RID'], df['Month'])
    ]

In [54]:
# Keep only the rows recorded at the baseline visit (VISCODE 'bl')
bl_df = df.loc[df['VISCODE'].eq('bl')]

In [55]:
# Write the baseline table to disk, creating the output folder on first run
output_dir = 'processed_data'
output_file = 'processed_data/bl_df.csv'
os.makedirs(output_dir, exist_ok=True)
bl_df.to_csv(output_file, index=False)
