In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score


from helper_functions import (
    map_diagnoses, identify_and_report_duplicates, 
    rank_and_deduplicate, prepare_target_columns, 
    calculate_deltas
)

In [2]:
# Load the ADNIMERGE table into `df`.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding a mix of
# parsed types.
# NOTE(review): the warning echoed in this cell's saved output looks like
# pandas' DtypeWarning for mixed-type columns — confirm on the next run.
df = pd.read_csv('../tadpole_challenge 3/ADNIMERGE.csv', low_memory=False)

  df = pd.read_csv('../tadpole_challenge 3/ADNIMERGE.csv')


In [3]:
# Diagnosis lookups. dx_mapping is handed to map_diagnoses in the next cell;
# dx_order is not referenced by any cell visible here.
dx_mapping = {
    'SMC': 'CN',
    'LMCI': 'MCI',
    'EMCI': 'MCI',
    'AD': 'Dementia',
}
dx_order = ['CN', 'MCI', 'Dementia']

# Parse EXAMDATE as datetimes, then order the rows by RID and, within a RID,
# by EXAMDATE so the forward-fill below follows increasing exam dates.
df = (
    df
    .assign(EXAMDATE=pd.to_datetime(df['EXAMDATE']))
    .sort_values(by=['RID', 'EXAMDATE'])
)

# Within each RID, carry the most recent non-missing DX forward. Rows that
# precede a RID's first non-missing DX stay missing.
df['assumedDX'] = df.groupby('RID')['DX'].ffill()

In [4]:
# Run the helper_functions stages in sequence; each stage receives the frame
# returned by the previous one, and only map_diagnoses takes an extra
# argument (dx_mapping).
# Assumes every helper returns a DataFrame — TODO confirm in helper_functions.
df = (
    df
    .pipe(map_diagnoses, dx_mapping)
    .pipe(identify_and_report_duplicates)
    .pipe(rank_and_deduplicate)
    .pipe(prepare_target_columns)
    .pipe(calculate_deltas)
)

110 Patients with double Month entries


  df[columns_to_fill] = df[columns_to_fill].fillna(method='ffill').fillna('DNA')


In [5]:
# Every column selected from df for modelling. The order matters: it is the
# column order of the feature frame and determines the order of
# numeric_features below.
columns_for_model = [
    'VISCODE', 'SITE', 'COLPROT', 'ORIGPROT', 'DX_bl',
    'AGE', 'PTGENDER', 'PTEDUCAT', 'PTETHCAT', 'PTRACCAT', 'PTMARRY', 'APOE4',
    'FDG', 'CDRSB', 'ADAS11', 'ADAS13', 'ADASQ4', 'MMSE',
    'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting',
    'LDELTOTAL', 'TRABSCOR', 'FAQ', 'DX', 'mPACCdigit', 'mPACCtrailsB',
    'CDRSB_bl', 'ADAS11_bl', 'ADAS13_bl', 'ADASQ4_bl', 'MMSE_bl',
    'RAVLT_immediate_bl', 'RAVLT_learning_bl', 'RAVLT_forgetting_bl',
    'RAVLT_perc_forgetting_bl',
    'LDELTOTAL_BL', 'DIGITSCOR_bl', 'TRABSCOR_bl', 'FAQ_bl',
    'mPACCdigit_bl', 'mPACCtrailsB_bl', 'FLDSTRENG_bl', 'FSVERSION_bl',
    'Ventricles_bl', 'Hippocampus_bl', 'WholeBrain_bl', 'Entorhinal_bl',
    'Fusiform_bl', 'MidTemp_bl', 'ICV_bl', 'FDG_bl', 'PIB_bl',
    'Years_bl', 'Month_bl', 'Month', 'M', 'assumedDX', 'DX_bl_mapping',
    'CDRSB_delta', 'ADAS11_delta', 'ADAS13_delta', 'MMSE_delta', 'RAVLT_delta',
]

# The subset of columns_for_model to be treated as categorical.
categorical_features = [
    'VISCODE', 'SITE', 'COLPROT', 'ORIGPROT', 'DX_bl',
    'PTGENDER', 'PTEDUCAT', 'PTETHCAT', 'PTRACCAT', 'PTMARRY',
    'DX', 'FLDSTRENG_bl', 'FSVERSION_bl',
    'Month', 'M', 'assumedDX', 'DX_bl_mapping',
]

# Everything not declared categorical is numeric, kept in columns_for_model order.
_categorical_lookup = set(categorical_features)
numeric_features = [
    name for name in columns_for_model if name not in _categorical_lookup
]

In [6]:
# Feature frame: the model columns, in the order given by columns_for_model.
X = df[columns_for_model]

# Target: the DX6M column encoded as integer class ids. The fitted encoder is
# kept in label_encoder so ids can be mapped back to the original labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['DX6M'])

# Hold out 30% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is made over rows, and earlier cells group df by RID,
# so rows sharing a RID can presumably fall on both sides of the split —
# confirm this is acceptable for the accuracies reported later.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [7]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps transform from raising on a
# category that was not present when the encoder was fitted.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Classifiers to compare, keyed by the label used in the printed reports.
models = {
    'XGBoost': XGBClassifier(eval_metric='logloss'),
    'LightGBM': LGBMClassifier(verbose=-1),
    'CatBoost': CatBoostClassifier(verbose=0),
}

# model label -> {'CV Mean Accuracy': ..., 'Test Accuracy': ...}
results = {}
for model_name, model in models.items():
    # The same `preprocessor` object is placed in every pipeline. The
    # importance report below reads fitted state from `preprocessor` and
    # `model` directly, so it relies on pipeline.fit fitting those objects
    # in place.
    pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])

    # 5-fold cross-validation on the training split, then one fit on the whole
    # training split that is scored against the held-out test split.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report the ten largest feature importances when the model exposes them.
    if hasattr(model, 'feature_importances_'):
        # Column names after preprocessing: the numeric columns first, followed
        # by the one-hot columns, matching the ('num', 'cat') transformer order.
        onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
        feature_names = numeric_features + list(onehot_encoder.get_feature_names_out())

        importances = model.feature_importances_
        feature_importance_df = (
            pd.DataFrame({'Feature': feature_names, 'Importance': importances})
            .sort_values(by='Importance', ascending=False)
        )

        print(f"{model_name} Feature Importances:")
        print(feature_importance_df.head(10))
        print()

    results[model_name] = {
        'CV Mean Accuracy': np.mean(cv_scores),
        'Test Accuracy': accuracy,
    }


XGBoost Feature Importances:
            Feature  Importance
177           DX_CN    0.319670
179          DX_MCI    0.290191
178     DX_Dementia    0.174002
137  ORIGPROT_ADNI3    0.058709
123      SITE_141.0    0.005030
133   COLPROT_ADNI3    0.005010
48       VISCODE_bl    0.004459
41         Years_bl    0.004407
3             CDRSB    0.002914
56      VISCODE_m18    0.002897

LightGBM Feature Importances:
            Feature  Importance
41         Years_bl         450
0               AGE         392
13         TRABSCOR         324
32    Ventricles_bl         290
8   RAVLT_immediate         261
2               FDG         251
12        LDELTOTAL         248
38           ICV_bl         245
39           FDG_bl         233
15       mPACCdigit         233

CatBoost Feature Importances:
                Feature  Importance
179              DX_MCI   15.194575
231       assumedDX_MCI    6.583369
177               DX_CN    4.558094
229        assumedDX_CN    4.451149
230  assumedDX_Dementia  

In [8]:
# Print one block per model in `results`: the model label, its mean
# cross-validation accuracy and its test accuracy (both to four decimals),
# followed by a blank line.
for model_name, metrics in results.items():
    report_lines = (
        f"{model_name}:",
        f"  CV Mean Accuracy: {metrics['CV Mean Accuracy']:.4f}",
        f"  Test Accuracy: {metrics['Test Accuracy']:.4f}",
        "",  # the trailing empty entry yields the blank separator line
    )
    print("\n".join(report_lines))


XGBoost:
  CV Mean Accuracy: 0.9491
  Test Accuracy: 0.9557

LightGBM:
  CV Mean Accuracy: 0.9484
  Test Accuracy: 0.9577

CatBoost:
  CV Mean Accuracy: 0.9498
  Test Accuracy: 0.9608

