Loading the dataset

In [1]:
%pip install pandas
import pandas as pd

df = pd.read_csv('merged_npx_clinical.csv')



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


Each row has 1 patient per visit and each UniProt protein becomes a feature

In [2]:
df_pivot = df.pivot_table(
    index=['patient_id', 'visit_month', 'visit_id'],
    columns='UniProt',
    values='NPX'
).reset_index()

UPDRS scores and medication state per visit and merge them with the pivoted biomarker data

In [3]:
clinical_cols = ['patient_id', 'visit_month', 'visit_id', 
                 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 
                 'upd23b_clinical_state_on_medication']

df_clinical = df[clinical_cols].drop_duplicates(subset=['visit_id'])

# Merge biomarker features and clinical data
df_merged = pd.merge(df_pivot, df_clinical, on=['patient_id', 'visit_month', 'visit_id'], how='left')


Create a prediction label

In [4]:
df_merged['label'] = (
    (df_merged['updrs_3'] >= 10) | 
    (df_merged['upd23b_clinical_state_on_medication'] == 'On')
).astype(int)


Save the file

In [None]:
df_merged.to_csv('reshaped_early_pd_dataset.csv', index=False)

In [6]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------------- ------------------ 30.7/60.8 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 463.5 kB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.1 MB 3.3 MB/s eta 0:00:04
    --------------------------------------- 0.2/11.1 MB 2.5 MB/s eta 0:00:05
   - -------------------------------------- 0.5/11.1 MB 3.6 MB/s eta 


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer

# Load dataset
df = pd.read_csv("reshaped_early_pd_dataset.csv")

# 1. Separate feature types
clinical_features = [col for col in df.columns if 'updrs' in col or 'clinical' in col]
biomarker_features = [col for col in df.columns if col.startswith('P') or col.startswith('Q') or col.startswith('O')]

# 2. Drop features with >50% missing (optional for modeling)
threshold = 0.5
missing_fraction = df.isnull().mean()
to_drop = missing_fraction[missing_fraction > threshold].index.tolist()
df_cleaned = df.drop(columns=to_drop)

# 3. Impute clinical features (mode for categorical, median for numeric)
for col in clinical_features:
    if col in df_cleaned.columns:
        if df_cleaned[col].dtype == 'object':
            imp = SimpleImputer(strategy='most_frequent')
        else:
            imp = SimpleImputer(strategy='median')
        df_cleaned[col] = imp.fit_transform(df_cleaned[[col]])

# 4. Impute biomarker features using KNN
biomarker_features_kept = [col for col in biomarker_features if col in df_cleaned.columns]
knn_imputer = KNNImputer(n_neighbors=5)
df_cleaned[biomarker_features_kept] = knn_imputer.fit_transform(df_cleaned[biomarker_features_kept])

# 5. Optional: Add imputation flags
for col in biomarker_features_kept:
    df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)

# Save cleaned dataset
df_cleaned.to_csv("cleaned_early_pd_dataset.csv", index=False)


  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_cleaned[col + "_was_imputed"] = df[col].isnull().astype(int)
  df_clean