In [None]:
## Author: Linda Karlsson, 2024

# import packages

import pandas as pd
import pickle
import numpy as np
from pathlib import PurePath
import os
import warnings

# Silence NumPy's VisibleDeprecationWarning for the whole notebook.
# The class lives in numpy.exceptions (NumPy >= 1.25); NumPy 2.0 removed the
# top-level np.VisibleDeprecationWarning alias, so resolve it from there first.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:  # older NumPy without the numpy.exceptions module
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
import Fns_refprot_normalization as fns

# Working directory of the notebook kernel, held as a pure (non-I/O) path object
path = PurePath(os.getcwd())
# Ancestor directories of the working directory (parents[0] is the immediate parent)
parents = path.parents

#### This notebook runs univariate linear regression models that compare each biomarker alone versus as a ratio to a reference protein, for three outcomes: tau PET Braak I-IV, tau PET Braak V-VI and amyloid PET.

### Load data and create biomarker ratios

In [None]:
## Point these two settings at the raw data. The file is read through a joined path, so the
## working directory of the kernel is never changed (and cannot be left changed if loading fails).
data_dir = str(path) + '' ## INSERT PATH (appended to the notebook directory)
data_file = '' ## INSERT NAME OF DATA FILE
if not data_file:
    # Fail with an explicit message instead of an obscure file-system error on the empty placeholder.
    raise ValueError('Set data_file to the name of the raw data file before running this cell.')
df = pd.read_csv(os.path.join(data_dir, data_file))

## if applicable, also do some data cleaning. We removed two outliers and used only baseline visits. For example:
# .copy() makes the filtered frame independent of the raw frame, so columns added to df
# in later cells do not act on a slice (avoids pandas' SettingWithCopyWarning).
df = df[df['Visit'] == 0].copy()

In [None]:
# Names of the outcome columns in df
tau_pet_braak_I_IV = 'tnic_cho_com_I_IV' ## INSERT TEMPORAL TAU PET COMPOSITE
tau_pet_braak_V_VI = 'tnic_cho_com_V_VI' ## INSERT CORTICAL TAU PET COMPOSITE
amyloid_pet = 'fnc_ber_com_composite' ## INSERT AMYLOID PET COMPOSITE

# Biomarker columns per fluid (adjust according to biomarker names in dataset)
csf_biomarkers = [
    'CSF_ptau217_Lilly',
    'CSF_ptau181_Lilly',
    'CSF_MTBRtau243_WashU',
    'CSF_ptau205_WashU',
    'CSF_Ab42_Elecsys',
    'CSF_SNAP25_UGOT',
    'CSF_Neurogranin_NTK',
]

plasma_biomarkers = [
    'Plasma_ptau217_Lilly',
    'Plasma_ptau181_Lilly',
    'Plasma_ptau205_WashU',
    'Plasma_eMTBRtau243_WashU',
    'Plasma_Ab42_WashU',
]

# Reference protein column per fluid (the denominator of the biomarker ratios)
csf_refprot = 'CSF_Ab40_Elecsys'
plasma_refprot = 'Plasma_Ab40_WashU'

# Add the biomarker / reference-protein ratios to df and keep the list of ratio names per fluid
df, csf_biomarkers_normalized = fns.create_biomarker_ratios(df, csf_biomarkers, csf_refprot)
df, plasma_biomarkers_normalized = fns.create_biomarker_ratios(df, plasma_biomarkers, plasma_refprot)

# One frame per outcome: only rows where that outcome is present, re-indexed from 0
df_tau_pet_braak_I_IV = df.loc[df[tau_pet_braak_I_IV].notna()].reset_index(drop=True)
df_tau_pet_braak_V_VI = df.loc[df[tau_pet_braak_V_VI].notna()].reset_index(drop=True)
df_amyloid_pet = df.loc[df[amyloid_pet].notna()].reset_index(drop=True)

# Report how many rows are available for each outcome
print(f'Length of df for tau PET braak I-IV: {len(df_tau_pet_braak_I_IV)}')
print(f'Length of df for tau PET braak V-VI: {len(df_tau_pet_braak_V_VI)}')
print(f'Length of df for amyloid PET: {len(df_amyloid_pet)}')

## 1. Linear regression models, maximizing number of participants for each biomarker.

In [None]:
### Number of bootstrap iterations for the linear regression models in section 1.
# The corresponding manuscript uses 10000, which makes the running time quite long.
# To try the code out, a lower value such as 100 or 1000 is recommended
# (1000 being the default for all functions, per the original author).
n_iter = 10_000

In [None]:
# Set subset = True to run the analyses only on individuals with complete data, i.e. rows with no
# missing value in any biomarker, biomarker ratio, the reference protein or the outcome of the fluid
# being analysed (e.g., for head-to-head rankings).
# With subset = False, each outcome uses every row in which that outcome is available.

subset = False

In [None]:
## Create univariate linear regression models for each biomarker with max available data
## (or, when subset is True, only with rows that are complete for the fluid and outcome).

# (label used in the progress message, outcome column, frame restricted to rows with that outcome)
_outcome_specs = [
    ('Tau pet braak I-IV', tau_pet_braak_I_IV, df_tau_pet_braak_I_IV),
    ('Tau pet braak V-VI', tau_pet_braak_V_VI, df_tau_pet_braak_V_VI),
    ('Amyloid pet', amyloid_pet, df_amyloid_pet),
]


def _compare_across_outcomes(fluid, biomarkers, biomarkers_normalized, refprot):
    """Run fns.compare_biomarkers once per PET outcome for one fluid.

    Reads the notebook-level settings `df`, `subset` and `n_iter`.

    Parameters:
        fluid: label printed in the progress message ('CSF' or 'plasma').
        biomarkers: list of biomarker column names.
        biomarkers_normalized: list of the corresponding ratio column names.
        refprot: name of the reference protein column.

    Returns:
        (results, analysis_df): the list of fns.compare_biomarkers results in the
        order Braak I-IV, Braak V-VI, amyloid PET, and the frame used for the
        last outcome.
    """
    results = []
    analysis_df = None
    for label, outcome, outcome_df in _outcome_specs:
        if subset:
            # Complete cases only: every biomarker, ratio, the reference protein and the outcome present
            needed_cols = biomarkers + biomarkers_normalized + [refprot, outcome]
            analysis_df = df[needed_cols].dropna().reset_index(drop=True)
        else:
            # All rows where the outcome is available; missing biomarkers are left to compare_biomarkers
            analysis_df = outcome_df.copy()
        results.append(
            fns.compare_biomarkers(analysis_df, biomarkers, biomarkers_normalized, refprot,
                                   outcome=outcome, n_iter=n_iter)
        )
        print(f'{label} done for {fluid}')
    return results, analysis_df


## CSF
all_res_csf, df_csf = _compare_across_outcomes('CSF', csf_biomarkers, csf_biomarkers_normalized, csf_refprot)
# Keep the per-outcome names available for later cells
(all_res_csf_tau_pet_braak_I_IV,
 all_res_csf_tau_pet_braak_V_VI,
 all_res_csf_amyloid_pet) = all_res_csf


## Plasma
all_res_plasma, df_plasma = _compare_across_outcomes('plasma', plasma_biomarkers, plasma_biomarkers_normalized, plasma_refprot)
# Keep the per-outcome names available for later cells
(all_res_plasma_tau_pet_braak_I_IV,
 all_res_plasma_tau_pet_braak_V_VI,
 all_res_plasma_amyloid_pet) = all_res_plasma

In [None]:
## Get P-values and FDR correction
# NOTE(review): both result lists are overwritten with the return value of fns.get_pvalues, so
# re-running this cell feeds already-processed results back in. Presumably it should be run only
# once after the model cell; verify against fns.get_pvalues.
all_res_csf = fns.get_pvalues(all_res_csf,csf_biomarkers_normalized,n_iter=n_iter)
all_res_plasma = fns.get_pvalues(all_res_plasma,plasma_biomarkers_normalized,n_iter=n_iter)

## Save all results in a pickle file
# CSF results first, followed by plasma results.
# NOTE(review): `+` assumes fns.get_pvalues returns lists (concatenation); confirm.
results_all_lin = all_res_csf + all_res_plasma

# Relative file name: the pickle is written to the current working directory.
with open('linreg_results.pkl', 'wb') as f:
    pickle.dump(results_all_lin, f)