> **Data Source Acknowledgment**
>
> This dataset is derived from **The Vascular Contributors to Prodromal Alzheimer’s Disease (Emory_Vascular) study**.  
> Project homepage: [https://adknowledgeportal.synapse.org/Explore/Studies/DetailsPage/StudyDetails?Study=syn18909507](https://adknowledgeportal.synapse.org/Explore/Studies/DetailsPage/StudyDetails?Study=syn18909507)
>
> The specific data files and their Synapse links are listed below:  
>
> 1. *Emory_Vascular-Clinical.Cognitive.Cardiovascular.csv*  
>    Source: [https://www.synapse.org/Synapse:syn21317575](https://www.synapse.org/Synapse:syn21317575)
>
> 2. *599_Ihab_DESeq2_NormCounts_synapse.xlsx*  
>    Source: [https://www.synapse.org/Synapse:syn22219479](https://www.synapse.org/Synapse:syn22219479)
>
> 3. *Emory_Vascular_individual_human_metadata.csv*  
>    Source: [https://www.synapse.org/Synapse:syn21317577](https://www.synapse.org/Synapse:syn21317577)
>



# Process Clinical Data

## Load Clinical Data

In [None]:
import pandas as pd
import numpy as np
import os

# Raw clinical / cognitive / cardiovascular table (file 1 in the acknowledgment above).
clinical_data_path = "raw_data/Emory_Vascular-Clinical.Cognitive.Cardiovascular.csv"

# A lone "." cell is read as NaN, in addition to pandas' default NA markers.
clinical_data = pd.read_csv(clinical_data_path, na_values=["."])

# Directory the cells below write their CSVs into.
# exist_ok=True creates it only when missing, without a separate (race-prone)
# os.path.exists() check, so re-running the cell is safe.
process_data_path = "processed_data"
os.makedirs(process_data_path, exist_ok=True)

display(clinical_data)

## Phenotype

In [None]:
# Raw clinical column name -> column name used in phenotype.csv.
columns_to_extract = {
    "individualID": "individual_id",
    "hxmi": "myocardial_infarction",
    "afib": "atrial_fibrillation",
    "angina": "angina",
    "hxchf": "congestive_heart_failure",
    "hxbp": "hypertension",
    "hxhighchol": "hypercholesterolemia"
}
id_col = "individual_id"

# Pull the selected columns (a KeyError here means a raw column is missing)
# and switch to the output names.
phenotype_df = clinical_data.loc[:, list(columns_to_extract)].rename(columns=columns_to_extract)

# Every column except the identifier: missing values become -1, then the
# column is cast to int. The identifier column is left untouched.
value_cols = [name for name in phenotype_df.columns if name != id_col]
phenotype_df = phenotype_df.fillna(dict.fromkeys(value_cols, -1))
phenotype_df = phenotype_df.astype(dict.fromkeys(value_cols, int))

phenotype_csv = os.path.join(process_data_path, "phenotype.csv")
phenotype_df.to_csv(phenotype_csv, index=False)
print("Phenotype data saved")


## Exposure

In [None]:
# Raw clinical column name -> column name used in exposure.csv.
columns_to_extract = {
    "individualID": "individual_id",
    "smoker": "tobacco_exposure",
    "drinker": "ethanol_exposure"
}
id_col = "individual_id"

raw_columns = list(columns_to_extract)
exposure_df = clinical_data.loc[:, raw_columns].rename(columns=columns_to_extract)

# Non-identifier columns: fill missing entries with -1 and cast to int in one
# pass; the identifier column is not filled or cast.
exposure_cols = [name for name in exposure_df.columns if name != id_col]
exposure_df = exposure_df.fillna({name: -1 for name in exposure_cols}).astype(
    {name: int for name in exposure_cols}
)

exposure_csv = os.path.join(process_data_path, "exposure.csv")
exposure_df.to_csv(exposure_csv, index=False)

print("Exposure data saved")

# Process Gene Expression Data (Transcript)

In [None]:
import pandas as pd
import os

# Normalized count workbook (file 2 in the acknowledgment above).
transcript_data_path = r"raw_data/599_Ihab_DESeq2_NormCounts_synapse.xlsx"
process_data_path = "processed_data"

# exist_ok=True replaces the separate os.path.exists() check: the directory is
# created only when missing and re-running the cell never fails on it.
os.makedirs(process_data_path, exist_ok=True)

transcript_data_df = pd.read_excel(transcript_data_path)

display(transcript_data_df)

# Missing cells are replaced with 0 across the whole sheet.
transcript_data_df = transcript_data_df.fillna(0)

# Drop the annotation columns when present (errors="ignore" tolerates their absence).
transcript_data_df = transcript_data_df.drop(columns=["gene_name", "description"], errors="ignore")

# Transpose the DataFrame: the first remaining column becomes the index, so
# after .T each original column is a row and each index value is a column.
# NOTE(review): presumably the first column is the gene identifier and the
# other columns are samples — confirm against the workbook layout.
transcript_data_df = transcript_data_df.set_index(transcript_data_df.columns[0])
transcript_data_df_t = transcript_data_df.T.reset_index()
# reset_index() names the new column "index" because the transposed index is unnamed.
transcript_data_df_t.rename(columns={"index": "sample_id"}, inplace=True)

# Clean up sample IDs: keep only the text after the last underscore
# (labels without an underscore are kept whole).
transcript_data_df_t["sample_id"] = transcript_data_df_t["sample_id"].apply(lambda x: str(x).split("_")[-1])

# Save the processed data
transcript_data_df_t.to_csv(os.path.join(process_data_path, "transcript.csv"), index=False)

# Plain string: the original f-string had no placeholders.
print("Gene expression data saved")


# Process Label

In [None]:
import pandas as pd
import os

# Per-individual metadata table (file 3 in the acknowledgment above); the
# cells below take the diagnosis label from it.
label_data_path = "raw_data/Emory_Vascular_individual_human_metadata.csv"

# Read with pandas' default NA handling (no extra na_values are passed here).
label_data_df = pd.read_csv(label_data_path)

# Show the raw table so its columns can be inspected before any are selected.
display(label_data_df)

In [None]:
# Narrow the metadata to the identifier and the raw diagnosis text. The
# .copy() gives an independent frame, so adding columns to it later does not
# act on a slice of the full table.
label_data_df = label_data_df.loc[:, ["individualID", "diagnosis"]].copy()

# Rows per distinct diagnosis value; value_counts() leaves NaN out by default.
diagnosis_counts = label_data_df["diagnosis"].value_counts()

print("Diagnosis counts:", diagnosis_counts, sep="\n")

In [None]:
# Encode the diagnosis text as an integer label:
#   "normal" -> 0, "mci" (mild cognitive impairment) -> 1, anything else
#   (including a missing diagnosis) -> -1.
# NOTE(review): the lookup is an exact, case-sensitive match — confirm the raw
# values are spelled exactly "normal" / "mci" in the counts printed above.
encode_map = {"normal": 0, "mci": 1}
mapped_diagnosis = label_data_df["diagnosis"].map(encode_map)  # unmapped values become NaN
label_data_df["diagnosis_code"] = mapped_diagnosis.fillna(-1).astype(int)

# NOTE(review): label.csv keeps the raw "individualID" header, whereas
# phenotype.csv and exposure.csv use "individual_id" — confirm downstream
# joins expect this.
encoded_label_data_df = label_data_df.loc[:, ["individualID", "diagnosis_code"]]
label_csv = os.path.join(process_data_path, "label.csv")
encoded_label_data_df.to_csv(label_csv, index=False)

print("Label data saved")
