# Phenotype Formatting for GWAS Pipeline

This notebook prepares phenotype files used by the GWAS pipeline.

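It loads only the core social isolation phenotypes:
- Loneliness
- AbilityToConfide
- FreqVisit
- NumHousehold

FreqVisit and NumHousehold are combined into a single derived trait, FreqSoc, so the saved file contains Loneliness, AbilityToConfide, and FreqSoc.

Sex stratification and MR-specific splits are intentionally removed.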


In [None]:
import os
import pandas as pd
from functools import reduce

# Root directory under which the raw phenotype tables are read.
INPUT_DIR = "/home/mabdel03/data/files/Isolation_Genetics/GWAS"
# Directory that receives the formatted phenotype file produced by this notebook.
OUTPUT_DIR = "/home/mabdel03/data/files/Isolation_Genetics/GWAS/Scripts/ukb21942/pheno"

# Create the output directory (including missing parents); an existing
# directory is not treated as an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)


## Load Raw Phenotype Files


In [None]:
# Locations of the four raw phenotype tables, all rooted at INPUT_DIR.
loneliness_path = f"{INPUT_DIR}/Isolation_Phenos/Pheno_Files/Loneliness/phenotype_2020.tsv"
confide_path = f"{INPUT_DIR}/Isolation_Phenos/Pheno_Files/AbilityToConfide/phenotype_2110.tsv"
freq_visit_path = f"{INPUT_DIR}/Run2_Phenos/Create_Phenos/FreqVisits/FreqVisit.tsv"
num_household_path = f"{INPUT_DIR}/Run2_Phenos/Create_Phenos/NumHousehold/NumHousehold.tsv"

# Every table is parsed as tab-separated text.
df_Loneliness = pd.read_csv(loneliness_path, sep="\t")
df_AbilityToConfide = pd.read_csv(confide_path, sep="\t")
df_FreqVisit = pd.read_csv(freq_visit_path, sep="\t")
df_NumHousehold = pd.read_csv(num_household_path, sep="\t")

# Pair each frame with the column name its phenotype will carry after
# reformatting: a list of (DataFrame, phenotype_name) tuples.
df_list = list(
    zip(
        (df_Loneliness, df_AbilityToConfide, df_FreqVisit, df_NumHousehold),
        ("Loneliness", "AbilityToConfide", "FreqVisit", "NumHousehold"),
    )
)


## Fix Column Format and Merge


In [None]:
def _split_packed_row(raw):
    """Split one packed ``FID IID phenotype`` string into its three fields.

    Splitting is done on runs of any whitespace, so leading, trailing or
    repeated spaces can never produce an empty token that would be mistaken
    for a real FID, IID or phenotype value. Rows with fewer than three tokens
    are padded with the string "NA"; tokens beyond the third are ignored.

    Args:
        raw: The string value taken from a table's "FID" column.

    Returns:
        A ``(fid, iid, phenotype)`` tuple of strings.
    """
    tokens = raw.split()
    # A non-positive multiplier yields an empty list, so complete rows are
    # left untouched.
    tokens.extend(["NA"] * (3 - len(tokens)))
    return tokens[0], tokens[1], tokens[2]


# The loaded tables hold the whole whitespace-delimited record inside their
# "FID" column (presumably because the rows are space-delimited while the
# files are read as tab-separated -- confirm against the raw files). Rebuild
# each table with proper FID / IID / <phenotype> string columns.
corrected_dfs = []

for df, phenotype_name in df_list:
    rows = [_split_packed_row(raw) for raw in df["FID"].astype(str)]
    corrected_dfs.append(
        pd.DataFrame(rows, columns=["FID", "IID", phenotype_name])
    )

# Outer-join all phenotype tables on the sample identifiers so that an
# individual present in any table is kept; phenotypes absent from a table
# are NaN in the merged frame.
master_df = reduce(
    lambda left, right: pd.merge(left, right, on=["FID", "IID"], how="outer"),
    corrected_dfs,
)


## Create Derived Traits


In [None]:
# Individuals absent from one of the merged phenotype tables carry NaN after
# the outer merge. NaN never compares equal to the string "NA", so it has to
# be normalised BEFORE the traits are derived; otherwise those individuals
# fall through to the default "1" code instead of being marked as missing.
master_df.fillna("NA", inplace=True)

household = master_df["NumHousehold"]
visits = master_df["FreqVisit"]

# FreqSoc coding:
#   "2"  -> NumHousehold is "1" and FreqVisit is "6" or "7"
#   "NA" -> either input is missing
#   "1"  -> everything else
# (presumably PLINK-style 1 = control / 2 = case -- confirm with the pipeline)
freqsoc_flagged = (household == "1") & visits.isin(["6", "7"])
freqsoc_missing = (household == "NA") | (visits == "NA")
master_df["FreqSoc"] = (
    pd.Series("1", index=master_df.index)
    .mask(freqsoc_missing, "NA")
    .mask(freqsoc_flagged, "2")
)

# AbilityToConfide is recoded to "2" for the raw value "0", keeps "NA" when
# missing, and becomes "1" for every other raw value.
confide = master_df["AbilityToConfide"]
master_df["AbilityToConfide"] = (
    confide.where(confide == "NA", "1").mask(confide == "0", "2")
)

# Shift Loneliness from 0/1 coding to 1/2 coding; other values (including
# "NA") pass through unchanged.
master_df["Loneliness"] = master_df["Loneliness"].replace({"0": "1", "1": "2"})

# NOTE(review): this overwrites the parsed IID with FID; presumably the two
# identifiers are identical in this dataset -- confirm.
master_df["IID"] = master_df["FID"]


## Save Output


In [None]:
# Restrict the export to the identifier columns plus the three output traits,
# copying so the exported frame is independent of master_df.
export_columns = ["FID", "IID", "Loneliness", "AbilityToConfide", "FreqSoc"]
final_df = master_df.loc[:, export_columns].copy()

# Write a gzip-compressed, tab-separated table without the pandas index.
output_file = f"{OUTPUT_DIR}/isolation_run_control.tsv.gz"
final_df.to_csv(
    output_file,
    sep="\t",
    index=False,
    compression="gzip",
)

print(f"Saved phenotype file: {output_file}")


In [None]:
# Report the (rows, columns) shape of the exported table.
print("Shape:", final_df.shape)
# Preview the first rows; as the cell's last expression the notebook renders it.
final_df.head()
