# Phenotype Formatting for GWAS Pipeline

This notebook prepares phenotype files used by the GWAS pipeline.

It keeps only the core social isolation phenotypes (FreqVisit and NumHousehold are combined into a single FreqSoc trait in the output files):
- Loneliness
- AbilityToConfide
- FreqVisit
- NumHousehold

Sex stratification and MR-specific splits are intentionally removed.


In [None]:
import os
import pandas as pd
from functools import reduce

# Root directory under which the raw phenotype tables read by this notebook live.
INPUT_DIR = "/home/mabdel03/data/files/Isolation_Genetics/GWAS"
# Directory that receives the formatted phenotype files written by this notebook.
OUTPUT_DIR = "/home/mabdel03/data/files/Isolation_Genetics/GWAS/Scripts/ukb21942/pheno"

# Create the output directory (and any missing parents) up front;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)


## Load Raw Phenotype Files


In [None]:
def _read_pheno_tsv(relative_path):
    """Load one tab-separated phenotype table stored beneath INPUT_DIR."""
    return pd.read_csv(f"{INPUT_DIR}/{relative_path}", sep="\t")


# One raw table per phenotype; the paths are relative to INPUT_DIR.
df_Loneliness = _read_pheno_tsv(
    "Isolation_Phenos/Pheno_Files/Loneliness/phenotype_2020.tsv"
)
df_AbilityToConfide = _read_pheno_tsv(
    "Isolation_Phenos/Pheno_Files/AbilityToConfide/phenotype_2110.tsv"
)
df_FreqVisit = _read_pheno_tsv(
    "Run2_Phenos/Create_Phenos/FreqVisits/FreqVisit.tsv"
)
df_NumHousehold = _read_pheno_tsv(
    "Run2_Phenos/Create_Phenos/NumHousehold/NumHousehold.tsv"
)

# Pair every table with the column name its phenotype will carry after merging.
df_list = list(
    zip(
        (df_Loneliness, df_AbilityToConfide, df_FreqVisit, df_NumHousehold),
        ("Loneliness", "AbilityToConfide", "FreqVisit", "NumHousehold"),
    )
)


## Fix Column Format and Merge


In [None]:
def _split_packed_row(packed_value):
    """Split one whitespace-packed "FID IID PHENO" string into its fields.

    The value is split on runs of whitespace rather than on single spaces, so
    repeated, leading, or trailing separators cannot produce empty tokens that
    would shift the IID or phenotype into the wrong column.

    Args:
        packed_value: String holding up to three whitespace-separated tokens.

    Returns:
        A ``(fid, iid, phenotype)`` tuple of strings. When fewer than three
        tokens are present the phenotype is "NA", and any absent FID/IID is
        "NA" as well. Tokens beyond the third are ignored.
    """
    tokens = packed_value.split()
    if len(tokens) >= 3:
        return tokens[0], tokens[1], tokens[2]
    fid = tokens[0] if tokens else "NA"
    iid = tokens[1] if len(tokens) > 1 else "NA"
    return fid, iid, "NA"


corrected_dfs = []

for df, phenotype_name in df_list:
    # Only the "FID" column of each loaded table is read; every entry is
    # expected to hold the FID, IID and phenotype value packed into one string.
    parsed_rows = [_split_packed_row(value) for value in df["FID"].astype(str)]
    corrected_dfs.append(
        pd.DataFrame(parsed_rows, columns=["FID", "IID", phenotype_name])
    )

# Outer-join all per-phenotype tables on the sample identifiers so a sample
# that is absent from some tables is still kept (with missing values there).
master_df = reduce(
    lambda left, right: pd.merge(left, right, on=["FID", "IID"], how="outer"),
    corrected_dfs,
)


## Create Derived Traits


In [None]:
# Clean the merged phenotype columns: trim whitespace and replace the UKBB
# special codes -1 / -3 and stringified missing values with the literal "NA".
for col in ["Loneliness", "AbilityToConfide", "FreqVisit", "NumHousehold"]:
    cleaned = master_df[col].astype(str).str.strip()
    # Compare the special codes numerically so float-formatted spellings such
    # as "-1.0" are caught as well as "-1"; non-numeric text coerces to NaN
    # and therefore never matches.
    numeric_view = pd.to_numeric(cleaned, errors="coerce")
    is_missing = cleaned.isin(["nan", "None"]) | numeric_view.isin([-1, -3])
    master_df[col] = cleaned.mask(is_missing, "NA")

# Keep the cleaned (string) values alongside the derived codings.
master_df["Loneliness_raw"] = master_df["Loneliness"]
master_df["AbilityToConfide_raw"] = master_df["AbilityToConfide"]
master_df["FreqVisit_raw"] = master_df["FreqVisit"]
master_df["NumHousehold_raw"] = master_df["NumHousehold"]

# Numeric views for the codings below; "NA" and any other non-numeric text
# become NaN through errors="coerce".
loneliness_num = pd.to_numeric(master_df["Loneliness_raw"], errors="coerce")
ability_num = pd.to_numeric(master_df["AbilityToConfide_raw"], errors="coerce")
freqvisit_num = pd.to_numeric(master_df["FreqVisit_raw"], errors="coerce")
numhouse_num = pd.to_numeric(master_df["NumHousehold_raw"], errors="coerce")

# Binary coding (higher = more isolated): "1" / "2" / "NA" strings.
# Loneliness: 0 -> "1", 1 -> "2", anything else -> "NA".
master_df["Loneliness_binary"] = loneliness_num.map({0.0: "1", 1.0: "2"}).fillna("NA")

# AbilityToConfide: a value of 0 -> "2", any other non-missing value -> "1".
master_df["AbilityToConfide_binary"] = ability_num.apply(
    lambda x: "NA" if pd.isna(x) else ("2" if x == 0 else "1")
)

# FreqSoc: "2" only when NumHousehold is exactly 1 and FreqVisit is 6 or 7;
# "NA" when either input is missing; "1" otherwise.
freqsoc_missing = numhouse_num.isna() | freqvisit_num.isna()
freqsoc_case = (numhouse_num == 1) & freqvisit_num.isin([6, 7])
master_df["FreqSoc_binary"] = freqsoc_case.map({True: "2", False: "1"}).mask(
    freqsoc_missing, "NA"
)

# Continuous coding (higher = more isolated)
master_df["Loneliness_continuous"] = loneliness_num
# Reverse the scale around 5 -- assumes the raw AbilityToConfide codes run
# from 0 to 5; TODO confirm against the field's coding.
master_df["AbilityToConfide_continuous"] = 5.0 - ability_num

# Z-score FreqVisit and the negated household size (each standardised over
# its own non-missing values), then average the two scores.
freqvisit_z = (freqvisit_num - freqvisit_num.mean()) / freqvisit_num.std()
numhouse_neg = -numhouse_num
numhouse_neg_z = (numhouse_neg - numhouse_neg.mean()) / numhouse_neg.std()

# The composite is defined only where both inputs are present.
master_df["FreqSoc_continuous"] = ((freqvisit_z + numhouse_neg_z) / 2).where(
    ~freqsoc_missing
)

# NOTE(review): this overwrites the parsed IID with the FID for every row --
# presumably because FID and IID are identical in this cohort; confirm.
master_df["IID"] = master_df["FID"]


## Save Outputs


In [None]:
# Sample identifier columns shared by both output tables.
id_columns = ["FID", "IID"]

# Pick the binary codings and drop the "_binary" suffix from their headers.
binary_renames = {
    "Loneliness_binary": "Loneliness",
    "AbilityToConfide_binary": "AbilityToConfide",
    "FreqSoc_binary": "FreqSoc",
}
binary_df = master_df[id_columns + list(binary_renames)].rename(columns=binary_renames)

# Same selection for the continuous codings.
continuous_renames = {
    "Loneliness_continuous": "Loneliness",
    "AbilityToConfide_continuous": "AbilityToConfide",
    "FreqSoc_continuous": "FreqSoc",
}
continuous_df = master_df[id_columns + list(continuous_renames)].rename(
    columns=continuous_renames
)

binary_output_file = f"{OUTPUT_DIR}/isolation_run_binary.tsv.gz"
continuous_output_file = f"{OUTPUT_DIR}/isolation_run_continuous.tsv.gz"

# Write both tables as gzip-compressed TSV without the index; missing values
# are written as "NA".
for table, destination in (
    (binary_df, binary_output_file),
    (continuous_df, continuous_output_file),
):
    table.to_csv(destination, sep="\t", index=False, compression="gzip", na_rep="NA")

print(f"Saved binary phenotype file: {binary_output_file}")
print(f"Saved continuous phenotype file: {continuous_output_file}")


In [None]:
# Quick sanity check of both output tables: dimensions first, then the
# leading rows of each.
previews = (
    ("Binary shape:", "\nBinary head:", binary_df),
    ("Continuous shape:", "\nContinuous head:", continuous_df),
)

for shape_label, _, table in previews:
    print(shape_label, table.shape)

for _, head_label, table in previews:
    print(head_label)
    display(table.head())
