In [5]:
import pandas as pd
from collections import defaultdict

# Input files: the FASTA file that is parsed for chain sequences and the CSV
# that supplies the CDR sequences.
FASTA = "antibodies.fasta"
CDR = "core12_cdrs.csv"
# Destination CSV for the per-antibody framework summary.
OUT = "figure3_framework_hydrophobicity.csv"

# One-letter amino-acid codes that this analysis counts as hydrophobic.
HYDRO = set("AVILMFWY")

# Load the CDR table; only the "Antibody", "Chain" and "Sequence" columns are
# read below.
cdr_df = pd.read_csv(CDR)

# Collect every CDR sequence under its (antibody, chain) key so that each
# chain can later look up its own CDRs.
cdr_sequences = defaultdict(list)
for antibody_id, chain_id, cdr_seq in zip(
    cdr_df["Antibody"], cdr_df["Chain"], cdr_df["Sequence"]
):
    cdr_sequences[(antibody_id, chain_id)].append(cdr_seq)

def _annotate_chain(ab, chain, seq, cdr_seqs, hydrophobic):
    """Build one record per residue of ``seq``.

    Args:
        ab: Antibody identifier taken from the FASTA header.
        chain: Chain identifier taken from the FASTA header.
        seq: Amino-acid sequence of the chain.
        cdr_seqs: CDR sequences to locate inside ``seq``.
        hydrophobic: Collection of residue codes counted as hydrophobic.

    Returns:
        A list of dicts with keys "Antibody", "Chain", "Position" (1-based),
        "AA", "Region" ("CDR" or "Framework") and "Hydrophobic" (bool).
    """
    cdr_pos = set()
    for cdr_seq in cdr_seqs:
        # NOTE: only the first occurrence of each CDR is marked, and a CDR
        # that is not found in the chain is skipped silently, so its residues
        # are labelled "Framework".
        start = seq.find(cdr_seq)
        if start != -1:
            # find() is 0-based; positions in the output are 1-based.
            cdr_pos.update(range(start + 1, start + len(cdr_seq) + 1))
    return [
        {
            "Antibody": ab,
            "Chain": chain,
            "Position": i,
            "AA": aa,
            "Region": "CDR" if i in cdr_pos else "Framework",
            "Hydrophobic": aa in hydrophobic,
        }
        for i, aa in enumerate(seq, start=1)
    ]


# Per-residue records for every chain in the FASTA file.
rows = []

with open(FASTA) as f:
    ab, chain, seq_parts = None, None, []
    for line in f:
        line = line.strip()
        if line.startswith(">"):
            # A new header closes the previous record; emit it if it has
            # any sequence.
            seq = "".join(seq_parts)
            if seq:
                rows.extend(
                    _annotate_chain(
                        ab, chain, seq,
                        cdr_sequences.get((ab, chain), []), HYDRO,
                    )
                )
            # Headers are expected to look like ">{antibody}_{chain}"; the
            # split is on the last underscore so antibody names may contain
            # underscores themselves.
            header = line[1:]
            ab, chain = header.rsplit("_", 1)
            seq_parts = []
        else:
            # Sequence lines are collected and joined once per record.
            seq_parts.append(line)
    # The last record has no following header, so emit it here.
    seq = "".join(seq_parts)
    if seq:
        rows.extend(
            _annotate_chain(
                ab, chain, seq,
                cdr_sequences.get((ab, chain), []), HYDRO,
            )
        )

# One row per residue, as collected in ``rows``.
df = pd.DataFrame(rows)

# Keep framework residues only, then tally per antibody. Grouping is by
# "Antibody" alone, so all chains of an antibody are pooled together.
is_framework = df["Region"].eq("Framework")
per_antibody = df.loc[is_framework].groupby("Antibody")
summary = per_antibody.agg(
    framework_len=("AA", "count"),
    hydrophobic_count=("Hydrophobic", "sum"),
).reset_index()

# Fraction of framework residues that are hydrophobic.
summary["hydrophobic_ratio"] = summary["hydrophobic_count"].div(
    summary["framework_len"]
)

summary.to_csv(OUT, index=False)
print(summary)

     Antibody  framework_len  hydrophobic_count  hydrophobic_ratio
0        1bvl            468                172           0.367521
1        4lvh            866                300           0.346420
2  Antibody_A            360                130           0.361111
3  Antibody_B            360                214           0.594444
4  Antibody_C            358                134           0.374302
