1. Download dependencies and the pre-computed scores

In [5]:
!pip install pysam tqdm
!wget https://huggingface.co/datasets/songlab/gpn-msa-hg38-scores/resolve/main/scores.tsv.bgz
!wget https://huggingface.co/datasets/songlab/gpn-msa-hg38-scores/resolve/main/scores.tsv.bgz.tbi

Defaulting to user installation because normal site-packages is not writeable
--2025-08-02 15:20:56--  https://huggingface.co/datasets/songlab/gpn-msa-hg38-scores/resolve/main/scores.tsv.bgz
Resolving huggingface.co (huggingface.co)... 108.156.211.90, 108.156.211.51, 108.156.211.95, ...
Connecting to huggingface.co (huggingface.co)|108.156.211.90|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/65cd10f7b2e8d2486ae05692/eb0b78fc02866033c785d0a67570c91cc336329dad4c51db22796b442f32f60d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250802%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250802T152056Z&X-Amz-Expires=3600&X-Amz-Signature=beeebc775a950ab849f8f34b6e2034aae5b3841e3b68236c3084869c39694e86&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27scores.tsv.bgz%3B+filename%3D%22scores.tsv.bgz%22%3B&x-id=GetObje

2. Extract the GPN-MSA scores

In [3]:
import pandas as pd
import pysam
from tqdm import tqdm

# Annotate variants from `input_csv` with pre-computed GPN-MSA scores looked up
# in the tabix-indexed score table, then write the annotated table to
# `output_csv`. Variants without a matching (REF, ALT) record get an empty score.

# Configure file paths
input_csv = "all.csv"
bgz_file = "scores.tsv.bgz"
output_csv = "gpnmsa.csv"
# Only the first `max_variants` rows of the input are annotated.
max_variants = 100

# Load variant data; take an explicit copy so the column assignments below act
# on an independent frame rather than on a slice of the full table.
df = pd.read_csv(input_csv).head(max_variants).copy()
df["#CHROM"] = df["#CHROM"].astype(str)
df["POS"] = df["POS"].astype(int)

# One score (or None) per variant, in row order
gpn_scores = []
# (region, error message) for every lookup that raised instead of returning
failed_queries = []

# Open tabix-indexed file; closed in `finally` even if the loop is interrupted
tbx = pysam.TabixFile(bgz_file)
try:
    variants = zip(df["#CHROM"], df["POS"], df["REF"], df["ALT"])
    # Query GPN-MSA score for each variant
    for chrom, pos, ref, alt in tqdm(variants, total=len(df), desc="Querying GPN-MSA"):
        # Single-base region string: start and end are both `pos`
        region = f"{chrom}:{pos}-{pos}"
        score = None
        try:
            for record in tbx.fetch(region=region):
                fields = record.strip().split("\t")
                if len(fields) < 5:
                    continue
                # Columns 3-5 are read as REF, ALT, score
                r_ref, r_alt, r_score = fields[2], fields[3], fields[4]
                if r_ref == ref and r_alt == alt:
                    score = float(r_score)
                    break
        except (ValueError, OSError) as exc:
            # Best effort: keep going, but remember the failure instead of
            # silently turning it into a missing score. ValueError covers a
            # region the index does not know and an unparsable score field;
            # OSError covers read errors on the compressed file.
            failed_queries.append((region, str(exc)))

        gpn_scores.append(score)
finally:
    tbx.close()

# Add scores to DataFrame and save
df["GPN_MSA_score"] = gpn_scores
df.to_csv(output_csv, index=False)

# Make lookup problems visible rather than leaving silent gaps in the output
n_missing = sum(score is None for score in gpn_scores)
if failed_queries:
    first_region, first_error = failed_queries[0]
    print(
        f"Warning: {len(failed_queries)} of {len(df)} queries raised an error "
        f"(first: {first_region}: {first_error}). "
        "Check that chromosome names match the score file and that the download is complete."
    )
print(f"{len(df) - n_missing} of {len(df)} variants received a score.")
print(f"Done! GPN-MSA scores saved to {output_csv}")

Querying GPN-MSA:   1%|          | 1/100 [00:00<00:10,  9.23it/s][W::bgzf_read_block] EOF marker is absent. The input may be truncated
Querying GPN-MSA: 100%|██████████| 100/100 [00:00<00:00, 837.04it/s]

Done! GPN-MSA scores saved to gpnmsa.csv



