1. Download dependencies and model

In [2]:
from IPython.display import clear_output
!pip uninstall -y numpy
!pip install "numpy<2.0.0" --upgrade
! pip install alphagenome

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy<2.0.0
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4
Defaulting to user installation because normal site-packages is not writeable


2. Compute the AlphaGenome scores

In [None]:
import os
from io import StringIO

import numpy as np
import pandas as pd
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from tqdm import tqdm

# Obtain your API key from: https://deepmind.google.com/science/alphagenome/
# The key is read from the ALPHA_GENOME_API_KEY environment variable so that it
# is never stored in (or accidentally shared with) the notebook file.
API_KEY = os.environ.get("ALPHA_GENOME_API_KEY", "")
if not API_KEY:
    # Fail fast: without a key every scoring request below would fail one by
    # one and the notebook would still write a result file full of NaNs.
    raise ValueError(
        "No AlphaGenome API key found. Set the ALPHA_GENOME_API_KEY "
        "environment variable before running this cell."
    )

# Initialize the dna_model client
dna_model = dna_client.create(API_KEY)

# Configuration
# Everything a user is expected to tune lives here.
CSV_PATH = "e.csv"  # Replace with your local CSV file
ORGANISM_NAME = "human"  # Options: "human", "mouse"
SEQUENCE_LENGTH = "1MB"  # Options: "2KB", "16KB", "100KB", "500KB", "1MB"

# One switch per scorer; set a switch to False to leave that scorer out.
scorer_selections = dict(
    rna_seq=True,
    cage=True,
    procap=True,
    atac=True,
    dnase=True,
    chip_histone=True,
    chip_tf=True,
    polyadenylation=True,
    splice_sites=True,
    splice_site_usage=True,
    splice_junctions=True,
)

# Initialize model and scorers
# Translate the configured organism name into the client's enum member.
organism_map = {
    'human': dna_client.Organism.HOMO_SAPIENS,
    'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism = organism_map[ORGANISM_NAME]

# Supported lengths are looked up under keys of the form "SEQUENCE_LENGTH_<size>".
sequence_length_key = f'SEQUENCE_LENGTH_{SEQUENCE_LENGTH}'
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[sequence_length_key]

# Keep only the recommended scorers whose (lower-cased) name is switched on.
all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
selected_scorers = [
    candidate
    for scorer_name, candidate in all_scorers.items()
    if scorer_selections.get(scorer_name.lower(), False)
]


def _is_unsupported(candidate):
    """Return True when `candidate` cannot be used with the chosen organism."""
    supported = variant_scorers.SUPPORTED_ORGANISMS[candidate.base_variant_scorer]
    if organism.value not in supported:
        return True
    # PROCAP output is additionally excluded for mouse.
    return (
        candidate.requested_output == dna_client.OutputType.PROCAP
        and organism == dna_client.Organism.MUS_MUSCULUS
    )


# Collect first, then remove, so the list is not modified while it is scanned.
unsupported_scorers = [
    candidate for candidate in selected_scorers if _is_unsupported(candidate)
]
for scorer in unsupported_scorers:
    selected_scorers.remove(scorer)

# Load and preprocess input CSV
# The input must provide the columns #CHROM, POS, REF and ALT. A `variant_id`
# column is optional and is generated as <chrom>_<pos>_<ref>_<alt>_b38 when absent.
MAX_VARIANTS = 10  # Cap on the number of rows scored; set to None to score every row

csv = pd.read_csv(CSV_PATH)

# Validate the schema before any column is touched, so that a malformed file
# produces a clear ValueError instead of a bare KeyError further down.
# `variant_id` is exempt here because it is created below when missing.
required_columns = ['variant_id', '#CHROM', 'POS', 'REF', 'ALT']
for col in required_columns:
    if col != 'variant_id' and col not in csv.columns:
        raise ValueError(f"Missing required column: {col}")

# .copy() gives an independent frame, so the column assignments below do not
# write into a view of the frame returned by read_csv.
if MAX_VARIANTS is not None:
    csv = csv.head(MAX_VARIANTS)
csv = csv.copy()
print("Total rows in CSV:", len(csv))

# Normalise chromosome names to the "chr"-prefixed form.
chrom = csv["#CHROM"].astype(str)
csv["#CHROM"] = chrom.where(chrom.str.startswith("chr"), "chr" + chrom)

# Cast alleles to str so that missing values (NaN) become the text "nan":
# they then fail the single-base pattern and are filtered out, instead of
# breaking string concatenation or producing a NaN boolean mask.
ref_bases = csv["REF"].astype(str)
alt_bases = csv["ALT"].astype(str)

if 'variant_id' not in csv.columns:
    csv["variant_id"] = (
        csv["#CHROM"] + "_" +
        csv["POS"].astype(str) + "_" +
        ref_bases + "_" +
        alt_bases + "_b38"
    )

# Keep only variants on the listed chromosomes ...
valid_chroms = {f"chr{i}" for i in range(1, 23)} | {"chrX", "chrY", "chrM"}
on_valid_chrom = csv["#CHROM"].isin(valid_chroms)

# ... whose REF and ALT are each a single A/C/G/T base.
is_single_base_substitution = (
    ref_bases.str.match("^[ACGT]$") &
    alt_bases.str.match("^[ACGT]$")
)

csv = csv[on_valid_chrom & is_single_base_substitution].copy()

# Score variants and track max absolute values
# For every input row, request scores from the model and record the largest
# absolute quantile score and the largest absolute raw score across all rows
# of the tidy score table. Results are keyed by the row's own `variant_id`,
# so they line up with the input table regardless of how that ID is formatted.
quantile_max = {}
raw_max = {}


def _running_abs_max(current, candidate):
    """Return the larger of two values, treating NaN/None as "no value yet".

    A plain `candidate > current` comparison is always False when either side
    is NaN, which would let one NaN score permanently mask every real score.
    """
    if candidate is None or pd.isna(candidate):
        return current
    if current is None or pd.isna(current):
        return candidate
    return max(current, candidate)


for i, row in tqdm(csv.iterrows(), total=len(csv), desc="Scoring variants"):
    try:
        vid = row["variant_id"]

        variant = genome.Variant(
            chromosome=str(row["#CHROM"]),
            position=int(row["POS"]),
            reference_bases=row["REF"],
            alternate_bases=row["ALT"],
            name=vid,
        )
        # Scoring window: the variant's reference interval resized to the
        # configured sequence length.
        interval = variant.reference_interval.resize(sequence_length)

        variant_scores = dna_model.score_variant(
            interval=interval,
            variant=variant,
            variant_scorers=selected_scorers,
            organism=organism,
        )

        df_scores = variant_scorers.tidy_scores([variant_scores])
        # Defensive: nothing to aggregate if no tidy scores came back.
        if df_scores is None or len(df_scores) == 0:
            continue

        # Only this row's variant was scored, so every row of df_scores belongs
        # to `vid`. Series.max() skips NaN, so missing scores are ignored.
        q_abs = df_scores["quantile_score"].abs().max()
        r_abs = df_scores["raw_score"].abs().max()

        # Merge with any earlier value so repeated variant_ids keep the overall max.
        quantile_max[vid] = _running_abs_max(quantile_max.get(vid, np.nan), q_abs)
        raw_max[vid] = _running_abs_max(raw_max.get(vid, np.nan), r_abs)

    except Exception as e:
        # Deliberately best-effort: report the failing record and move on, so a
        # single bad variant does not abort the whole run.
        print(f"\n Error scoring variant {i}: {e}")
        print("Variant record causing error:")
        print(row.to_dict())
        print("-" * 60)

# Build final result table and save
# df_top holds exactly one row per distinct variant_id. Building it from the
# de-duplicated IDs keeps the merge below many-to-one, so input rows that share
# a variant_id are not multiplied in the output.
print("Building top-abs dataframe...")
unique_variant_ids = csv["variant_id"].drop_duplicates()
df_top = pd.DataFrame(
    [
        {
            "variant_id": vid,
            # Variants that failed to score have no entry and are reported as NaN.
            "quantile_abs": quantile_max.get(vid, np.nan),
            "raw_abs": raw_max.get(vid, np.nan),
        }
        for vid in unique_variant_ids
    ],
    # Explicit columns keep the frame mergeable even when no variants remain.
    columns=["variant_id", "quantile_abs", "raw_abs"],
)

# validate= makes pandas raise if the right-hand keys are ever not unique.
df_final = csv.merge(df_top, on="variant_id", how="left", validate="many_to_one")
df_final.to_csv("alphagenome.csv", index=False)
print("Scoring completed and results saved.")

Total rows in CSV: 10


Scoring variants: 100%|██████████| 10/10 [00:37<00:00,  3.76s/it]

Building top-abs dataframe...
Scoring completed and results saved.



