1. Download dependencies and hg38 genome

In [1]:
!pip install pyfaidx 
!pip install ipywidgets
!wget -c http://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.fa.gz
!gunzip -k hg38.fa.gz

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


2. Generate input sequences

In [None]:
import pandas as pd
import requests
from tqdm import tqdm
from pyfaidx import Fasta
import numpy as np
import os

# Allocator option passed to PyTorch through its environment variable
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Variant table: only the first 100 rows of the CSV are kept
df = pd.read_csv("all.csv").head(100)

# Reference genome, opened through pyfaidx
genome = Fasta("hg38.fa")

# Value passed as `flank_size` to the sequence helpers, i.e. the number of
# bases taken on each side of a variant
window = 4096

# Fetch the reference bases surrounding a variant position
def get_sequence(row, flank_size=256):
    """Return the upper-cased reference window centred on the variant in ``row``.

    Args:
        row: Mapping-like record providing ``"#CHROM"`` and ``"POS"``. A
            missing ``chr`` prefix on the chromosome name is added. ``POS`` is
            treated as 1-based: the base at ``POS`` lands at index
            ``flank_size`` of the returned string.
        flank_size: Number of bases to take on each side of ``POS``.

    Returns:
        A string of exactly ``2 * flank_size + 1`` upper-case characters read
        from the module-level ``genome``, or ``None`` when the chromosome is
        not in ``genome``, when the slice comes back with a different length
        (for example near a chromosome end), or when any exception occurs
        (the exception is printed, not raised).
    """
    try:
        contig = str(row["#CHROM"])
        contig = contig if contig.startswith("chr") else "chr" + contig

        position = int(row["POS"])
        # Convert the 1-based position into a 0-based, half-open slice
        left = max(0, position - flank_size - 1)
        right = position + flank_size

        if contig not in genome:
            return None

        bases = genome[contig][left:right].seq.upper()

        # A clipped window would no longer have the variant at its centre
        expected_length = 2 * flank_size + 1
        return bases if len(bases) == expected_length else None
    except Exception as e:
        print(f"[⚠️ get_sequence] Error: {e}")
        return None

# Generate mutant sequence with ALT allele at the center
def generate_mutant_sequence(row, flank_size=256):
    """Return the context sequence of ``row`` with its centre base set to ALT.

    Only single-nucleotide substitutions are handled. Alleles are compared
    case-insensitively and the substituted base is written in upper case, so
    that lower-case REF/ALT values still line up with an upper-cased context.

    Args:
        row: Mapping-like record providing ``"Context_Sequence"``, ``"REF"``
            and ``"ALT"``.
        flank_size: Index of the variant base inside the context sequence
            (the number of flanking bases on its left).

    Returns:
        The mutated sequence as a string, or ``None`` when:
          * REF or ALT is not a string (e.g. a missing value read as NaN),
          * REF or ALT is not exactly one character long,
          * ALT is not a letter (symbols such as ``*`` or ``.`` are not bases),
          * the context base at ``flank_size`` does not equal REF,
          * any exception occurs (the exception is printed, not raised).
    """
    try:
        ref, alt = row["REF"], row["ALT"]

        # Missing alleles are not strings; there is nothing to substitute
        if not isinstance(ref, str) or not isinstance(alt, str):
            return None

        ref = ref.strip().upper()
        alt = alt.strip().upper()

        if len(ref) != 1 or len(alt) != 1:
            return None

        # Reject placeholder symbols so they are never spliced into the DNA
        if not alt.isalpha():
            return None

        seq = list(row["Context_Sequence"])
        mut_pos = flank_size  # Mutation is centered

        # Sanity check: the reference genome must agree with the stated REF
        if seq[mut_pos].upper() != ref:
            return None

        seq[mut_pos] = alt
        return "".join(seq)
    except Exception as e:
        print(f"[⚠️ generate_mutant_sequence] Error: {e}")
        return None

# Register tqdm's progress-reporting `progress_apply` on pandas objects
tqdm.pandas()

# Build each derived column in turn (the reference window first, then the
# mutant derived from it) and drop the rows for which the builder gave None
for column, builder in (
    ("Context_Sequence", get_sequence),
    ("Mutant_Sequence", generate_mutant_sequence),
):
    df[column] = df.progress_apply(lambda row: builder(row, flank_size=window), axis=1)
    df.dropna(subset=[column], inplace=True)

print(f"Successfully generated context and mutant sequences. Total valid records: {len(df)}")

3. Compute the Evo2 score

In [None]:
from evo2.models import Evo2
import torch

# Report the accelerator before the model load so the diagnostic is visible
# even if loading fails. The device queries are only issued when CUDA is
# available, because they raise otherwise.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name())

# Load model
model = Evo2('evo2_7b_base')

# Get reference and variant sequence lists (row-aligned with df)
ref_seqs = df["Context_Sequence"].tolist()
var_seqs = df["Mutant_Sequence"].tolist()

# Number of sequences handed to the model per scoring batch
SCORE_BATCH_SIZE = 20

# Score sequences using the Evo 2 model
print(f"Scoring likelihoods of {len(ref_seqs)} reference sequences with Evo 2...")
ref_scores = model.score_sequences(ref_seqs, batch_size=SCORE_BATCH_SIZE)

print(f"Scoring likelihoods of {len(var_seqs)} variant sequences with Evo 2...")
var_scores = model.score_sequences(var_seqs, batch_size=SCORE_BATCH_SIZE)

# Store results and compute delta score (variant minus reference)
df["Ref_Score"] = ref_scores
df["Var_Score"] = var_scores
df["Delta_Score"] = df["Var_Score"] - df["Ref_Score"]

# Optionally drop sequences to save space
df.drop(columns=["Context_Sequence", "Mutant_Sequence"], inplace=True)

# Save to CSV
df.to_csv("evo2.csv", index=False)