In [1]:
import pandas as pd
import subprocess
import os
import sys
from io import StringIO

# ==============================
# CONFIG
# ==============================
INPUT_CSV = r"variant_official_final_clean.csv"
OUTPUT_CSV = r"variant_annotated_official.csv"
CACHE_DIR = r"vep_cache"  # nơi chứa homo_sapiens/115_GRCh38
ASSEMBLY = "GRCh38"
# ==============================

print(f"Đọc file CSV: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)
df

Đọc file CSV: variant_official_final_clean.csv


Unnamed: 0,AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,PhenotypeIDS,PhenotypeList,...,Assembly,ChromosomeAccession,CHROM,Start,Stop,ReviewStatus,VariationID,POS,REF,ALT
0,3544463,Deletion,NM_001005484.2(OR4F5):c.9+1354del,79501,OR4F5,HGNC:14825,Uncertain significance,0,"Human Phenotype Ontology:HP:0000547,MONDO:MOND...",Retinitis pigmentosa,...,GRCh38,NC_000001.11,chr1,66927,66927,"criteria provided, single submitter",3385321,66926,AG,A
1,2193183,single nucleotide variant,NM_001005484.2(OR4F5):c.107A>G (p.Glu36Gly),79501,OR4F5,HGNC:14825,Likely benign,0,MedGen:CN169374,not specified,...,GRCh38,NC_000001.11,chr1,69134,69134,"criteria provided, single submitter",2205837,69134,A,G
2,4039319,single nucleotide variant,NM_001005484.2(OR4F5):c.281A>G (p.Lys94Arg),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,GRCh38,NC_000001.11,chr1,69308,69308,"criteria provided, single submitter",3925305,69308,A,G
3,3374047,single nucleotide variant,NM_001005484.2(OR4F5):c.287T>G (p.Ile96Ser),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,GRCh38,NC_000001.11,chr1,69314,69314,"criteria provided, single submitter",3205580,69314,T,G
4,4039320,single nucleotide variant,NM_001005484.2(OR4F5):c.377T>C (p.Met126Thr),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,GRCh38,NC_000001.11,chr1,69404,69404,"criteria provided, single submitter",3925306,69404,T,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3875326,706348,single nucleotide variant,NM_002186.3(IL9R):c.991G>A (p.Gly331Arg),3581,IL9R,HGNC:6030,Benign,0,MedGen:C3661900,not provided,...,GRCh38,NC_000024.10,chrY,57196354,57196354,"criteria provided, multiple submitters, no con...",769193,57196354,G,A
3875327,706349,single nucleotide variant,NM_002186.3(IL9R):c.1094G>A (p.Arg365His),3581,IL9R,HGNC:6030,Benign,0,MedGen:C3661900,not provided,...,GRCh38,NC_000024.10,chrY,57196457,57196457,"criteria provided, multiple submitters, no con...",771064,57196457,G,A
3875328,4197692,single nucleotide variant,NC_000023.11:g.156022415C>A,-1,,,Likely benign,0,MedGen:C3661900,not provided,...,GRCh38,NC_000024.10,chrY,57208935,57208935,"criteria provided, single submitter",4085007,57208935,C,A
3875329,3184976,single nucleotide variant,NC_000023.11:g.156023032T>C,-1,,,Likely benign,0,MedGen:C3661900,not provided,...,GRCh38,NC_000024.10,chrY,57209552,57209552,"criteria provided, single submitter",3024664,57209552,T,C


In [2]:
required_cols = {"CHROM", "POS", "REF", "ALT"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"File CSV thiếu các cột: {required_cols - set(df.columns)}")

# workdir = thư mục nơi file CSV nằm → mount vào /input
workdir = os.path.dirname(os.path.abspath(INPUT_CSV))
host_workdir = workdir.replace("\\", "/")
host_cache = os.path.abspath(CACHE_DIR).replace("\\", "/")
fasta_filename = "Homo_sapiens.GRCh38.dna.primary_assembly.fa" 

TMP_VCF = os.path.join(workdir, "tmp_input.vcf")
TMP_VEP_OUT = os.path.join(workdir, "tmp_output.txt")

# ==============================
# Ghi file VCF tạm vào thư mục mountable (workdir)
# ==============================
print("Đang tạo file VCF tạm...")
with open(TMP_VCF, "w", newline="") as f:
    f.write("##fileformat=VCFv4.2\n")
    f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
    for _, row in df.iterrows():
        chrom = row["CHROM"]
        pos = int(row["POS"])
        ref = row["REF"]
        alt = row["ALT"]
        vid = row["ID"] if "ID" in row and pd.notna(row["ID"]) else "."
        f.write(f"{chrom}\t{pos}\t{vid}\t{ref}\t{alt}\t.\t.\t.\n")
print(f"Đã ghi file VCF tạm: {TMP_VCF}")

if not os.path.exists(TMP_VCF):
    print("File VCF tạm không tồn tại trên host. Dừng.")
    sys.exit(1)

print("Mounting host_workdir =", host_workdir)
print("Mounting host_cache  =", host_cache)

# ==============================
# Chạy VEP bằng Docker (với path đã chuẩn hóa)
# ==============================
vep_cmd = [
    "docker", "run", "--rm",
    "-v", f"{host_workdir}:/input",
    "-v", f"{host_cache}:/opt/vep/.vep",
    "ensemblorg/ensembl-vep",
    "vep",
    "-i", f"/input/{os.path.basename(TMP_VCF)}",
    "-o", f"/input/{os.path.basename(TMP_VEP_OUT)}",
    "--assembly", ASSEMBLY,
    "--cache", "--offline",
    "--fasta", f"/input/{fasta_filename}",
    "--tab",
    "--hgvs",
    "--pick",
    "--canonical",
    "--protein",
    "--fields", "Uploaded_variation,Consequence,HGVSc,Feature,Protein_id,Protein_position,Amino_acids,Codons",
    "--force_overwrite"
]

print("Đang chạy Ensembl VEP trong Docker...")
try:
    proc = subprocess.run(vep_cmd, check=True, capture_output=True, text=True)
    print("VEP chạy thành công!")
except subprocess.CalledProcessError as e:
    print("Lỗi khi chạy VEP:")
    print("STDERR:")
    print(e.stderr.strip() or "<empty>")
    print("STDOUT:")
    print(e.stdout.strip() or "<empty>")
    raise SystemExit("Dừng do lỗi VEP")

Đang tạo file VCF tạm...
Đã ghi file VCF tạm: d:\Biosequence\tmp_input.vcf
Mounting host_workdir = d:/Biosequence
Mounting host_cache  = d:/Biosequence/vep_cache
Đang chạy Ensembl VEP trong Docker...
VEP chạy thành công!


In [3]:
# ==============================
# Đọc kết quả VEP và merge
# ==============================
print("Đang đọc kết quả VEP...")
if not os.path.exists(TMP_VEP_OUT):
    print("File kết quả VEP không thấy trên host:", TMP_VEP_OUT)
    sys.exit(1)

# Tìm dòng bắt đầu bằng '#Uploaded_variation' làm header
with open(TMP_VEP_OUT) as f:
    lines = [line.strip() for line in f if line.strip() and not line.startswith("##")]
header_line = [l for l in lines if l.startswith("#Uploaded_variation")][0]
cols = header_line.lstrip("#").split("\t")
data_lines = [l for l in lines if not l.startswith("#")]
vep_df = pd.read_csv(StringIO("\n".join(data_lines)), sep="\t", names=cols)
vep_df = vep_df.dropna(subset=["Uploaded_variation"])

df["Uploaded_variation"] = (
    df["CHROM"].astype(str) + "_" +
    df["POS"].astype(str) + "_" +
    df["REF"].astype(str) + "/" +
    df["ALT"].astype(str)
)
vep_df["Uploaded_variation"] = vep_df["Uploaded_variation"].astype(str)

print("Đang merge dữ liệu...")
merged = df.merge(vep_df, on="Uploaded_variation", how="left")

print("Ghi file kết quả:", OUTPUT_CSV)
merged.to_csv(OUTPUT_CSV, index=False)
print("Hoàn tất! Tổng số variant:", len(merged))
merged

Đang đọc kết quả VEP...
Đang merge dữ liệu...
Ghi file kết quả: variant_annotated_official.csv
Hoàn tất! Tổng số variant: 3875335


Unnamed: 0,AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,PhenotypeIDS,PhenotypeList,...,REF,ALT,Uploaded_variation,Consequence,HGVSc,Feature,Protein_id,Protein_position,Amino_acids,Codons
0,3544463,Deletion,NM_001005484.2(OR4F5):c.9+1354del,79501,OR4F5,HGNC:14825,Uncertain significance,0,"Human Phenotype Ontology:HP:0000547,MONDO:MOND...",Retinitis pigmentosa,...,AG,A,chr1_66926_AG/A,,,,,,,
1,2193183,single nucleotide variant,NM_001005484.2(OR4F5):c.107A>G (p.Glu36Gly),79501,OR4F5,HGNC:14825,Likely benign,0,MedGen:CN169374,not specified,...,A,G,chr1_69134_A/G,missense_variant,ENST00000641515.2:c.107A>G,ENST00000641515,-,36,E/G,gAa/gGa
2,4039319,single nucleotide variant,NM_001005484.2(OR4F5):c.281A>G (p.Lys94Arg),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,A,G,chr1_69308_A/G,missense_variant,ENST00000641515.2:c.281A>G,ENST00000641515,-,94,K/R,aAg/aGg
3,3374047,single nucleotide variant,NM_001005484.2(OR4F5):c.287T>G (p.Ile96Ser),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,T,G,chr1_69314_T/G,missense_variant,ENST00000641515.2:c.287T>G,ENST00000641515,-,96,I/S,aTt/aGt
4,4039320,single nucleotide variant,NM_001005484.2(OR4F5):c.377T>C (p.Met126Thr),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,T,C,chr1_69404_T/C,missense_variant,ENST00000641515.2:c.377T>C,ENST00000641515,-,126,M/T,aTg/aCg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3875330,706348,single nucleotide variant,NM_002186.3(IL9R):c.991G>A (p.Gly331Arg),3581,IL9R,HGNC:6030,Benign,0,MedGen:C3661900,not provided,...,G,A,chrY_57196354_G/A,missense_variant,ENST00000711289.2:c.991G>A,ENST00000711289,-,331,G/R,Ggg/Agg
3875331,706349,single nucleotide variant,NM_002186.3(IL9R):c.1094G>A (p.Arg365His),3581,IL9R,HGNC:6030,Benign,0,MedGen:C3661900,not provided,...,G,A,chrY_57196457_G/A,missense_variant,ENST00000711289.2:c.1094G>A,ENST00000711289,-,365,R/H,cGt/cAt
3875332,4197692,single nucleotide variant,NC_000023.11:g.156022415C>A,-1,,,Likely benign,0,MedGen:C3661900,not provided,...,C,A,chrY_57208935_C/A,synonymous_variant,ENST00000711285.2:c.540C>A,ENST00000711285,-,180,I,atC/atA
3875333,3184976,single nucleotide variant,NC_000023.11:g.156023032T>C,-1,,,Likely benign,0,MedGen:C3661900,not provided,...,T,C,chrY_57209552_T/C,synonymous_variant,ENST00000711285.2:c.741T>C,ENST00000711285,-,247,Y,taT/taC


In [4]:
merged['Consequence'].value_counts()

Consequence
missense_variant                                                                                            2046055
synonymous_variant                                                                                           633594
intron_variant                                                                                               269068
downstream_gene_variant                                                                                      120652
splice_polypyrimidine_tract_variant,intron_variant                                                            80551
                                                                                                             ...   
splice_donor_variant,coding_sequence_variant,3_prime_UTR_variant                                                  1
splice_acceptor_variant,splice_donor_5th_base_variant,splice_polypyrimidine_tract_variant,intron_variant          1
splice_region_variant,intron_variant,NMD_transcript_variant 

In [5]:
merged.isnull().sum()

AlleleID                     0
Type                         0
Name                         0
GeneID                       0
GeneSymbol                 773
HGNC_ID                   5232
ClinicalSignificance         0
ClinSigSimple                0
PhenotypeIDS             56639
PhenotypeList             2973
Origin                       0
OriginSimple                 0
Assembly                     0
ChromosomeAccession          0
CHROM                        0
Start                        0
Stop                         0
ReviewStatus                 0
VariationID                  0
POS                          0
REF                          0
ALT                          0
Uploaded_variation           0
Consequence             258320
HGVSc                   258320
Feature                 258320
Protein_id              258320
Protein_position        258320
Amino_acids             258323
Codons                  258320
dtype: int64

In [6]:
merged = merged.dropna(subset=['Consequence'])
merged.isnull().sum()

AlleleID                    0
Type                        0
Name                        0
GeneID                      0
GeneSymbol                635
HGNC_ID                  4512
ClinicalSignificance        0
ClinSigSimple               0
PhenotypeIDS            51711
PhenotypeList            2652
Origin                      0
OriginSimple                0
Assembly                    0
ChromosomeAccession         0
CHROM                       0
Start                       0
Stop                        0
ReviewStatus                0
VariationID                 0
POS                         0
REF                         0
ALT                         0
Uploaded_variation          0
Consequence                 0
HGVSc                       0
Feature                     0
Protein_id                  0
Protein_position            0
Amino_acids                 3
Codons                      0
dtype: int64

In [7]:
merged = merged.dropna(subset=['HGVSc'])
merged.isnull().sum()

AlleleID                    0
Type                        0
Name                        0
GeneID                      0
GeneSymbol                635
HGNC_ID                  4512
ClinicalSignificance        0
ClinSigSimple               0
PhenotypeIDS            51711
PhenotypeList            2652
Origin                      0
OriginSimple                0
Assembly                    0
ChromosomeAccession         0
CHROM                       0
Start                       0
Stop                        0
ReviewStatus                0
VariationID                 0
POS                         0
REF                         0
ALT                         0
Uploaded_variation          0
Consequence                 0
HGVSc                       0
Feature                     0
Protein_id                  0
Protein_position            0
Amino_acids                 3
Codons                      0
dtype: int64

In [8]:
merged.to_csv(r'variant_annotated_official_clean.csv', index=False)