In [None]:
import pandas as pd
import subprocess
import os
import sys
from io import StringIO

# ==============================
# CONFIG
# ==============================
INPUT_TSV = r"C:\Users\dotru\STUDIE\FPTU\AiTA_Lab\dataset1\gnomAD\gnomad_chry_parsed.tsv"
OUTPUT_TSV = r"C:\Users\dotru\STUDIE\FPTU\AiTA_Lab\dataset1\gnomAD\gnomad_chry_mapped.tsv"
CACHE_DIR = r"C:\Users\dotru\STUDIE\FPTU\AiTA_Lab\vep_cache"  # directory holding homo_sapiens/115_GRCh38
ASSEMBLY = "GRCh38"
# ==============================

# Load the parsed variant table (tab-separated).
print(f"Đọc file TSV: {INPUT_TSV}")
df = pd.read_csv(INPUT_TSV, delimiter="\t")

# The VCF written later needs these four columns; fail early if any is absent.
required_cols = {"CHROM", "POS", "REF", "ALT"}
missing_cols = required_cols - set(df.columns)
if missing_cols:
    raise ValueError(f"File TSV thiếu các cột: {missing_cols}")

# workdir = the directory containing the input TSV; it is mounted at /input in
# the container. Backslashes are turned into forward slashes for `docker -v`.
workdir = os.path.split(os.path.abspath(INPUT_TSV))[0]
host_workdir = "/".join(workdir.split("\\"))
host_cache = "/".join(os.path.abspath(CACHE_DIR).split("\\"))

# Temporary VEP input/output files live next to the input TSV so that the
# container can reach them through the /input mount.
TMP_VCF, TMP_VEP_OUT = (
    os.path.join(workdir, file_name)
    for file_name in ("tmp_input.vcf", "tmp_output.txt")
)

# ==============================
# Write the temporary VCF into the mountable directory (workdir)
# ==============================
# Every record gets an explicit ID of the form CHROM_POS_REF/ALT, built with
# the same string conversion the merge step uses for its join key. VEP reports
# a non-missing VCF ID verbatim as `Uploaded_variation`, so annotations can be
# joined back onto `df` by that key. With ID "." VEP generates the identifier
# itself, and for indels it is based on VEP's trimmed alleles / shifted
# position, which does not equal CHROM_POS_REF/ALT (see the Ensembl VEP
# input-format documentation).
print("Đang tạo file VCF tạm...")
variant_ids = (
    df["CHROM"].astype(str) + "_" +
    df["POS"].astype(str) + "_" +
    df["REF"].astype(str) + "/" +
    df["ALT"].astype(str)
)
# newline="" keeps "\n" line endings on Windows; the content is plain ASCII.
with open(TMP_VCF, "w", newline="", encoding="utf-8") as f:
    f.write("##fileformat=VCFv4.2\n")
    f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
    # zip over the columns instead of DataFrame.iterrows(): iterrows builds a
    # Series per row, which is very slow on a table with millions of rows.
    f.writelines(
        f"{chrom}\t{int(pos)}\t{vid}\t{ref}\t{alt}\t.\t.\t.\n"
        for chrom, pos, vid, ref, alt in zip(
            df["CHROM"], df["POS"], variant_ids, df["REF"], df["ALT"]
        )
    )
print(f"Đã ghi file VCF tạm: {TMP_VCF}")

# Sanity check before handing the file to Docker.
if not os.path.exists(TMP_VCF):
    print("File VCF tạm không tồn tại trên host. Dừng.")
    sys.exit(1)

# Show the host paths that will be bind-mounted into the container.
print("Mounting host_workdir =", host_workdir)
print("Mounting host_cache  =", host_cache)

# ==============================
# Run VEP in Docker (using the normalised host paths)
# ==============================
# The working directory is mounted at /input and the VEP cache at
# /opt/vep/.vep; VEP reads the temporary VCF and writes tab-delimited output
# back into the same mount.
vep_cmd = [
    "docker", "run", "--rm",
    "-v", f"{host_workdir}:/input",
    "-v", f"{host_cache}:/opt/vep/.vep",
    "ensemblorg/ensembl-vep",
    "vep",
    "-i", f"/input/{os.path.basename(TMP_VCF)}",
    "-o", f"/input/{os.path.basename(TMP_VEP_OUT)}",
    "--assembly", ASSEMBLY,
    "--cache", "--offline",
    "--symbol",
    "--check_existing",
    "--tab",
    "--fields", "Uploaded_variation,Location,Allele,Gene,Feature,Consequence,CLIN_SIG,Existing_variation",
    "--force_overwrite"
]

print("Đang chạy Ensembl VEP trong Docker...")
try:
    # Decode the captured streams as UTF-8 with replacement: plain text=True
    # uses the locale code page (e.g. cp1252 on Windows), which can raise
    # UnicodeDecodeError on bytes it cannot map.
    proc = subprocess.run(
        vep_cmd,
        check=True,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    print("VEP chạy thành công!")
    # capture_output hides VEP's warnings; surface them even on success.
    vep_warnings = (proc.stderr or "").strip()
    if vep_warnings:
        print("STDERR:")
        print(vep_warnings)
except FileNotFoundError as e:
    # Raised when the `docker` executable itself cannot be found.
    raise SystemExit("Không tìm thấy lệnh 'docker' trên PATH. Hãy cài đặt/khởi động Docker rồi chạy lại.") from e
except subprocess.CalledProcessError as e:
    print("Lỗi khi chạy VEP:")
    print("STDERR:")
    print((e.stderr or "").strip() or "<empty>")
    print("STDOUT:")
    print((e.stdout or "").strip() or "<empty>")
    print("\nGợi ý debug: chạy thủ công trong terminal:")
    # Mount arguments are quoted so the hint still works if a path has spaces.
    print(f'docker run --rm -v "{host_workdir}:/input" -v "{host_cache}:/opt/vep/.vep" ensemblorg/ensembl-vep ls -la /input')
    raise SystemExit("Dừng do lỗi VEP")

In [None]:
# ==============================
# Read the VEP output and merge it onto the input variants
# ==============================
print("Đang đọc kết quả VEP...")
if not os.path.exists(TMP_VEP_OUT):
    print("File kết quả VEP không thấy trên host:", TMP_VEP_OUT)
    sys.exit(1)

# Skip the leading metadata lines until the '#Uploaded_variation' header, then
# let pandas parse the rest straight from the same file handle. This avoids
# holding the whole file in memory as a list of lines plus a joined string
# before the DataFrame is even built.
with open(TMP_VEP_OUT, encoding="utf-8") as f:
    cols = None
    for line in iter(f.readline, ""):
        if line.startswith("#Uploaded_variation"):
            cols = line.strip().lstrip("#").split("\t")
            break
    if cols is None:
        raise ValueError(
            f"Không tìm thấy dòng header '#Uploaded_variation' trong {TMP_VEP_OUT}"
        )
    vep_df = pd.read_csv(f, sep="\t", names=cols)
vep_df = vep_df.dropna(subset=["Uploaded_variation"])

# Join key: CHROM_POS_REF/ALT, compared as strings on both sides.
df["Uploaded_variation"] = (
    df["CHROM"].astype(str) + "_" +
    df["POS"].astype(str) + "_" +
    df["REF"].astype(str) + "/" +
    df["ALT"].astype(str)
)
vep_df["Uploaded_variation"] = vep_df["Uploaded_variation"].astype(str)

# A left merge keeps every input variant, so variants whose key is absent from
# the VEP output silently end up with empty annotation columns. Report how many
# there are instead of hiding them.
# NOTE(review): one likely cause is VEP generating its own identifier when the
# VCF ID is "." (indel alleles/positions are rewritten) -- confirm against the
# VEP documentation for the version in use.
has_annotation = df["Uploaded_variation"].isin(vep_df["Uploaded_variation"])
n_unmatched = int((~has_annotation).sum())
if n_unmatched:
    print(f"Cảnh báo: {n_unmatched} / {len(df)} variant không khớp với dòng nào trong kết quả VEP")

print("Đang merge dữ liệu...")
merged = df.merge(vep_df, on="Uploaded_variation", how="left")

print("Ghi file kết quả:", OUTPUT_TSV)
merged.to_csv(OUTPUT_TSV, sep="\t", index=False)
# len(merged) counts merged rows: a key that occurs several times in the VEP
# output yields one merged row per occurrence.
print("Hoàn tất! Tổng số variant:", len(merged))

Đang đọc kết quả VEP...
Đang merge dữ liệu...
Ghi file kết quả: C:\Users\dotru\STUDIE\FPTU\AiTA_Lab\dataset1\gnomAD\gnomad_chry_mapped.tsv
Hoàn tất! Tổng số variant: 4994492


In [None]:
# Reload the merged table from disk so the checks below inspect exactly what
# was written to OUTPUT_TSV (read_table defaults to a tab separator).
df_mapped = pd.read_table(OUTPUT_TSV)
df_mapped

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,AC_joint,AN_joint,AF_joint,...,faf95_genomes,faf99_genomes,Uploaded_variation,Location,Allele,Gene,Feature,Consequence,CLIN_SIG,Existing_variation
0,chrY,2781489,.,C,T,.,GENOMES_FILTERED,0,2716,0.0,...,0.0,0.0,chrY_2781489_C/T,chrY:2781489,T,ENSG00000251841,ENST00000516032,upstream_gene_variant,-,-
1,chrY,2781489,.,C,T,.,GENOMES_FILTERED,0,2716,0.0,...,0.0,0.0,chrY_2781489_C/T,chrY:2781489,T,ENSG00000290840,ENST00000679518,"intron_variant,non_coding_transcript_variant",-,-
2,chrY,2781489,.,C,T,.,GENOMES_FILTERED,0,2716,0.0,...,0.0,0.0,chrY_2781489_C/T,chrY:2781489,T,ENSG00000290840,ENST00000679825,"intron_variant,non_coding_transcript_variant",-,-
3,chrY,2781489,.,C,T,.,GENOMES_FILTERED,0,2716,0.0,...,0.0,0.0,chrY_2781489_C/T,chrY:2781489,T,ENSG00000290840,ENST00000680285,"intron_variant,non_coding_transcript_variant",-,-
4,chrY,2781489,.,C,T,.,GENOMES_FILTERED,0,2716,0.0,...,0.0,0.0,chrY_2781489_C/T,chrY:2781489,T,ENSG00000290840,ENST00000680845,"intron_variant,non_coding_transcript_variant",-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994487,chrY,56887853,.,G,T,.,GENOMES_FILTERED,0,594,0.0,...,0.0,0.0,chrY_56887853_G/T,chrY:56887853,T,-,-,intergenic_variant,-,rs1602947650
4994488,chrY,56887855,.,T,C,.,GENOMES_FILTERED,0,549,0.0,...,0.0,0.0,chrY_56887855_T/C,chrY:56887855,C,-,-,intergenic_variant,-,-
4994489,chrY,56887856,.,T,A,.,GENOMES_FILTERED,0,519,0.0,...,0.0,0.0,chrY_56887856_T/A,chrY:56887856,A,-,-,intergenic_variant,-,-
4994490,chrY,56887899,.,T,C,.,GENOMES_FILTERED,0,0,,...,0.0,0.0,chrY_56887899_T/C,chrY:56887899,C,-,-,intergenic_variant,-,-


In [None]:
# Number of rows that are exact copies (across all columns) of an earlier row.
df_mapped.duplicated(keep="first").sum()

0

In [None]:
# Per-column count of missing values in the merged table.
df_mapped.isna().sum()

CHROM                      0
POS                        0
ID                         0
REF                        0
ALT                        0
QUAL                       0
FILTER                     0
AC_joint                   0
AN_joint                   0
AF_joint                9275
AC_genomes                 0
AN_genomes                 0
AF_genomes             14830
AC_genomes_raw             0
AF_genomes_raw            58
faf95_genomes         822074
faf99_genomes         822074
Uploaded_variation         0
Location              154712
Allele                154712
Gene                  154712
Feature               154712
Consequence           154712
CLIN_SIG              154712
Existing_variation    154712
dtype: int64

In [None]:
# Frequency of each CLIN_SIG value. NaN rows are excluded (value_counts drops
# them by default), while the literal "-" is counted as its own category.
df_mapped.loc[:, "CLIN_SIG"].value_counts(dropna=True)

-                         4839421
likely_benign                 182
uncertain_significance         92
benign                         49
pathogenic                     33
benign,likely_benign            3
Name: CLIN_SIG, dtype: int64