In [1]:
import pandas as pd
from pyfaidx import Fasta
import re
import sys

# --- 1. C·∫•u h√¨nh ---
fasta_path = r"D:\Study\5-FA25\AiTa_Lab_Research\Map\Homo_sapiens.GRCh38.dna.primary_assembly.fa"
annotated_variant_file = r"D:\Study\5-FA25\AiTa_Lab_Research\variant_annotated_official_clean.csv"
output_file = r"D:\Study\5-FA25\AiTa_Lab_Research\variant_sequence_positive.csv"

WINDOW = 300
TARGET_LEN = (WINDOW * 2) + 1
PAD_CHAR = 'N'

In [2]:
# --- 2. C√°c h√†m ti·ªán √≠ch ---
def normalize_chrom(chrom):
    """Chu·∫©n h√≥a t√™n chromosome."""
    chrom = str(chrom).strip()
    if chrom.startswith("NC_"):
        num = chrom.split("_")[-1].split(".")[0]
        if num == "000001": return "1"
        elif num == "000002": return "2"
        elif num == "000023": return "X"
        elif num == "000024": return "Y"
        elif num == "000025": return "MT"
        else:
            try:
                num_int = int(num)
                return str(num_int)
            except:
                return chrom
    return chrom.replace("chr", "")

def normalize_chromosome_output(chrom_name):
    """
    D√πng ƒë·ªÉ chu·∫©n h√≥a t√™n chromosome tr∆∞·ªõc khi xu·∫•t file CSV.
    Chuy·ªÉn v·ªÅ d·∫°ng UCSC (chr1, chr2...) v√† lo·∫°i b·ªè contig r√°c.
    """
    chrom_str = str(chrom_name).strip()

    # L·ªçc b·ªè c√°c contig kh√¥ng x√°c ƒë·ªãnh (Unplaced/random/alt)
    if 'Un' in chrom_str or 'random' in chrom_str or 'alt' in chrom_str:
        return None # S·∫Ω tr·∫£ v·ªÅ None ƒë·ªÉ ta drop h√†ng n√†y sau

    # X·ª≠ l√Ω c√°c tr∆∞·ªùng h·ª£p ƒë·∫∑c bi·ªát (MT -> chrM)
    if chrom_str.upper() == 'MT':
        return 'chrM'

    # N·∫øu n√≥ ƒë√£ c√≥ 'chr' r·ªìi th√¨ th√¥i
    if chrom_str.startswith('chr'):
        return chrom_str

    # N·∫øu n√≥ l√† s·ªë (1, 2, X, Y) th√¨ th√™m 'chr'
    return 'chr' + chrom_str

def parse_hgvsc_offset(hgvsc_string):
    """H∆∞·ªõng 1: Regex ch·∫∑t ch·∫Ω h∆°n, y√™u c·∫ßu d·∫•u +/- ph·∫£i ƒë·ª©ng sau 1 con s·ªë"""
    if not isinstance(hgvsc_string, str):
        return None
    
    # Regex gi·∫£i th√≠ch:
    # c\.       : B·∫Øt ƒë·∫ßu b·∫±ng c.
    # .*?       : C√°c k√Ω t·ª± ·ªü gi·ªØa
    # (?<=\d|\*) : LOOKBEHIND - K√Ω t·ª± ƒë·ª©ng ngay tr∆∞·ªõc d·∫•u +/- PH·∫¢I l√† s·ªë ho·∫∑c d·∫•u * (cho 3'UTR)
    # ([+-])    : Nh√≥m 1 (D·∫•u)
    # (\d+)     : Nh√≥m 2 (Gi√° tr·ªã Offset)
    match = re.search(r'c\..*?(?<=\d|\*)([+-])(\d+)', hgvsc_string)
    
    if match:
        sign = match.group(1)
        value = int(match.group(2))
        return -value if sign == '-' else value
    return None

def get_ref_seq(genome, chrom, center_1based, window):
    """L·∫•y chu·ªói DNA 601bp, t·ª± ƒë·ªông pad 'N'."""
    target_len = (window * 2) + 1
    start_0based = center_1based - 1 - window
    end_0based = center_1based + window 
    
    try:
        seq = genome[chrom][start_0based:end_0based].upper()
        pad_left = max(0, -start_0based)
        pad_right = max(0, end_0based - len(genome[chrom]))
        final_seq = (PAD_CHAR * pad_left) + seq + (PAD_CHAR * pad_right)
        
        if len(final_seq) != target_len:
             print(f"L·ªói ƒë·ªô d√†i khi get_ref_seq: {chrom}:{center_1based}. D·ª± ki·∫øn {target_len}, nh·∫≠n {len(final_seq)}")
             return PAD_CHAR * target_len
        return final_seq
        
    except Exception as e:
        print(f"L·ªói get_ref_seq: {e} t·∫°i {chrom}:{center_1based}")
        return PAD_CHAR * target_len

def verify_ref_seq_center(genome, chrom, canonical_center, ref_seq, window):
    """(H√†m ki·ªÉm tra) X√°c minh base ·ªü t√¢m `ref_seq` kh·ªõp v·ªõi FASTA."""
    try:
        truth_base = genome[chrom][canonical_center - 1].upper()
        test_base = ref_seq[window].upper()
        if truth_base == test_base:
            return True
        else:
            print(f"--- ‚ö†Ô∏è L·ªñI CƒÇN GI·ªÆA! ---")
            print(f"  T·ªça ƒë·ªô chu·∫©n: {chrom}:{canonical_center}")
            print(f"  Base 'S·ª± th·∫≠t' t·ª´ FASTA: {truth_base}")
            print(f"  Base 'Ki·ªÉm tra' t·∫°i t√¢m ref_seq[300]: {test_base}")
            return False
    except Exception as e:
        print(f"L·ªñI KI·ªÇM TRA (Exception): {e} t·∫°i {chrom}:{canonical_center}")
        return False

def normalize_centered_sequence(seq, center_index, target_len, pad_char='N' ):
    """Logic "C·∫Øt/ƒê·ªám ƒê·ªëi X·ª©ng" (Symmetric Crop/Pad)."""
    window = target_len // 2
    start = center_index - window
    end = center_index + window + 1
    pad_left = max(0, -start)
    pad_right = max(0, end - len(seq))
    crop_left = max(0, start)
    crop_right = min(len(seq), end)
    
    final_seq = (pad_char * pad_left) + seq[crop_left:crop_right] + (pad_char * pad_right)
    
    if len(final_seq) > target_len:
        over = len(final_seq) - target_len
        final_seq = final_seq[over//2 : -(over - over//2)]
    if len(final_seq) < target_len:
        needed = target_len - len(final_seq)
        final_seq += pad_char * needed
            
    return final_seq

In [3]:
# --- 3. T·∫£i D·ªØ li·ªáu ---
print(f"üß¨ ƒêang t·∫£i FASTA: {fasta_path}")
try:
    genome = Fasta(fasta_path, as_raw=True, sequence_always_upper=True)
except Exception as e:
    print(f"L·ªói nghi√™m tr·ªçng: Kh√¥ng th·ªÉ t·∫£i file FASTA. L·ªói: {e}")
    sys.exit(1)

print(f"üìä ƒêang ƒë·ªçc file VEP CSV: {annotated_variant_file}")
try:
    df = pd.read_csv(annotated_variant_file)
except Exception as e:
    print(f"L·ªói nghi√™m tr·ªçng: Kh√¥ng th·ªÉ ƒë·ªçc file CSV. L·ªói: {e}")
    sys.exit(1)

# Kh·ªüi t·∫°o c√°c c·ªôt m·ªõi
df['canonical_center'] = None
df['ref_seq'] = None
df['alt_seq'] = None

count_success = 0
count_fail = 0

üß¨ ƒêang t·∫£i FASTA: D:\Study\5-FA25\AiTa_Lab_Research\Map\Homo_sapiens.GRCh38.dna.primary_assembly.fa
üìä ƒêang ƒë·ªçc file VEP CSV: D:\Study\5-FA25\AiTa_Lab_Research\variant_annotated_official_clean.csv


In [4]:
# --- 4. V√≤ng l·∫∑p X·ª≠ l√Ω ---
total_rows = len(df)
print(f"‚öôÔ∏è B·∫Øt ƒë·∫ßu x·ª≠ l√Ω {total_rows} bi·∫øn th·ªÉ...")
for index, row in df.iterrows():
    if (index + 1) % 1000 == 0:
        print(f"  ...ƒê√£ x·ª≠ l√Ω {index + 1} / {total_rows}...")

    try:
        # 1. ƒê·ªçc d·ªØ li·ªáu (n·∫øu ƒë√£ qua filter)
        chrom = normalize_chrom(row['CHROM'])
        pos = int(row['POS'])
        ref_vcf = str(row['REF']).upper()
        alt_vcf = str(row['ALT']).upper()
        hgvsc_str = str(row['HGVSc'])
        consequence_str = str(row['Consequence'])
        # L·∫•y nh√£n ƒë·∫ßu ti√™n (nghi√™m tr·ªçng nh·∫•t)
        first_consequence = consequence_str.split(',')[0]
        
        # 2. T√¨m T√¢m Chu·∫©n (Canonical Center)
        offset = parse_hgvsc_offset(hgvsc_str)
        canonical_center = 0

        if first_consequence == 'splice_donor_variant' and offset is not None:
            canonical_center = pos - (offset - 1)
        elif first_consequence == 'splice_acceptor_variant' and offset is not None:
            canonical_center = pos - (offset + 1)
        else:
            # Missense, UTR, ho·∫∑c Splicing b·ªã l·ªói offset -> D√πng POS l√†m t√¢m
            canonical_center = pos
        
        # 3. T·∫°o ref_seq
        ref_seq = get_ref_seq(genome, chrom, canonical_center, WINDOW)
        
        if not verify_ref_seq_center(genome, chrom, canonical_center, ref_seq, WINDOW):
            print(f"  C·∫¢NH B√ÅO: B·ªè qua {chrom}:{pos} do l·ªói x√°c minh cƒÉn gi·ªØa.")
            count_fail += 1
            continue
            
        # 4. T·∫°o alt_seq
        # T√≠nh v·ªã tr√≠ t∆∞∆°ng ƒë·ªëi c·ªßa bi·∫øn th·ªÉ so v·ªõi t√¢m m·ªõi
        relative_pos = WINDOW + (pos - canonical_center)
        # Gh√©p chu·ªói ALT
        alt_seq_dynamic = ref_seq[:relative_pos] + alt_vcf + ref_seq[relative_pos + len(ref_vcf):]
        indel_len_change = len(alt_vcf) - len(ref_vcf)
        alt_center_index = WINDOW
        # N·∫øu bi·∫øn th·ªÉ ch√≠nh l√† t√¢m (Missense) -> relative_pos = WINDOW -> alt_center = WINDOW + change
        if relative_pos <= WINDOW:
            alt_center_index = WINDOW + indel_len_change
            
        final_alt_seq = normalize_centered_sequence(
            alt_seq_dynamic, 
            alt_center_index, 
            TARGET_LEN, 
            PAD_CHAR
        )

        # 5. C·∫≠p nh·∫≠t v√†o DataFrame
        df.at[index, 'canonical_center'] = canonical_center
        df.at[index, 'ref_seq'] = ref_seq
        df.at[index, 'alt_seq'] = final_alt_seq
        
        count_success += 1
        
    except Exception as e:
        print(f"L·ªói x·ª≠ l√Ω h√†ng {index}: {e}. D·ªØ li·ªáu h√†ng: {row.to_dict()}")

‚öôÔ∏è B·∫Øt ƒë·∫ßu x·ª≠ l√Ω 3617015 bi·∫øn th·ªÉ...
  ...ƒê√£ x·ª≠ l√Ω 1000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 2000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 3000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 4000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 5000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 6000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 7000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 8000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 9000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 10000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 11000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 12000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 13000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 14000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 15000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 16000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 17000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 18000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 19000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 20000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 21000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 22000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 23000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 24000 / 3617015...
  ...ƒê√£ x·ª≠ l√Ω 25000 / 3617015...
  ...ƒ

In [5]:
# --- 5. H·∫≠u x·ª≠ l√Ω v√† L∆∞u file ---
print("\nüßπ ƒêang l√†m s·∫°ch d·ªØ li·ªáu cu·ªëi c√πng...")


üßπ ƒêang l√†m s·∫°ch d·ªØ li·ªáu cu·ªëi c√πng...


In [6]:
# Check data missing
print('S·ªë missing value sau khi map: \n')
df.isnull().sum()

S·ªë missing value sau khi map: 



AlleleID                    0
Type                        0
Name                        0
GeneID                      0
GeneSymbol                635
HGNC_ID                  4512
ClinicalSignificance        0
ClinSigSimple               0
PhenotypeIDS            51711
PhenotypeList            2652
Origin                      0
OriginSimple                0
Assembly                    0
ChromosomeAccession         0
CHROM                       0
Start                       0
Stop                        0
ReviewStatus                0
VariationID                 0
POS                         0
REF                         0
ALT                         0
Uploaded_variation          0
Consequence                 0
HGVSc                       0
Feature                     0
Protein_id                  0
Protein_position            0
Amino_acids                 3
Codons                      0
canonical_center         3004
ref_seq                  3004
alt_seq                  3004
dtype: int

In [7]:
# 5.1. Lo·∫°i b·ªè nh·ªØng h√†ng kh√¥ng t·∫°o ƒë∆∞·ª£c sequence (gi√° tr·ªã v·∫´n l√† None)
df_final = df.dropna(subset=['ref_seq'])
# 5.2. Chu·∫©n h√≥a t√™n Chromosome output (h√†m c·ªßa b·∫°n)
df_final['CHROM'] = df_final['CHROM'].apply(normalize_chromosome_output)
# 5.3. Lo·∫°i b·ªè c√°c h√†ng CHROM b·ªã None (do Un/random/alt)
before_clean = len(df_final)
df_final = df_final.dropna(subset=['CHROM'])
after_clean = len(df_final)

print(f" - T·ªïng input: {total_rows}")
print(f" - Th√†nh c√¥ng: {count_success}")
print(f" - Th·∫•t b·∫°i/L·ªói Verify: {count_fail}")
print(f" - Lo·∫°i b·ªè do nhi·ªÖm s·∫Øc th·ªÉ r√°c: {before_clean - after_clean}")
print(f"‚úÖ T·ªïng s·ªë h√†ng h·ª£p l·ªá cu·ªëi c√πng: {len(df_final)}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['CHROM'] = df_final['CHROM'].apply(normalize_chromosome_output)


 - T·ªïng input: 3617015
 - Th√†nh c√¥ng: 3614011
 - Th·∫•t b·∫°i/L·ªói Verify: 3004
 - Lo·∫°i b·ªè do nhi·ªÖm s·∫Øc th·ªÉ r√°c: 0
‚úÖ T·ªïng s·ªë h√†ng h·ª£p l·ªá cu·ªëi c√πng: 3614011


In [8]:
df_final 

Unnamed: 0,AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,PhenotypeIDS,PhenotypeList,...,Consequence,HGVSc,Feature,Protein_id,Protein_position,Amino_acids,Codons,canonical_center,ref_seq,alt_seq
0,2193183,single nucleotide variant,NM_001005484.2(OR4F5):c.107A>G (p.Glu36Gly),79501,OR4F5,HGNC:14825,Likely benign,0,MedGen:CN169374,not specified,...,missense_variant,ENST00000641515.2:c.107A>G,ENST00000641515,-,36,E/G,gAa/gGa,69134,GAGACACTGAGAAGCCGAGATAACTGAATTATAAGGCATAGCCAGG...,GAGACACTGAGAAGCCGAGATAACTGAATTATAAGGCATAGCCAGG...
1,4039319,single nucleotide variant,NM_001005484.2(OR4F5):c.281A>G (p.Lys94Arg),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,missense_variant,ENST00000641515.2:c.281A>G,ENST00000641515,-,94,K/R,aAg/aGg,69308,CTGACTTCCTTCTCCTTCTCTTCTTCAAGGTAACTGCAGAGGCTAT...,CTGACTTCCTTCTCCTTCTCTTCTTCAAGGTAACTGCAGAGGCTAT...
2,3374047,single nucleotide variant,NM_001005484.2(OR4F5):c.287T>G (p.Ile96Ser),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,missense_variant,ENST00000641515.2:c.287T>G,ENST00000641515,-,96,I/S,aTt/aGt,69314,TCCTTCTCCTTCTCTTCTTCAAGGTAACTGCAGAGGCTATTTCCTG...,TCCTTCTCCTTCTCTTCTTCAAGGTAACTGCAGAGGCTATTTCCTG...
3,4039320,single nucleotide variant,NM_001005484.2(OR4F5):c.377T>C (p.Met126Thr),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,missense_variant,ENST00000641515.2:c.377T>C,ENST00000641515,-,126,M/T,aTg/aCg,69404,TCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCT...,TCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCT...
4,3374048,single nucleotide variant,NM_001005484.2(OR4F5):c.396G>A (p.Met132Ile),79501,OR4F5,HGNC:14825,Uncertain significance,0,MedGen:CN169374,not specified,...,missense_variant,ENST00000641515.2:c.396G>A,ENST00000641515,-,132,M/I,atG/atA,69423,TGATTCTCAGGAACTCCAGACCTTCCTATTTATGTTGTTTTTTGTA...,TGATTCTCAGGAACTCCAGACCTTCCTATTTATGTTGTTTTTTGTA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3617010,706348,single nucleotide variant,NM_002186.3(IL9R):c.991G>A (p.Gly331Arg),3581,IL9R,HGNC:6030,Benign,0,MedGen:C3661900,not provided,...,missense_variant,ENST00000711289.2:c.991G>A,ENST00000711289,-,331,G/R,Ggg/Agg,57196354,CATCTGTACCTCGGTTCAGGCTGCCGTGGGCACCAGGCCCTGTGCT...,CATCTGTACCTCGGTTCAGGCTGCCGTGGGCACCAGGCCCTGTGCT...
3617011,706349,single nucleotide variant,NM_002186.3(IL9R):c.1094G>A (p.Arg365His),3581,IL9R,HGNC:6030,Benign,0,MedGen:C3661900,not provided,...,missense_variant,ENST00000711289.2:c.1094G>A,ENST00000711289,-,365,R/H,cGt/cAt,57196457,ACATGGAGGAAAGACAGAATGTCCAAGACACAGGCGCTGCTTGGCC...,ACATGGAGGAAAGACAGAATGTCCAAGACACAGGCGCTGCTTGGCC...
3617012,4197692,single nucleotide variant,NC_000023.11:g.156022415C>A,-1,,,Likely benign,0,MedGen:C3661900,not provided,...,synonymous_variant,ENST00000711285.2:c.540C>A,ENST00000711285,-,180,I,atC/atA,57208935,GCACCGCCCCCTGGACGAGCGGGCCCTGCAGGTCTGCTGGCTGCGC...,GCACCGCCCCCTGGACGAGCGGGCCCTGCAGGTCTGCTGGCTGCGC...
3617013,3184976,single nucleotide variant,NC_000023.11:g.156023032T>C,-1,,,Likely benign,0,MedGen:C3661900,not provided,...,synonymous_variant,ENST00000711285.2:c.741T>C,ENST00000711285,-,247,Y,taT/taC,57209552,TGGTGCTGTAACAAAGACCCATGTGATGCTGGGGGCAGAGACAGAG...,TGGTGCTGTAACAAAGACCCATGTGATGCTGGGGGCAGAGACAGAG...


In [9]:
# L∆∞u K·∫øt qu·∫£
if not df_final.empty:
    print(f"üíæ ƒêang l∆∞u k·∫øt qu·∫£ v√†o: {output_file}")
    df_final.to_csv(output_file, index=False)
    print("üéâ Ho√†n th√†nh!")
else:
    print("‚ö†Ô∏è C·∫£nh b√°o: File k·∫øt qu·∫£ r·ªóng!")

üíæ ƒêang l∆∞u k·∫øt qu·∫£ v√†o: D:\Study\5-FA25\AiTa_Lab_Research\variant_sequence_positive.csv
üéâ Ho√†n th√†nh!
