In [1]:
import pandas as pd
from pyfaidx import Fasta
import random
from tqdm import tqdm
import os
import csv

# ================= 1. C·∫§U H√åNH H·ªÜ TH·ªêNG (QUAN TR·ªåNG) =================
# H√£y ƒë·∫£m b·∫£o ƒë∆∞·ªùng d·∫´n file ch√≠nh x√°c
GTF_FILE = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\gencode.v49.basic.annotation.gtf'      # File GTF
FASTA_FILE = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\Homo_sapiens.GRCh38.dna.primary_assembly.fa'   # File FASTA
OUTPUT_FILE = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv'       # File k·∫øt qu·∫£ (~9-10GB)

# C·∫•u h√¨nh Sequence
WINDOW = 300                # B√°n k√≠nh 200bp
SEQ_LEN = (WINDOW * 2) + 1  # T·ªïng ƒë·ªô d√†i 401bp (T√¢m t·∫°i index 200)
WRITE_BATCH_SIZE = 10000    # Ghi ƒëƒ©a m·ªói 10k d√≤ng ƒë·ªÉ ti·∫øt ki·ªám RAM

# C·∫•u h√¨nh T·ªâ l·ªá (Ratio)
# B·∫°n mu·ªën 100 Negative : 1 Donor : 1 Acceptor
# T·ª©c l√† Negative = 50 l·∫ßn (Donor + Acceptor)
NEGATIVE_MULTIPLIER = 50 

# Danh s√°ch nhi·ªÖm s·∫Øc th·ªÉ chu·∫©n (B·ªè qua c√°c m·∫£nh contig r√°c ƒë·ªÉ data s·∫°ch)
VALID_CHROMS = set([str(i) for i in range(1, 23)] + ['X', 'Y', 'M', 'MT'])

# ================= 2. C√ÅC H√ÄM X·ª¨ L√ù C·ªêT L√ïI =================

def normalize_chrom(chrom_name):
    """
    Chu·∫©n h√≥a t√™n nhi·ªÖm s·∫Øc th·ªÉ ƒë·ªÉ kh·ªõp gi·ªØa GTF v√† FASTA.
    Gi·∫£i quy·∫øt v·∫•n ƒë·ªÅ: 'chr1' vs '1', 'NC_000001' vs '1'.
    """
    s = str(chrom_name).strip()
    
    # X·ª≠ l√Ω d·∫°ng RefSeq (NC_...)
    if s.startswith("NC_"):
        try:
            val = int(s.split('_')[1].split('.')[0])
            if val == 23: return 'X'
            if val == 24: return 'Y'
            if val == 12920: return 'M'
            return str(val)
        except: return s
        
    # X·ª≠ l√Ω d·∫°ng UCSC (chr...)
    if s.lower().startswith('chr'):
        raw = s[3:]
        if raw == 'M': return 'M'
        return raw
        
    return s

def get_sequence_strict(genome, raw_chrom, center_pos, strand, window):
    """
    Tr√≠ch xu·∫•t sequence t·∫°i t√¢m (1-based).
    Y√™u c·∫ßu: Kh√¥ng padding, kh√¥ng 'N', ƒë·∫£o m·∫°ch n·∫øu l√† Strand (-).
    """
    norm_chrom = normalize_chrom(raw_chrom)
    
    # T√¨m key t∆∞∆°ng ·ª©ng trong file FASTA
    fasta_key = None
    if norm_chrom in genome: fasta_key = norm_chrom
    elif f"chr{norm_chrom}" in genome: fasta_key = f"chr{norm_chrom}"
    elif norm_chrom == 'M' and 'chrM' in genome: fasta_key = 'chrM'
    
    if fasta_key is None: return None # Kh√¥ng t√¨m th·∫•y chromosome

    # T√≠nh t·ªça ƒë·ªô 0-based
    start_idx = center_pos - 1 - window
    end_idx = center_pos + window
    
    # Check bi√™n gi·ªõi
    if start_idx < 0 or end_idx >= len(genome[fasta_key]): return None
        
    try:
        seq_obj = genome[fasta_key][start_idx : end_idx]
        seq = seq_obj.seq.upper()
        
        # Check ch·∫•t l∆∞·ª£ng
        if len(seq) != (window * 2 + 1): return None
        if 'N' in seq: return None # Lo·∫°i b·ªè sequence ch·ª©a N

        # ƒê·∫£o m·∫°ch (Reverse Complement) cho Strand (-)
        if strand == '-':
            mapping = str.maketrans("ATCG", "TAGC")
            seq = seq.translate(mapping)[::-1]
            
        return seq
    except: return None

In [2]:
# ================= 3. CH∆Ø∆†NG TR√åNH CH√çNH (STREAMING) =================

print(f"--- GENERATING MASSIVE DATASET (Ratio 100:1:1) ---")
print(f"Target: ~200k Donor, ~200k Acceptor, ~20M Negatives")

# 1. Load Resources
if not os.path.exists(FASTA_FILE):
    print(f"L·ªñI: Kh√¥ng t√¨m th·∫•y {FASTA_FILE}"); 
if not os.path.exists(GTF_FILE):
    print(f"L·ªñI: Kh√¥ng t√¨m th·∫•y {GTF_FILE}"); 
print("[1/4] Loading Genome (Lazy Load)...")
genome = Fasta(FASTA_FILE, sequence_always_upper=True)

--- GENERATING MASSIVE DATASET (Ratio 100:1:1) ---
Target: ~200k Donor, ~200k Acceptor, ~20M Negatives
[1/4] Loading Genome (Lazy Load)...


In [3]:
print("[2/4] Parsing GTF...")
# ƒê·ªçc GTF, l·ªçc l·∫•y Exon c·ªßa Protein Coding Genes
df = pd.read_csv(GTF_FILE, sep='\t', comment='#', header=None, usecols=[0, 2, 3, 4, 6, 8],
                    names=['chrom', 'feature', 'start', 'end', 'strand', 'attribute'])
df = df[(df['feature'] == 'exon') & (df['attribute'].str.contains('gene_type "protein_coding"'))]
print(f"      -> {len(df)} exons found.")

[2/4] Parsing GTF...
      -> 2241203 exons found.


In [4]:
# M·ªü file CSV ƒë·ªÉ ghi lu·ªìng (Stream Writing)
with open(OUTPUT_FILE, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'dna', 'label']) # Header

    # --- B∆Ø·ªöC 3: X·ª¨ L√ù POSITIVE (Labels 1 & 2) ---
    print("[3/4] Processing Positives...")
    seen_sites = set() # Set kh·ª≠ tr√πng l·∫∑p
    pos_buffer = []    # B·ªô nh·ªõ ƒë·ªám t·∫°m th·ªùi
    total_positives = 0
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Positives"):
        raw_chrom = row['chrom']
        norm_chrom = normalize_chrom(raw_chrom)
        if norm_chrom not in VALID_CHROMS: continue
        strand = row['strand']

        # T√≠nh t·ªça ƒë·ªô t√¢m (Center at G)
        if strand == '+':
            donor_center = row['end'] + 1
            acc_center = row['start'] - 1
        else:
            donor_center = row['start'] - 1
            acc_center = row['end'] + 1

        # --- LABEL 1: DONOR (GT & GC) ---
        if (norm_chrom, donor_center, strand, 1) not in seen_sites:
            seq = get_sequence_strict(genome, raw_chrom, donor_center, strand, WINDOW)
            if seq:
                # Ki·ªÉm tra t√¢m: Ch·∫•p nh·∫≠n GT v√† GC
                center_motif = seq[WINDOW : WINDOW+2]
                if center_motif in ['GT', 'GC']:
                    pos_buffer.append([f"Donor_{norm_chrom}_{donor_center}_{strand}", seq, 1])
                    seen_sites.add((norm_chrom, donor_center, strand, 1))
                    total_positives += 1

        # --- LABEL 2: ACCEPTOR (AG) ---
        if (norm_chrom, acc_center, strand, 2) not in seen_sites:
            seq = get_sequence_strict(genome, raw_chrom, acc_center, strand, WINDOW)
            if seq:
                # Ki·ªÉm tra t√¢m: Ch·∫•p nh·∫≠n AG
                center_motif = seq[WINDOW-1 : WINDOW+1]
                if center_motif == 'AG':
                    pos_buffer.append([f"Acc_{norm_chrom}_{acc_center}_{strand}", seq, 2])
                    seen_sites.add((norm_chrom, acc_center, strand, 2))
                    total_positives += 1
        
        # Ghi v√†o ƒëƒ©a khi buffer ƒë·∫ßy (tr√°nh t·ªën RAM)
        if len(pos_buffer) >= WRITE_BATCH_SIZE:
            writer.writerows(pos_buffer)
            pos_buffer = [] 

    # Ghi n·ªët ph·∫ßn d∆∞
    if pos_buffer:
        writer.writerows(pos_buffer)
        pos_buffer = [] # Clear RAM ho√†n to√†n
        
    print(f"      -> ƒê√£ ghi {total_positives} m·∫´u Positive s·∫°ch.")
    
# --- B∆Ø·ªöC 4: SINH D·ªÆ LI·ªÜU NEGATIVE KH·ªîNG L·ªí (ƒê√É S·ª¨A L·ªñI DUPLICATE) ---
    target_neg = total_positives * NEGATIVE_MULTIPLIER
    print(f"[4/4] Generating {target_neg} Negatives (Ratio 100:1:1)...")
    print("      L∆∞u √Ω: B∆∞·ªõc n√†y s·∫Ω t·ªën RAM h∆°n ƒë·ªÉ l∆∞u set check tr√πng.")
    
    neg_count = 0
    neg_buffer = []
    
    # Iterator tu·∫ßn ho√†n
    df_sample = df.sample(frac=1).reset_index(drop=True)
    iter_rows = iter(df_sample.iterrows())
    
    pbar = tqdm(total=target_neg, desc="Mining Negatives")
    
    while neg_count < target_neg:
        try:
            _, row = next(iter_rows)
        except StopIteration:
            df_sample = df.sample(frac=1).reset_index(drop=True)
            iter_rows = iter(df_sample.iterrows())
            _, row = next(iter_rows)
        
        raw_chrom = row['chrom']
        norm_chrom = normalize_chrom(raw_chrom)
        if norm_chrom not in VALID_CHROMS: continue
        strand = row['strand']
        
        # Random v·ªã tr√≠
        rand_pos = random.randint(row['start'] - 10000, row['end'] + 10000)
        
        # --- [S·ª¨A ƒê·ªîI QUAN TR·ªåNG] ---
        # T·∫°o key ƒë·∫°i di·ªán cho m·∫´u Negative ƒë·ªãnh l·∫•y
        neg_key = (norm_chrom, rand_pos, strand, 0)
        
        # 1. Check tr√πng Positive (Site th·∫≠t)
        if (norm_chrom, rand_pos, strand, 1) in seen_sites or \
           (norm_chrom, rand_pos, strand, 2) in seen_sites:
            continue

        # 2. Check tr√πng Negative ƒë√£ l·∫•y tr∆∞·ªõc ƒë√≥ (FIX BUG DUPLICATE)
        if neg_key in seen_sites:
            continue
        # ----------------------------

        seq = get_sequence_strict(genome, raw_chrom, rand_pos, strand, WINDOW)
        if not seq: continue
        
        # Hard Negative Check
        center = seq[WINDOW]
        next_b = seq[WINDOW+1]
        prev_b = seq[WINDOW-1]
        
        is_fake_donor = (center == 'G' and next_b in ['T', 'C'])
        is_fake_acc   = (prev_b == 'A' and center == 'G')
        
        if is_fake_donor or is_fake_acc:
            neg_buffer.append([f"Neg_{norm_chrom}_{rand_pos}_{strand}", seq, 0])
            
            # --- [TH√äM M·ªöI] ƒê√°nh d·∫•u ƒë√£ s·ª≠ d·ª•ng ---
            seen_sites.add(neg_key) 
            # --------------------------------------

            neg_count += 1
            pbar.update(1)
            
            if len(neg_buffer) >= WRITE_BATCH_SIZE:
                writer.writerows(neg_buffer)
                neg_buffer = []

    # Ghi n·ªët ph·∫ßn cu·ªëi
    if neg_buffer:
        writer.writerows(neg_buffer)

print(f"‚úÖ HO√ÄN T·∫§T! File saved at: {OUTPUT_FILE}")
print(f"      Dung l∆∞·ª£ng ∆∞·ªõc t√≠nh: ~9.5 GB")
print(f"      T·ªïng s·ªë d√≤ng: {total_positives + neg_count}")
print(f"‚ö†Ô∏è QUAN TR·ªåNG: File n√†y ch∆∞a ƒë∆∞·ª£c shuffle (Positive n·∫±m ƒë·∫ßu, Negative n·∫±m cu·ªëi).")
print(f"   H√£y shuffle khi load v√†o model ho·∫∑c d√πng l·ªánh Linux: 'shuf {OUTPUT_FILE} > shuffled.csv'")

[3/4] Processing Positives...


Positives: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2241203/2241203 [03:34<00:00, 10464.88it/s]


      -> ƒê√£ ghi 496576 m·∫´u Positive s·∫°ch.
[4/4] Generating 24828800 Negatives (Ratio 100:1:1)...
      L∆∞u √Ω: B∆∞·ªõc n√†y s·∫Ω t·ªën RAM h∆°n ƒë·ªÉ l∆∞u set check tr√πng.


Mining Negatives: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 24828773/24828800 [6:25:06<00:00, 942.85it/s]   

‚úÖ HO√ÄN T·∫§T! File saved at: D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv
      Dung l∆∞·ª£ng ∆∞·ªõc t√≠nh: ~9.5 GB
      T·ªïng s·ªë d√≤ng: 25325376
‚ö†Ô∏è QUAN TR·ªåNG: File n√†y ch∆∞a ƒë∆∞·ª£c shuffle (Positive n·∫±m ƒë·∫ßu, Negative n·∫±m cu·ªëi).
   H√£y shuffle khi load v√†o model ho·∫∑c d√πng l·ªánh Linux: 'shuf D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv > shuffled.csv'


In [5]:
import csv
import os
from tqdm import tqdm

# --- C·∫§U H√åNH ---
TARGET_FILE = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv'
TEMP_FILE = TARGET_FILE + '.tmp'
NEW_CHROM_COL = 'CHROM'

# Map ƒë·ªïi t√™n: {'T√™n c≈©': 'T√™n m·ªõi'}
RENAME_MAP = {
    'label': 'Splicing_types',
    'dna': 'sequence'
}

print(f"üîÑ ƒêang x·ª≠ l√Ω file: {TARGET_FILE}")
print(f"   - ƒê·ªïi t√™n c·ªôt: {RENAME_MAP}")
print(f"   - Th√™m/Update c·ªôt: {NEW_CHROM_COL}")

try:
    with open(TARGET_FILE, 'r', encoding='utf-8') as f_in, \
         open(TEMP_FILE, 'w', encoding='utf-8', newline='') as f_out:
        
        reader = csv.reader(f_in)
        writer = csv.writer(f_out)
        
        # 1. X·ª¨ L√ù HEADER (ƒê·ªïi t√™n & Th√™m c·ªôt)
        try:
            original_header = next(reader)
        except StopIteration:
            print("‚ùå File r·ªóng!")
            raise Exception("File empty")
            
        # B∆∞·ªõc A: ƒê·ªïi t√™n c√°c c·ªôt c≈© (label -> Splicing_types, dna -> sequence)
        renamed_header = [RENAME_MAP.get(col, col) for col in original_header]
        
        # B∆∞·ªõc B: Ki·ªÉm tra c·ªôt CHROM
        if NEW_CHROM_COL in renamed_header:
            print(f"‚ö†Ô∏è C·ªôt '{NEW_CHROM_COL}' ƒë√£ t·ªìn t·∫°i -> S·∫Ω c·∫≠p nh·∫≠t d·ªØ li·ªáu.")
            final_header = renamed_header
            chrom_idx = renamed_header.index(NEW_CHROM_COL)
        else:
            final_header = renamed_header + [NEW_CHROM_COL]
            chrom_idx = None # Ch∆∞a c√≥, s·∫Ω append v√†o cu·ªëi
            
        writer.writerow(final_header)
        
        # 2. X·ª¨ L√ù D·ªÆ LI·ªÜU (Streaming)
        for row in tqdm(reader, desc="Processing Rows", unit=" rows"):
            if not row: continue
            
            # Logic l·∫•y Chromosome t·ª´ ID (Donor_1_65434_+)
            row_id = row[0]
            try:
                parts = row_id.split('_')
                if len(parts) > 1:
                    raw_chrom = parts[1]
                    # Format: chr1, chr2, chrX...
                    chrom_val = raw_chrom if raw_chrom.startswith('chr') else f"chr{raw_chrom}"
                else:
                    chrom_val = "unknown"
                
                # Logic ghi v√†o d√≤ng
                if chrom_idx is not None:
                    # N·∫øu c·ªôt CHROM ƒë√£ c√≥ s·∫µn (do ch·∫°y l·∫°i nhi·ªÅu l·∫ßn) -> Ghi ƒë√®
                    if len(row) > chrom_idx:
                        row[chrom_idx] = chrom_val
                    else:
                        row.append(chrom_val)
                    writer.writerow(row)
                else:
                    # N·∫øu ch∆∞a c√≥ -> Th√™m m·ªõi v√†o cu·ªëi
                    writer.writerow(row + [chrom_val])
                    
            except Exception:
                # Fallback an to√†n cho d√≤ng l·ªói
                writer.writerow(row + (['unknown'] if chrom_idx is None else []))

    # 3. Thay th·∫ø file c≈©
    os.replace(TEMP_FILE, TARGET_FILE)
    print(f"‚úÖ HO√ÄN T·∫§T! File ƒë√£ ƒë∆∞·ª£c c·∫≠p nh·∫≠t.")
    print(f"   Header m·ªõi: {final_header}")

except FileNotFoundError:
    print(f"‚ùå L·ªói: Kh√¥ng t√¨m th·∫•y file '{TARGET_FILE}'.")
except Exception as e:
    if os.path.exists(TEMP_FILE): os.remove(TEMP_FILE)
    print(f"‚ùå L·ªói Runtime: {e}")

üîÑ ƒêang x·ª≠ l√Ω file: D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv
   - ƒê·ªïi t√™n c·ªôt: {'label': 'Splicing_types', 'dna': 'sequence'}
   - Th√™m/Update c·ªôt: CHROM


Processing Rows: 25325376 rows [08:53, 47475.31 rows/s]:25:19<00:00, 942.85it/s]


‚úÖ HO√ÄN T·∫§T! File ƒë√£ ƒë∆∞·ª£c c·∫≠p nh·∫≠t.
   Header m·ªõi: ['id', 'sequence', 'Splicing_types', 'CHROM']


In [6]:
import csv
import sys
import time
from collections import Counter
from tqdm import tqdm

# ================= C·∫§U H√åNH =================
# S·ª≠a ƒë∆∞·ªùng d·∫´n file n·∫øu c·∫ßn
FILE_PATH = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv'
SAMPLE_ROWS = 5  # S·ªë d√≤ng in m·∫´u

def inspect_and_validate_full(file_path):
    print(f"üöÄ B·∫ÆT ƒê·∫¶U KI·ªÇM TRA FILE (C·∫§U TR√öC M·ªöI): {file_path}")
    
    start_time = time.time()
    
    # 1. BI·∫æN TH·ªêNG K√ä
    stats = {
        "total_rows": 0,
        "null_counts": {},
        "seq_lengths": Counter(),      # Th·ªëng k√™ ƒë·ªô d√†i sequence
        "label_counts": Counter(),     # Th·ªëng k√™ Splicing_types
        "chrom_counts": Counter(),     # Th·ªëng k√™ CHROM (M·ªõi)
        "samples": []
    }
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            
            # --- 2. ƒê·ªåC & MAP HEADER ---
            try:
                header = next(reader)
                # Kh·ªüi t·∫°o ƒë·∫øm null
                for col in header: stats["null_counts"][col] = 0
                
                # T√¨m v·ªã tr√≠ (index) c·ªßa c√°c c·ªôt quan tr·ªçng theo t√™n m·ªõi
                # D√πng next(..., None) ƒë·ªÉ tr√°nh l·ªói n·∫øu kh√¥ng t√¨m th·∫•y c·ªôt
                idx_seq = next((i for i, c in enumerate(header) if c in ['sequence', 'dna']), None)
                idx_lbl = next((i for i, c in enumerate(header) if c in ['Splicing_types', 'label']), None)
                idx_chr = next((i for i, c in enumerate(header) if c in ['CHROM', 'chrom']), None)
                
                print(f"   -> Header t√¨m th·∫•y: {header}")
                print(f"   -> Mapping index: Seq={idx_seq}, Label={idx_lbl}, Chrom={idx_chr}")
                
            except StopIteration:
                print("‚ùå L·ªñI: File r·ªóng!")
                return

            # --- 3. QU√âT D·ªÆ LI·ªÜU (SINGLE PASS) ---
            pbar = tqdm(reader, desc="Analyzing", unit=" rows")
            
            for i, row in enumerate(pbar):
                stats["total_rows"] += 1
                
                # A. L·∫§Y M·∫™U
                if i < SAMPLE_ROWS:
                    stats["samples"].append(row)
                
                # B. CHECK NULL & VALIDATE
                for col_idx, val in enumerate(row):
                    if col_idx >= len(header): break
                    if not val.strip():
                        col_name = header[col_idx]
                        stats["null_counts"][col_name] += 1
                
                # C. TH·ªêNG K√ä GI√Å TR·ªä (D·ª±a tr√™n index ƒë√£ map)
                if idx_seq is not None and len(row) > idx_seq:
                    stats["seq_lengths"][len(row[idx_seq])] += 1
                    
                if idx_lbl is not None and len(row) > idx_lbl:
                    stats["label_counts"][row[idx_lbl]] += 1
                    
                if idx_chr is not None and len(row) > idx_chr:
                    stats["chrom_counts"][row[idx_chr]] += 1

    except FileNotFoundError:
        print(f"‚ùå L·ªói: Kh√¥ng t√¨m th·∫•y file {file_path}")
        return

    # --- 4. B√ÅO C√ÅO K·∫æT QU·∫¢ ---
    elapsed = time.time() - start_time
    total = stats["total_rows"]
    
    print("\n" + "="*80)
    print(f"üìä B√ÅO C√ÅO D·ªÆ LI·ªÜU (Time: {elapsed:.2f}s | Total: {total:,} rows)")
    print("="*80)

    # 4.1 B·∫£ng m·∫´u (C·∫≠p nh·∫≠t 4 c·ªôt)
    print(f"\n1. M·∫™U D·ªÆ LI·ªÜU ({SAMPLE_ROWS} d√≤ng ƒë·∫ßu):")
    # Format in: ID | Sequence | Type | Chrom
    print(f"   {'ID':<25} | {'Sequence (Preview)':<20} | {'Type':<10} | {'Chrom':<8}")
    print(f"   {'-'*25} | {'-'*20} | {'-'*10} | {'-'*8}")
    
    for row in stats["samples"]:
        # L·∫•y gi√° tr·ªã an to√†n (ph√≤ng tr∆∞·ªùng h·ª£p d√≤ng thi·∫øu c·ªôt)
        _id = row[0] if len(row) > 0 else ""
        _seq = row[idx_seq] if idx_seq is not None and len(row) > idx_seq else "N/A"
        _lbl = row[idx_lbl] if idx_lbl is not None and len(row) > idx_lbl else "N/A"
        _chr = row[idx_chr] if idx_chr is not None and len(row) > idx_chr else "N/A"
        
        # C·∫Øt ng·∫Øn sequence
        seq_show = _seq[:6] + "..." + _seq[-4:] if len(_seq) > 10 else _seq
        print(f"   {_id:<25} | {seq_show:<20} | {_lbl:<10} | {_chr:<8}")

    # 4.2 B√°o c√°o Null
    print(f"\n2. KI·ªÇM TRA NULL (COMPLETENESS):")
    has_null = False
    print(f"   {'T√™n C·ªôt':<20} | {'S·ªë Null':<10} | {'% L·ªói'}")
    for col, count in stats["null_counts"].items():
        pct = (count / total * 100) if total > 0 else 0
        status = "‚úÖ" if count == 0 else f"‚ùå {count:,}"
        print(f"   {col:<20} | {status:<10} | {pct:.4f}%")
        if count > 0: has_null = True
    if not has_null: print("   -> S·∫°ch 100%.")

    # 4.3 Ph√¢n b·ªë Label
    print(f"\n3. PH√ÇN B·ªê TYPE (Splicing_types):")
    for lbl, count in stats["label_counts"].items():
        print(f"   - '{lbl}': {count:,} m·∫´u")

    # 4.4 Ph√¢n b·ªë Chrom (M·ªõi)
    print(f"\n4. PH√ÇN B·ªê CHROMOSOME (Top 5):")
    # Ch·ªâ in 5 chrom ph·ªï bi·∫øn nh·∫•t ƒë·ªÉ ƒë·ª° d√†i d√≤ng
    for ch, count in stats["chrom_counts"].most_common(5):
        print(f"   - {ch}: {count:,} m·∫´u")
    if len(stats["chrom_counts"]) > 5: print(f"   ... v√† {len(stats['chrom_counts'])-5} chrom kh√°c.")

    # 4.5 Sequence Length
    print(f"\n5. SEQUENCE LENGTH:")
    for length, count in stats["seq_lengths"].items():
        status = "‚úÖ Chu·∫©n" if length == 601 else "‚ö†Ô∏è L·ªÜCH"
        print(f"   - Length {length}: {count:,} m·∫´u ({status})")

if __name__ == "__main__":
    inspect_and_validate_full(FILE_PATH)

üöÄ B·∫ÆT ƒê·∫¶U KI·ªÇM TRA FILE (C·∫§U TR√öC M·ªöI): D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv
   -> Header t√¨m th·∫•y: ['id', 'sequence', 'Splicing_types', 'CHROM']
   -> Mapping index: Seq=1, Label=2, Chrom=3


Analyzing: 25325376 rows [03:36, 116985.01 rows/s]


üìä B√ÅO C√ÅO D·ªÆ LI·ªÜU (Time: 216.53s | Total: 25,325,376 rows)

1. M·∫™U D·ªÆ LI·ªÜU (5 d√≤ng ƒë·∫ßu):
   ID                        | Sequence (Preview)   | Type       | Chrom   
   ------------------------- | -------------------- | ---------- | --------
   Donor_1_65434_+           | AAAAGT...TTAA        | 1          | chr1    
   Donor_1_65574_+           | GATAGC...TTCC        | 1          | chr1    
   Acc_1_65519_+             | CTTTAT...CTCC        | 2          | chr1    
   Acc_1_69036_+             | AAAGGA...GCGC        | 2          | chr1    
   Donor_1_924949_+          | ACCTCA...GAGC        | 1          | chr1    

2. KI·ªÇM TRA NULL (COMPLETENESS):
   T√™n C·ªôt              | S·ªë Null    | % L·ªói
   id                   | ‚úÖ          | 0.0000%
   sequence             | ‚úÖ          | 0.0000%
   Splicing_types       | ‚úÖ          | 0.0000%
   CHROM                | ‚úÖ          | 0.0000%
   -> S·∫°ch 100%.

3. PH√ÇN B·ªê TYPE (Splicing_types):
   - '1': 250,830 




In [7]:
import csv
import sys
import time
from collections import Counter
from tqdm import tqdm

# ================= C·∫§U H√åNH =================
FILE_PATH = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv'
WINDOW = 300 # B√°n k√≠nh nh∆∞ thi·∫øt l·∫≠p ban ƒë·∫ßu
CENTER_IDX = WINDOW # V·ªã tr√≠ t√¢m (300)

def validate_biological_data(file_path):
    print(f"üß¨ B·∫ÆT ƒê·∫¶U KI·ªÇM TRA LOGIC SINH H·ªåC: {file_path}")
    
    stats = {
        "total": 0,
        "invalid_chars": 0,   # Sequence ch·ª©a k√Ω t·ª± l·∫°
        "motif_errors": 0,    # Sai Motif (Donor k ph·∫£i GT, Acc k ph·∫£i AG)
        "label_dist": Counter(),
        "chrom_dist": Counter()
    }
    
    valid_bases = set("ACGTN")
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            header = next(reader, None)
            
            # Map index c·ªôt (ƒë·ªÉ code ch·∫°y ƒë√∫ng d√π c·ªôt ƒë·ªïi ch·ªó)
            # T√¨m c·ªôt theo t√™n m·ªõi ho·∫∑c c≈©
            try:
                idx_seq = next(i for i, c in enumerate(header) if c in ['sequence', 'dna'])
                idx_lbl = next(i for i, c in enumerate(header) if c in ['Splicing_types', 'label'])
                idx_chr = next(i for i, c in enumerate(header) if c in ['CHROM', 'chrom'])
            except StopIteration:
                print("‚ùå L·ªói: Kh√¥ng t√¨m th·∫•y ƒë·ªß c√°c c·ªôt sequence, label, CHROM!")
                return

            print(f"   -> Mapping OK: Seq={idx_seq}, Label={idx_lbl}, Chrom={idx_chr}")

            # QU√âT D·ªÆ LI·ªÜU
            for row in tqdm(reader, desc="Bio-Checking", unit=" rows"):
                if not row: continue
                stats["total"] += 1
                
                seq = row[idx_seq]
                label = row[idx_lbl]
                chrom = row[idx_chr]
                
                stats["label_dist"][label] += 1
                stats["chrom_dist"][chrom] += 1
                
                # 1. CHECK K√ù T·ª∞ L·∫† (Vocabulary)
                # D√πng set check cho nhanh
                if not set(seq).issubset(valid_bases):
                    stats["invalid_chars"] += 1

                # 2. CHECK MOTIF SINH H·ªåC (Quan tr·ªçng!)
                # Ch·ªâ check n·∫øu ƒë·ªô d√†i sequence ƒë√∫ng 601
                if len(seq) == (WINDOW * 2 + 1):
                    # Donor (Label 1): T√¢m ph·∫£i l√† GT ho·∫∑c GC
                    # Logic c·∫Øt: seq[300:302]
                    if label == '1':
                        motif = seq[CENTER_IDX : CENTER_IDX+2]
                        if motif not in ['GT', 'GC']:
                            stats["motif_errors"] += 1
                            
                    # Acceptor (Label 2): T√¢m ph·∫£i l√† AG
                    # Logic c·∫Øt: seq[299:301] (V√¨ AG n·∫±m ngay tr∆∞·ªõc exon)
                    elif label == '2':
                        motif = seq[CENTER_IDX-1 : CENTER_IDX+1]
                        if motif != 'AG':
                            stats["motif_errors"] += 1

    except Exception as e:
        print(f"‚ùå L·ªói Runtime: {e}")
        return

    # B√ÅO C√ÅO
    total = stats["total"]
    print("\n" + "="*60)
    print(f"üî¨ K·∫æT QU·∫¢ KI·ªÇM TRA SINH H·ªåC ({total:,} m·∫´u)")
    print("="*60)
    
    # 1. K√Ω t·ª± l·∫°
    status_char = "‚úÖ S·∫°ch" if stats["invalid_chars"] == 0 else f"‚ùå {stats['invalid_chars']:,} m·∫´u l·ªói"
    print(f"1. T·ª™ V·ª∞NG DNA (Ch·ªâ A,C,G,T,N): {status_char}")
    
    # 2. Motif
    # L∆∞u √Ω: N·∫øu Motif l·ªói nhi·ªÅu -> Code sinh d·ªØ li·ªáu b·ªã l·ªách index slicing
    pct_motif = (stats["motif_errors"] / total * 100)
    status_motif = "‚úÖ Chu·∫©n x√°c" if pct_motif < 1.0 else f"‚ö†Ô∏è ƒê√ÅNG NG·ªú ({stats['motif_errors']:,} m·∫´u sai motif)"
    print(f"2. MOTIF CONSISTENCY (GT/AG): {status_motif}")
    if stats["motif_errors"] > 0:
        print("   (N·∫øu s·ªë l∆∞·ª£ng l·ªói motif qu√° l·ªõn, h√£y ki·ªÉm tra l·∫°i logic c·∫Øt chu·ªói!)")

    # 3. Chromosome Check
    print(f"3. CHROMOSOME COVERAGE:")
    print(f"   - T√¨m th·∫•y {len(stats['chrom_dist'])} nhi·ªÖm s·∫Øc th·ªÉ.")
    # In c·∫£nh b√°o n·∫øu thi·∫øu chr1 (th∆∞·ªùng l√† do file GTF l·ªói ho·∫∑c filter sai)
    if 'chr1' not in stats['chrom_dist']:
        print("   ‚ö†Ô∏è C·∫¢NH B√ÅO: Kh√¥ng th·∫•y d·ªØ li·ªáu t·ª´ chr1!")

if __name__ == "__main__":
    validate_biological_data(FILE_PATH)

üß¨ B·∫ÆT ƒê·∫¶U KI·ªÇM TRA LOGIC SINH H·ªåC: D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv
   -> Mapping OK: Seq=1, Label=2, Chrom=3


Bio-Checking: 25325376 rows [05:15, 80376.29 rows/s]


üî¨ K·∫æT QU·∫¢ KI·ªÇM TRA SINH H·ªåC (25,325,376 m·∫´u)
1. T·ª™ V·ª∞NG DNA (Ch·ªâ A,C,G,T,N): ‚úÖ S·∫°ch
2. MOTIF CONSISTENCY (GT/AG): ‚úÖ Chu·∫©n x√°c
3. CHROMOSOME COVERAGE:
   - T√¨m th·∫•y 24 nhi·ªÖm s·∫Øc th·ªÉ.





In [8]:
import csv
import sys
import time
from collections import Counter
from tqdm import tqdm

# C·∫§U H√åNH FILE
FILE_PATH = r'D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv'
MAX_RAM_IDS = 30_000_000  # Gi·ªõi h·∫°n l∆∞u 30 tri·ªáu ID ƒë·ªÉ tr√°nh tr√†n RAM

def check_data_stats(file_path):
    print(f"üöÄ B·∫ÆT ƒê·∫¶U KI·ªÇM TRA FILE: {file_path}")
    
    # 1. Kh·ªüi t·∫°o bi·∫øn ƒë·∫øm
    stats = {
        "total_rows": 0,
        "null_rows": 0,       # D√≤ng c√≥ gi√° tr·ªã r·ªóng
        "duplicate_ids": 0,   # ID b·ªã tr√πng
    }
    label_counts = Counter()  # ƒê·∫øm Value Count cho Label
    seen_ids = set()          # Set l∆∞u ID ƒë·ªÉ check tr√πng
    check_dup = True          # C·ªù b·∫≠t/t·∫Øt check tr√πng (b·∫£o v·ªá RAM)

    start_time = time.time()

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            header = next(reader, None) # B·ªè qua header
            
            # Streaming Loop (ƒê·ªçc t·ª´ng d√≤ng -> X·ª≠ l√Ω -> X√≥a kh·ªèi RAM)
            for row in tqdm(reader, desc="Scanning", unit=" rows"):
                stats["total_rows"] += 1
                
                # Check c·∫•u tr√∫c c∆° b·∫£n (ƒë·ªÉ tr√°nh l·ªói index)
                if len(row) < 3: continue 
                _id, _dna, _label = row[0], row[1], row[2]

                # --- 1. CHECK NULL ---
                if not _id.strip() or not _dna.strip() or not _label.strip():
                    stats["null_rows"] += 1

                # --- 2. CHECK DUPLICATE ID ---
                if check_dup:
                    if _id in seen_ids:
                        stats["duplicate_ids"] += 1
                    else:
                        seen_ids.add(_id)
                    
                    # C∆° ch·∫ø an to√†n: T·ª± ng·∫Øt check l·∫∑p n·∫øu ID qu√° nhi·ªÅu (>30 tri·ªáu)
                    if len(seen_ids) > MAX_RAM_IDS:
                        print("‚ö†Ô∏è RAM Warning: T·∫°m d·ª´ng check Duplicate ƒë·ªÉ b·∫£o v·ªá m√°y.")
                        check_dup = False
                        seen_ids.clear() # Gi·∫£i ph√≥ng RAM

                # --- 3. VALUE COUNT (ƒê·∫øm Label) ---
                label_counts[_label] += 1

    except FileNotFoundError:
        print("‚ùå L·ªói: Kh√¥ng t√¨m th·∫•y file csv.")
        return

    # --- 4. TH·ªêNG K√ä CLASS (B√ÅO C√ÅO) ---
    total = stats["total_rows"]
    elapsed = time.time() - start_time

    print("\n" + "="*40)
    print(f"üìä B·∫¢NG TH·ªêNG K√ä D·ªÆ LI·ªÜU (Time: {elapsed:.2f}s)")
    print("="*40)
    
    print(f"1. T·ªîNG QUAN:")
    print(f"   - T·ªïng s·ªë d√≤ng: {total:,}")
    print(f"   - Null rows   : {stats['null_rows']:,}  ({(stats['null_rows']/total*100):.4f}%)")
    print(f"   - Duplicate ID: {stats['duplicate_ids']:,}")

    print(f"\n2. VALUE COUNT & CLASS STATISTICS:")
    print(f"   {'Label':<10} | {'Count':<15} | {'Percentage':<10}")
    print(f"   {'-'*10} | {'-'*15} | {'-'*10}")
    
    # S·∫Øp x·∫øp label theo th·ª© t·ª± 0, 1, 2
    for label in sorted(label_counts.keys()):
        count = label_counts[label]
        percent = (count / total * 100) if total > 0 else 0
        
        # Mapping t√™n class cho d·ªÖ hi·ªÉu
        label_name = label
        if label == '0': label_name = "0 (Neg)"
        elif label == '1': label_name = "1 (Donor)"
        elif label == '2': label_name = "2 (Acc)"
            
        print(f"   {label_name:<10} | {count:<15,} | {percent:.4f}%")

    print("="*40)

    # ƒê√°nh gi√° nhanh
    if stats['null_rows'] == 0 and stats['duplicate_ids'] == 0:
        print("‚úÖ D·ªØ li·ªáu S·∫†CH (Kh√¥ng Null, Kh√¥ng Tr√πng).")
    else:
        print("‚ö†Ô∏è D·ªØ li·ªáu C√ì V·∫§N ƒê·ªÄ. Vui l√≤ng ki·ªÉm tra l·∫°i logic sinh m·∫´u.")

if __name__ == "__main__":
    check_data_stats(FILE_PATH)

üöÄ B·∫ÆT ƒê·∫¶U KI·ªÇM TRA FILE: D:\Study\5-FA25\AiTa_Lab_Research\Sequence_GENCODE\pre_train_splicing_prediction.csv


Scanning: 25325376 rows [03:28, 121466.89 rows/s]



üìä B·∫¢NG TH·ªêNG K√ä D·ªÆ LI·ªÜU (Time: 208.51s)
1. T·ªîNG QUAN:
   - T·ªïng s·ªë d√≤ng: 25,325,376
   - Null rows   : 0  (0.0000%)
   - Duplicate ID: 0

2. VALUE COUNT & CLASS STATISTICS:
   Label      | Count           | Percentage
   ---------- | --------------- | ----------
   0 (Neg)    | 24,828,800      | 98.0392%
   1 (Donor)  | 250,830         | 0.9904%
   2 (Acc)    | 245,746         | 0.9704%
‚úÖ D·ªØ li·ªáu S·∫†CH (Kh√¥ng Null, Kh√¥ng Tr√πng).
