In [None]:
# =====================================================
# Data preparation for SpliceFormer (45k Context)
# =====================================================

import pandas as pd
from pyfaidx import Fasta
from Bio.Seq import Seq
import os
import time
from tqdm import tqdm

# ================= USER CONFIG (ONLY CHANGE THIS) =================
# Directory where the SpliceFormer-ready output CSV files are written.
OUTPUT_DIR = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceFormer\data"
# =================================================================

# ================= FIXED CONFIG =================
# Reference genome FASTA and the folder holding the input CSV files.
GENOME_PATH = r"D:\Homo_sapiens.GRCh38.dna.primary_assembly.fa"
DATA_FOLDER = r"D:\Bio_sequence_Research_AITALAB\train\task1_splicing_prediction\data_preparation\train_val"

# Standard SpliceFormer (deCODE genetics) input length, and the number of
# flanking nucleotides taken on each side of the splice site (22500).
MODEL_MAX_LEN = 45_000
CONTEXT = MODEL_MAX_LEN // 2

# Input files processed by run(); each one is written to OUTPUT_DIR under
# the same file name.
FILES = [
    "test_1_1_1.csv",
    "test_2_1_1.csv",
    "test_4_1_1.csv",
    "test_10_1_1.csv",
    "test_data.csv",
]
# ================================================

# IUPAC DNA complement pairs (the sequence is upper-cased before use);
# characters outside this table pass through unchanged.
_DNA_COMPLEMENT = str.maketrans("ACGTMRWSYKVHDBXN", "TGCAKYWSRMBDHVXN")


def get_sequence_worker(row, fasta_obj, *, context=None, max_len=None):
    """
    Extract the fixed-length sequence window around one splice site.

    The record ID is assumed to look like ``gene_chr1_1000_+``: the second,
    third and fourth underscore-separated fields are the chromosome name,
    the 1-based position and the strand.

    The returned string is ``max_len`` characters long and the base at the
    ID position sits at index ``context`` on BOTH strands. For '-' strand
    records the genomic window is shifted one base to the right before
    reverse-complementing; without that shift the even-length window would
    put the site at index ``context - 1`` after the reversal.

    Args:
        row: Mapping with an ``'id'`` entry (e.g. one ``to_dict('records')``
            item).
        fasta_obj: Object indexable as ``fasta_obj[chrom][start:end]`` whose
            slices convert to ``str`` (a ``pyfaidx.Fasta`` in this script).
        context: Flanking bases per side; defaults to ``CONTEXT``.
        max_len: Final sequence length; defaults to ``MODEL_MAX_LEN``.

    Returns:
        The upper-case window, padded with ``N`` where it runs past either
        chromosome end. On any per-row failure the error is printed and a
        string of ``max_len`` ``N`` characters is returned instead, so one
        bad record does not abort the whole file.
    """
    half = CONTEXT if context is None else context
    total = MODEL_MAX_LEN if max_len is None else max_len

    try:
        parts = row['id'].split('_')
        chrom, pos, strand = parts[1], int(parts[2]), parts[3]

        # Convert the 1-based ID position to a 0-based index.
        site = pos - 1

        # On the minus strand, reversing [start, end) maps genomic index g to
        # (end - 1 - g); shifting the window by one keeps the site at `half`.
        shift = 1 if strand == '-' else 0
        start = site - half + shift
        end = site + half + shift

        # Bases that fall left of the chromosome start become N. Both slice
        # bounds are clamped at 0 so a negative bound is never interpreted
        # as "relative to the chromosome end".
        left_pad = min(max(-start, 0), end - start)
        fetched = str(fasta_obj[chrom][max(start, 0):max(end, 0)]).upper()
        seq = ("N" * left_pad) + fetched

        # A slice that runs past the chromosome end comes back short;
        # pad the tail with N (still in genomic orientation).
        if len(seq) < total:
            seq = seq + ("N" * (total - len(seq)))

        if strand == '-':
            seq = seq.translate(_DNA_COMPLEMENT)[::-1]

        # Explicit check instead of `assert`, which is stripped under -O.
        if len(seq) != total:
            raise ValueError(f"expected length {total}, got {len(seq)}")
        return seq

    except Exception as e:
        # Deliberate best-effort: report the row and fall back to all-N.
        print(f"‚ö†Ô∏è Error at {row.get('id', 'unknown')}: {e}")
        return "N" * total

def diagnose_splice_sites(df, sample_size=5):
    """
    Print a quick sanity check of the dinucleotide at the window centre.

    For each splice type (``Splicing_types`` 1 = Donor, 2 = Acceptor) up to
    ``sample_size`` random rows are drawn, and the two bases at indices
    ``center`` and ``center + 1`` of the ``sequence`` column are compared
    with the expected motif (GT for donors, AG for acceptors). Nothing is
    returned; the result is a printed table.

    NOTE(review): both types are checked at the same two indices. Whether an
    acceptor's AG is really expected there depends on which base the ID
    position marks -- confirm against the dataset's coordinate convention.
    """
    mid = MODEL_MAX_LEN // 2
    print(f"\n{'Type':<10} | {'Window center (Idx 22500)':<30} | Found?")
    print("-" * 65)

    # (label value, printed name, expected dinucleotide)
    site_kinds = ((1, 'Donor', "GT"), (2, 'Acceptor', "AG"))

    for label, name, target in site_kinds:
        matching = df[df['Splicing_types'] == label]
        available = len(matching)
        if not available:
            continue

        picked = matching.sample(min(sample_size, available))
        for seq in picked['sequence']:
            # Six bases around the centre: two before, the central pair,
            # and two after.
            window = seq[mid - 2 : mid + 4]
            before = window[:2]
            core = window[2:4]
            after = window[4:]

            # The central pair is shown in square brackets.
            display = f"...{before}[{core}]{after}..."

            if core == target:
                found = "‚úÖ"
            else:
                found = "‚ùå"

            print(f"{name:<10} | {display:<30} | {found}")

def run():
    """
    Build the SpliceFormer input CSVs for every file listed in ``FILES``.

    For each input CSV found in ``DATA_FOLDER`` this reads the file, orders
    the rows by the chromosome/position parsed from the ``id`` column,
    extracts one sequence window per row with ``get_sequence_worker``,
    prints a splice-site sanity check and writes the result (input columns
    plus ``sequence``) to ``OUTPUT_DIR`` under the same file name. Missing
    input files are skipped with a message.

    The genome handle is closed even when processing is interrupted.

    NOTE: all sequences of a file are held in memory at once
    (rows x MODEL_MAX_LEN characters) before the CSV is written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print(f"[{time.strftime('%H:%M:%S')}] Loading genome (Lazy load)...")
    # sequence_always_upper=True normalises the case of fetched sequence.
    genome = Fasta(GENOME_PATH, sequence_always_upper=True)

    try:
        for file_name in FILES:
            input_path = os.path.join(DATA_FOLDER, file_name)
            output_path = os.path.join(OUTPUT_DIR, file_name)

            if not os.path.exists(input_path):
                print(f"‚ùå Skip {file_name} (not found)")
                continue

            start_time = time.time()
            df = pd.read_csv(input_path)

            # Sort by chromosome and position so genome reads are mostly
            # sequential. The sort keys live in a separate frame, so a
            # parsing failure can never leave half-created helper columns
            # behind in `df`.
            try:
                id_parts = df['id'].str.split('_', expand=True)
                sort_keys = pd.DataFrame({
                    'chrom': id_parts[1],
                    'pos': id_parts[2].astype(int),
                })
                order = sort_keys.sort_values(['chrom', 'pos']).index
                df = df.loc[order].reset_index(drop=True)
            except (KeyError, ValueError, TypeError, AttributeError):
                print("‚ö†Ô∏è Warning: Could not sort by ID. Processing unsorted (slower).")

            print(f"\nüöÄ Processing {file_name} | {len(df)} rows | Length: {MODEL_MAX_LEN}")

            # Plain dict records iterate faster than DataFrame rows.
            rows = df.to_dict('records')
            df['sequence'] = [
                get_sequence_worker(row, genome)
                for row in tqdm(rows, desc="Extracting 45k seq")
            ]

            # Validate the extracted windows.
            diagnose_splice_sites(df)

            # The CSV will be large because every row carries a full window.
            df.to_csv(output_path, index=False)

            elapsed = time.time() - start_time
            # Coarse timers can report 0 s for tiny files; avoid dividing by it.
            rate = len(df) / elapsed if elapsed > 0 else float("inf")
            print(f"‚úÖ Saved to: {output_path}")
            print(f"‚ö° Speed: {rate:.2f} seq/s")
    finally:
        # Release the FASTA file handle regardless of how the loop ended.
        genome.close()

    print("\nüéØ DONE ‚Äì Data ready for SpliceFormer (45k Context)")

# ================= RUN =================
# Run the full preparation pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    run()

[23:27:52] Loading genome...

üöÄ Processing test_1_1_1.csv | 26310 rows


Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26310/26310 [00:01<00:00, 19478.59it/s]



Type       | Window (-2:+2)       | Found?
-------------------------------------------------------
Donor      | GG[TG]AG             | ‚ùå
Donor      | AT[GT]GA             | ‚úÖ
Donor      | GG[TA]AC             | ‚ùå
Donor      | GG[TT]AG             | ‚ùå
Donor      | GG[GT]AC             | ‚úÖ
Acceptor   | AG[AA]CC             | ‚ùå
Acceptor   | AA[GG]AG             | ‚ùå
Acceptor   | AG[GT]TG             | ‚ùå
Acceptor   | AA[GT]GC             | ‚ùå
Acceptor   | CA[GC]AA             | ‚ùå
‚úÖ Saved: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceTransformer\data\test_1_1_1.csv
‚ö° Speed: 8509.29 seq/s

üöÄ Processing test_2_1_1.csv | 35132 rows


Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35132/35132 [00:01<00:00, 26232.96it/s]



Type       | Window (-2:+2)       | Found?
-------------------------------------------------------
Donor      | AG[GT]AT             | ‚úÖ
Donor      | AG[GT]AA             | ‚úÖ
Donor      | AA[GT]AA             | ‚úÖ
Donor      | AG[GT]AC             | ‚úÖ
Donor      | AT[GT]AA             | ‚úÖ
Acceptor   | AG[GT]GG             | ‚ùå
Acceptor   | AG[CC]GG             | ‚ùå
Acceptor   | CA[GG]TT             | ‚ùå
Acceptor   | AG[GG]AC             | ‚ùå
Acceptor   | TA[GA]TC             | ‚ùå
‚úÖ Saved: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceTransformer\data\test_2_1_1.csv
‚ö° Speed: 9764.73 seq/s

üöÄ Processing test_4_1_1.csv | 52776 rows


Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52776/52776 [00:02<00:00, 24348.33it/s]



Type       | Window (-2:+2)       | Found?
-------------------------------------------------------
Donor      | GG[TG]AT             | ‚ùå
Donor      | CT[GT]GA             | ‚úÖ
Donor      | GG[TA]TA             | ‚ùå
Donor      | AA[GT]AC             | ‚úÖ
Donor      | GG[TG]AG             | ‚ùå
Acceptor   | CA[GT]TG             | ‚ùå
Acceptor   | TA[GC]AA             | ‚ùå
Acceptor   | AG[GG]GC             | ‚ùå
Acceptor   | AA[GC]AA             | ‚ùå
Acceptor   | TA[GG]AC             | ‚ùå
‚úÖ Saved: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceTransformer\data\test_4_1_1.csv
‚ö° Speed: 9293.97 seq/s

üöÄ Processing test_10_1_1.csv | 105708 rows


Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105708/105708 [00:04<00:00, 26223.29it/s]



Type       | Window (-2:+2)       | Found?
-------------------------------------------------------
Donor      | CA[GT]GA             | ‚úÖ
Donor      | TG[GT]GA             | ‚úÖ
Donor      | GG[TA]AG             | ‚ùå
Donor      | GG[GT]GG             | ‚úÖ
Donor      | GG[TC]TG             | ‚ùå
Acceptor   | AG[AG]TG             | ‚úÖ
Acceptor   | AA[GG]AA             | ‚ùå
Acceptor   | CA[GA]TC             | ‚ùå
Acceptor   | CA[GA]CT             | ‚ùå
Acceptor   | CA[GG]TG             | ‚ùå
‚úÖ Saved: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceTransformer\data\test_10_1_1.csv
‚ö° Speed: 9681.08 seq/s

üöÄ Processing test_data.csv | 938297 rows


Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938297/938297 [00:36<00:00, 26044.40it/s]



Type       | Window (-2:+2)       | Found?
-------------------------------------------------------
Donor      | GG[TG]TG             | ‚ùå
Donor      | AG[GT]AA             | ‚úÖ
Donor      | AC[GT]AA             | ‚úÖ
Donor      | AG[TT]CT             | ‚ùå
Donor      | GG[TT]CT             | ‚ùå
Acceptor   | TA[GG]GC             | ‚ùå
Acceptor   | CA[GA]TC             | ‚ùå
Acceptor   | TA[GG]GA             | ‚ùå
Acceptor   | CA[GA]AA             | ‚ùå
Acceptor   | AG[AC]AA             | ‚ùå


KeyboardInterrupt: 