In [1]:
import re

def run_verification_tests(parse_function):
    """
    Run a fixed table of HGVS strings through ``parse_function`` and print a report.

    One line is printed per case (status, input, description), followed by a
    summary line with the number of cases whose result equalled the expectation.

    Args:
        parse_function: Callable taking one HGVS string and returning the
            parsed offset (int) or None. Its ``__name__`` is shown in the header.

    Returns:
        None. The report is written to standard output only.
    """
    test_cases = [
        # --- Basic cases ---
        ("c.100+5G>A", 5, "Intron chu·∫©n (+5)"),
        ("c.100-2A>T", -2, "Intron chu·∫©n (-2)"),
        ("c.123G>A", None, "Exon (kh√¥ng c√≥ offset)"),

        # --- UTR (untranslated region) cases ---
        ("c.-10+5G>T", 5, "5' UTR c√≥ offset"),
        ("c.*100-3del", -3, "3' UTR c√≥ offset"),

        # --- Easily confused inputs (edge cases) ---
        ("g.123+5G>A", None, "Sai prefix (g. thay v√¨ c.)"),
        ("c.123+5_123+10del", 5, "V√πng Intron l·ªõn (l·∫•y offset ƒë·∫ßu ti√™n)"),
        ("Text lung tung +5", None, "Chu·ªói r√°c"),

        # --- DANGEROUS inputs (a loose regex tends to fail here) ---
        ("c.insertion+5", None, "Ch·ªØ 'insertion' ch·ª©a d·∫•u + n·∫øu kh√¥ng c·∫©n th·∫≠n"),
        ("c.123A>G+Result", None, "D·∫•u c·ªông n·∫±m ·ªü ph·∫ßn comment/ƒëu√¥i th·ª´a"),
    ]

    print(f"--- ƒêANG KI·ªÇM TRA H√ÄM: {parse_function.__name__} ---")

    n_ok = 0
    for variant, wanted, label in test_cases:
        got = parse_function(variant)
        if got == wanted:
            n_ok += 1
            verdict = "‚úÖ OK"
        else:
            # Show both the actual and the expected value for failing cases.
            verdict = f"‚ùå SAI (Ra: {got}, K·ª≥ v·ªçng: {wanted})"
        print(f"{verdict:<35} | Input: {variant:<20} | {label}")

    print(f"-> K·∫øt qu·∫£: {n_ok}/{len(test_cases)} tr∆∞·ªùng h·ª£p ƒë√∫ng.\n")

In [2]:
def parse_hgvsc_offset_strict(hgvsc_string):
    """Approach 1: stricter regex; the +/- sign must come right after a digit.

    Args:
        hgvsc_string: HGVS coding-DNA string such as ``'c.100+5G>A'``.

    Returns:
        The signed offset as an int (negative for ``-``), or None when the
        input is not a str or no sign-after-digit offset is found after ``c.``.
    """
    if not isinstance(hgvsc_string, str):
        return None

    # Regex breakdown:
    # c\.      : literal 'c.' prefix
    # .*?      : any characters in between (non-greedy)
    # (?<=\d)  : LOOKBEHIND - the character right before the sign must be a digit.
    #            A bare '*' is deliberately NOT accepted here: in 'c.*100-3' the
    #            sign already follows the digit '0', while accepting '*' would
    #            turn malformed input such as 'c.*-5' into an offset of -5.
    # ([+-])   : group 1 (sign)
    # (\d+)    : group 2 (offset magnitude)
    match = re.search(r'c\..*?(?<=\d)([+-])(\d+)', hgvsc_string)

    if match:
        sign = match.group(1)
        value = int(match.group(2))
        return -value if sign == '-' else value
    return None

In [3]:
def parse_hgvsc_offset_original(hgvsc_string):
    """Approach 2: the original function, kept as a baseline (loose regex).

    Returns the first signed number found anywhere after ``c.``, or None when
    the input is not a str or nothing matches.
    """
    if isinstance(hgvsc_string, str):
        # Original pattern: any +/- followed by digits after 'c.' is accepted,
        # regardless of what precedes the sign.
        found = re.search(r'c\..*?([+-])(\d+)', hgvsc_string)
        if found is not None:
            direction, digits = found.group(1, 2)
            magnitude = int(digits)
            if direction == '-':
                return -magnitude
            return magnitude
    return None

In [4]:
# Run the checks against approach 2 (the old, loose regex) first
run_verification_tests(parse_hgvsc_offset_original)

# Then run the same checks against approach 1 (the stricter regex)
run_verification_tests(parse_hgvsc_offset_strict)

--- ƒêANG KI·ªÇM TRA H√ÄM: parse_hgvsc_offset_original ---
‚úÖ OK                                | Input: c.100+5G>A           | Intron chu·∫©n (+5)
‚úÖ OK                                | Input: c.100-2A>T           | Intron chu·∫©n (-2)
‚úÖ OK                                | Input: c.123G>A             | Exon (kh√¥ng c√≥ offset)
‚ùå SAI (Ra: -10, K·ª≥ v·ªçng: 5)         | Input: c.-10+5G>T           | 5' UTR c√≥ offset
‚úÖ OK                                | Input: c.*100-3del          | 3' UTR c√≥ offset
‚úÖ OK                                | Input: g.123+5G>A           | Sai prefix (g. thay v√¨ c.)
‚úÖ OK                                | Input: c.123+5_123+10del    | V√πng Intron l·ªõn (l·∫•y offset ƒë·∫ßu ti√™n)
‚úÖ OK                                | Input: Text lung tung +5    | Chu·ªói r√°c
‚ùå SAI (Ra: 5, K·ª≥ v·ªçng: None)        | Input: c.insertion+5        | Ch·ªØ 'insertion' ch·ª©a d·∫•u + n·∫øu kh√¥ng c·∫©n th·∫≠n
‚úÖ OK                                | Input: c.123A>G

In [5]:
import re

def parse_hgvsc_offset_strict(hgvsc_string):
    """
    Extract the signed (+/-) offset from an HGVSc string.

    Mechanism: a positive lookbehind ``(?<=\\d)`` accepts a ``+`` or ``-`` only
    when the character immediately before it is a digit, so signs that follow
    letters or other text are ignored.

    Args:
        hgvsc_string (str): HGVSc string (for example ``'c.100+5G>A'``).

    Returns:
        int: The offset value (negative or positive).
        None: If the input is not a str or no matching offset is found.
    """
    if isinstance(hgvsc_string, str):
        # Pattern pieces:
        #   c\.      literal 'c.' prefix
        #   .*?      any characters in between (non-greedy)
        #   (?<=\d)  the character just before the sign must be a digit
        #   ([+-])   group 1: the sign
        #   (\d+)    group 2: the run of digits right after the sign
        found = re.search(r'c\..*?(?<=\d)([+-])(\d+)', hgvsc_string)
        if found is not None:
            direction, digits = found.groups()
            magnitude = int(digits)
            # A '-' sign yields a negative offset; '+' yields a positive one.
            if direction == '-':
                return -magnitude
            return magnitude

    return None

# --- TEST SECTION (UNIT TESTS) ---

def test_parse_function():
    test_cases = [
        # --- NH√ìM 1: TR∆Ø·ªúNG H·ª¢P CHU·∫®N (DONOR/ACCEPTOR) ---
        ("c.100+1G>A", 1, "Donor site (+1) chu·∫©n"),
        ("c.100+2T>C", 2, "Donor site (+2) chu·∫©n"),
        ("c.200-1G>T", -1, "Acceptor site (-1) chu·∫©n"),
        ("c.200-2A>G", -2, "Acceptor site (-2) chu·∫©n"),
        
        # --- NH√ìM 2: UTR & EXON ---
        ("c.-10+5G>T", 5, "5' UTR offset (sau s·ªë 0 c·ªßa -10)"),
        ("c.*100-5A>T", -5, "3' UTR offset (sau s·ªë 0 c·ªßa 100)"),
        ("c.123G>A", None, "Exon (kh√¥ng c√≥ offset)"),
        
        # --- NH√ìM 3: C√ÅC TR∆Ø·ªúNG H·ª¢P G√ÇY NH·∫¶M L·∫™N (QUAN TR·ªåNG) ---
        ("c.123delinsA+T", None, "Chu·ªói r√°c ch·ª©a d·∫•u + nh∆∞ng kh√¥ng sau s·ªë"),
        ("c.variant+comment", None, "Comment ch·ª©a d·∫•u +"),
        ("g.100+5G>A", None, "B·∫Øt ƒë·∫ßu b·∫±ng g. (kh√¥ng ph·∫£i c.)"),
        ("c.100+?G>A", None, "Offset kh√¥ng x√°c ƒë·ªãnh (d·∫•u ?)"),
        
        # --- NH√ìM 4: OFFSET L·ªöN (DEEP INTRON) ---
        ("c.100+1000G>A", 1000, "Deep Intron"),
    ]

    print(f"{'INPUT':<25} | {'K·∫æT QU·∫¢':<10} | {'TR·∫†NG TH√ÅI':<10} | {'M√î T·∫¢'}")
    print("-" * 85)
    
    all_passed = True
    for hgvsc, expected, desc in test_cases:
        result = parse_hgvsc_offset_strict(hgvsc)
        is_correct = (result == expected)
        status = "‚úÖ OK" if is_correct else "‚ùå FAIL"
        if not is_correct: all_passed = False
        
        print(f"{hgvsc:<25} | {str(result):<10} | {status:<10} | {desc}")
    
    print("-" * 85)
    if all_passed:
        print("üéâ T·∫§T C·∫¢ TEST CASE ƒê·ªÄU ƒê√öNG! H√ÄM AN TO√ÄN ƒê·ªÇ S·ª¨ D·ª§NG.")
    else:
        print("‚ö†Ô∏è C√ì L·ªñI X·∫¢Y RA, C·∫¶N KI·ªÇM TRA L·∫†I.")

# Run the checks only when this code is executed as the main module
if __name__ == "__main__":
    test_parse_function()

INPUT                     | K·∫æT QU·∫¢    | TR·∫†NG TH√ÅI | M√î T·∫¢
-------------------------------------------------------------------------------------
c.100+1G>A                | 1          | ‚úÖ OK       | Donor site (+1) chu·∫©n
c.100+2T>C                | 2          | ‚úÖ OK       | Donor site (+2) chu·∫©n
c.200-1G>T                | -1         | ‚úÖ OK       | Acceptor site (-1) chu·∫©n
c.200-2A>G                | -2         | ‚úÖ OK       | Acceptor site (-2) chu·∫©n
c.-10+5G>T                | 5          | ‚úÖ OK       | 5' UTR offset (sau s·ªë 0 c·ªßa -10)
c.*100-5A>T               | -5         | ‚úÖ OK       | 3' UTR offset (sau s·ªë 0 c·ªßa 100)
c.123G>A                  | None       | ‚úÖ OK       | Exon (kh√¥ng c√≥ offset)
c.123delinsA+T            | None       | ‚úÖ OK       | Chu·ªói r√°c ch·ª©a d·∫•u + nh∆∞ng kh√¥ng sau s·ªë
c.variant+comment         | None       | ‚úÖ OK       | Comment ch·ª©a d·∫•u +
g.100+5G>A                | None       | ‚úÖ OK       | B·∫Øt ƒë·