In [4]:
import pandas as pd
import numpy as np
import os
import json
from a import MaxEntScorer
from metrics import compute_metrics, get_confusion_matrix

# --- CONFIGURATION ---
# Directory that holds the per-ratio input CSV files (maxent_input_<ratio>.csv).
# NOTE(review): absolute path on the author's machine; adjust before running elsewhere.
DATA_DIR = r"D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\data"
# Directory that receives one results_<ratio>.json file per evaluated ratio.
OUTPUT_DIR = r"D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\results"
# Create the results directory up front; exist_ok=True makes re-runs a no-op.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One scorer instance shared by the whole run; its score5/score3 methods are
# applied to the 'seq_5' and 'seq_3' columns in the main loop.
scorer = MaxEntScorer()

def softmax(x, axis=1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large scores cannot overflow ``np.exp``; the shift cancels out in the
    final ratio, so the result is unchanged.

    Args:
        x: Array-like of scores. A 2-D ``(n_samples, n_classes)`` matrix is
            the usual input; a 1-D vector is treated as a single sample.
        axis: Axis along which the outputs sum to 1. Defaults to 1 (the
            class axis of a sample-by-class matrix). Ignored for 1-D input,
            which has only one axis.

    Returns:
        ndarray with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    scores = np.asarray(x)
    if scores.ndim == 1:
        # A lone score vector has a single axis; normalising over axis 1
        # would raise AxisError.
        axis = 0
    exp_scores = np.exp(scores - np.max(scores, axis=axis, keepdims=True))
    return exp_scores / exp_scores.sum(axis=axis, keepdims=True)

# Apply manually chosen per-class confidence thresholds to the predictions
def apply_threshold_prediction(probs, thresholds, fallback_class=0):
    """Pick the most probable class per sample, subject to a confidence threshold.

    For every row the class with the highest probability is selected. If that
    probability is below the threshold configured for the selected class, the
    prediction is replaced by ``fallback_class``; otherwise it is kept. The
    comparison is inclusive (``probability >= threshold`` keeps the class).

    Args:
        probs: Array-like probability matrix of shape (n_samples, n_classes).
        thresholds: Sequence with one threshold per class, indexed by class
            id, e.g. ``[th_class0, th_class1, th_class2]``.
        fallback_class: Class id assigned when the winning class does not
            reach its threshold. Defaults to 0 (the Non-site class).

    Returns:
        1-D integer ndarray of length n_samples with the final class ids.

    Raises:
        IndexError: If a predicted class id has no entry in ``thresholds``.
    """
    probs = np.asarray(probs)
    thresholds = np.asarray(thresholds)

    # 1. Winning class per row and the probability it was assigned.
    initial_preds = np.argmax(probs, axis=1)
    max_confidences = probs[np.arange(len(probs)), initial_preds]

    # 2. Threshold of each row's winning class, looked up in a single
    #    vectorised index instead of a per-sample Python loop.
    selected_thresholds = thresholds[initial_preds]

    # 3. Keep confident predictions; send the rest to the fallback class.
    return np.where(max_confidences >= selected_thresholds, initial_preds, fallback_class)

if __name__ == "__main__":
    # Dataset variants to evaluate; each entry selects maxent_input_<ratio>.csv.
    RATIOS = ["1-1-1", "2-1-1", "4-1-1", "10-1-1", "100-1-1"]

    # BIAS STRATEGY: constant offset added to each class column
    # (non-site, donor, acceptor) before the softmax.
    CLASS_BIAS = np.array([4.5, 1.2, 4.5])

    # --- THRESHOLD CONFIGURATION (TUNE HERE) ---
    # Index 0: Non-site (usually 0.0 because it is the fallback class)
    # Index 1: Donor (GT)
    # Index 2: Acceptor (AG)
    # Example: [0.0, 0.7, 0.6] means:
    # - a Donor prediction is kept only when its probability is at least 0.7,
    #   otherwise the sample falls back to Non-site;
    # - an Acceptor prediction is kept only when its probability is at least 0.6.
    CONFIDENCE_THRESHOLDS = [0.0, 0.7, 0.6]

    for ratio in RATIOS:
        input_file = os.path.join(DATA_DIR, f"maxent_input_{ratio}.csv")
        if not os.path.exists(input_file):
            # Missing dataset: report it and move on to the next ratio.
            print(f"‚ùå Kh√¥ng t√¨m th·∫•y file: {input_file}")
            continue

        print(f"\nüöÄ Ratio: {ratio} | Bias: {CLASS_BIAS} | Thresholds: {CONFIDENCE_THRESHOLDS}")
        df = pd.read_csv(input_file)

        # 1. Raw scores: 5' score from 'seq_5', 3' score from 'seq_3'.
        s5_scores = df["seq_5"].apply(scorer.score5).to_numpy()
        s3_scores = df["seq_3"].apply(scorer.score3).to_numpy()

        # 2. Score matrix with one column per class; the non-site column is all zeros.
        combined_scores = np.column_stack(
            (np.zeros(len(s5_scores)), s5_scores, s3_scores)
        )

        # 3. Shift every class column by its bias (broadcast across rows).
        weighted_logits = CLASS_BIAS + combined_scores

        # 4. Probabilities, then thresholded prediction (instead of a plain argmax).
        all_probs = softmax(weighted_logits)
        all_preds = apply_threshold_prediction(all_probs, CONFIDENCE_THRESHOLDS)

        # 5. Metrics against the ground-truth 'label' column.
        y_true = df["label"].values
        metrics_res = compute_metrics(y_true, all_preds, probs=all_probs, k=2)
        metrics_res["confusion_matrix"] = get_confusion_matrix(y_true, all_preds).tolist()

        # Record the run settings next to the metrics so results stay traceable.
        metrics_res["meta"] = dict(
            bias_applied=CLASS_BIAS.tolist(),
            thresholds=CONFIDENCE_THRESHOLDS,
            ratio=ratio,
        )

        output_json = os.path.join(OUTPUT_DIR, f"results_{ratio}.json")
        with open(output_json, mode="w") as f:
            json.dump(metrics_res, f, indent=4)

        print(f"‚úÖ K·∫øt qu·∫£ l∆∞u t·∫°i: {output_json}")


üöÄ Ratio: 1-1-1 | Bias: [4.5 1.2 4.5] | Thresholds: [0.0, 0.7, 0.6]
‚úÖ K·∫øt qu·∫£ l∆∞u t·∫°i: D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\results\results_1-1-1.json

üöÄ Ratio: 2-1-1 | Bias: [4.5 1.2 4.5] | Thresholds: [0.0, 0.7, 0.6]
‚úÖ K·∫øt qu·∫£ l∆∞u t·∫°i: D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\results\results_2-1-1.json

üöÄ Ratio: 4-1-1 | Bias: [4.5 1.2 4.5] | Thresholds: [0.0, 0.7, 0.6]
‚úÖ K·∫øt qu·∫£ l∆∞u t·∫°i: D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\results\results_4-1-1.json

üöÄ Ratio: 10-1-1 | Bias: [4.5 1.2 4.5] | Thresholds: [0.0, 0.7, 0.6]
‚úÖ K·∫øt qu·∫£ l∆∞u t·∫°i: D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\results\results_10-1-1.json

üöÄ Ratio: 100-1-1 | Bias: [4.5 1.2 4.5] | Thresholds: [0.0, 0.7, 0.6]
‚úÖ K·∫øt qu·∫£ l∆∞u t·∫°i: D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\MaxEntScan\results\results_100-1-1.json
