In [1]:
import pandas as pd
import numpy as np
import os

# ==============================================================================
# 1. PATH CONFIGURATION (adjust these to match your own input location)
# ==============================================================================
# Source directory for each dataset; process_csv looks for "<split>.csv"
# directly inside the directory registered under the dataset's name.
INPUT_PATHS = dict(
    ViMedNLI="/kaggle/input/nli-halu/NLI/ViMedNLI/csv_processed/",
    ViANLI="/kaggle/input/nli-halu/NLI/ViANLI/",
    ViNLI="/kaggle/input/nli-halu/NLI/ViNLI/csv/",
)

# Root directory for the relabelled CSV files. It is created immediately
# (no error if it already exists) so later writes have somewhere to go.
OUTPUT_DIR = "./processed_data_3labels"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==============================================================================
# 2. LABEL CONVERSION FUNCTION (CORE LOGIC - 3 LABELS)
# ==============================================================================
def standardize_label(val):
    """Map a raw NLI label onto one of three hallucination classes.

    The value is converted with ``str()``, lower-cased and stripped, then
    tested against the rules below in order; the first rule that matches
    wins:

    - ``"0"`` / ``"0.0"`` or any text containing ``"entailment"``
      -> ``"Entailment"``
    - ``"1"`` / ``"1.0"`` or any text containing ``"contradiction"``
      -> ``"Intrinsic-Hal"``
    - ``"2"`` / ``"2.0"`` or any text containing ``"neutral"``
      -> ``"Extrinsic-Hal"``

    Anything else (including ``None`` and NaN, which stringify to
    ``"none"`` / ``"nan"``) yields ``"Unknown"``.

    NOTE(review): the original author's inline notes said that in the
    source datasets 2 is usually Contradiction and 1 is usually Neutral,
    which is the opposite of the numeric mapping implemented here. Confirm
    the integer encoding of each dataset before trusting numeric labels.
    """
    text = str(val).lower().strip()

    # (exact numeric spellings, keyword matched as a substring, result).
    # Order matters: it decides the outcome when several keywords appear.
    rules = (
        (("0", "0.0"), "entailment", "Entailment"),
        (("1", "1.0"), "contradiction", "Intrinsic-Hal"),
        (("2", "2.0"), "neutral", "Extrinsic-Hal"),
    )
    for codes, keyword, target in rules:
        if text in codes or keyword in text:
            return target

    # No rule matched: unrecognised label.
    return "Unknown"

# ==============================================================================
# 3. FILE PROCESSING FUNCTION
# ==============================================================================
def process_csv(dataset_name, file_type):
    """Relabel one split of one dataset and write it under OUTPUT_DIR.

    Reads ``<INPUT_PATHS[dataset_name]>/<file_type>.csv``, drops rows with a
    missing ``sentence1``, ``sentence2`` or ``label``, rewrites ``label``
    through ``standardize_label``, discards rows whose label came back as
    ``"Unknown"``, and saves the result (without the index) to
    ``<OUTPUT_DIR>/<dataset_name>/<file_type>.csv``. Progress and the final
    label distribution are printed.

    Args:
        dataset_name: Key into ``INPUT_PATHS``; also used as the output
            sub-directory name.
        file_type: Split name; the file read and written is
            ``f"{file_type}.csv"``.

    Returns:
        None. If the input file does not exist a warning is printed and
        nothing is written.

    Raises:
        KeyError: If ``dataset_name`` is not in ``INPUT_PATHS`` or the CSV
            lacks one of the required columns.
    """
    input_file = os.path.join(INPUT_PATHS[dataset_name], f"{file_type}.csv")

    # A missing split is reported and skipped rather than treated as fatal.
    if not os.path.exists(input_file):
        print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y file: {input_file}")
        return

    print(f"üîÑ ƒêang x·ª≠ l√Ω {dataset_name} - {file_type}...")

    # 1. Load the data.
    df = pd.read_csv(input_file)

    # 2. Drop incomplete rows; remember the starting size for the summary.
    initial_len = len(df)
    df.dropna(subset=['sentence1', 'sentence2', 'label'], inplace=True)

    # 3. Convert the labels. Series.map is used instead of np.vectorize:
    #    np.vectorize raises ValueError on a size-0 input unless `otypes` is
    #    given, so a split with no surviving rows would abort the whole run.
    #    np.vectorize is also only a Python-level loop, so nothing is lost.
    df['label'] = df['label'].map(standardize_label)

    # 4. Report and drop rows whose label could not be recognised.
    is_unknown = df['label'] == 'Unknown'
    unknown_count = int(is_unknown.sum())
    if unknown_count > 0:
        print(f"   ‚ö†Ô∏è C·∫£nh b√°o: C√≥ {unknown_count} d√≤ng nh√£n l·∫°, ƒë√£ lo·∫°i b·ªè.")
        df = df[~is_unknown]

    # 5. Write the relabelled split into its per-dataset sub-directory.
    output_subdir = os.path.join(OUTPUT_DIR, dataset_name)
    os.makedirs(output_subdir, exist_ok=True)

    output_file = os.path.join(output_subdir, f"{file_type}.csv")
    df.to_csv(output_file, index=False)

    print(f"‚úÖ Xong! Gi·ªØ l·∫°i {len(df)}/{initial_len} d√≤ng. L∆∞u t·∫°i: {output_file}")
    print(f"   Ph√¢n b·ªë nh√£n: {df['label'].value_counts().to_dict()}\n")

# ==============================================================================
# 4. RUN THE PROCESSING FOR EVERY DATASET
# ==============================================================================

# Dataset names to process and the CSV splits attempted for each of them.
datasets = ["ViMedNLI", "ViANLI", "ViNLI"]
file_types = ["train", "dev", "test"]

# Walk every (dataset, split) pair, finishing all splits of one dataset
# before moving on to the next dataset.
for ds, f_type in ((d, t) for d in datasets for t in file_types):
    process_csv(ds, f_type)

print(f"\nüéâ ƒê√£ x·ª≠ l√Ω xong to√†n b·ªô! D·ªØ li·ªáu m·ªõi n·∫±m trong th∆∞ m·ª•c: {OUTPUT_DIR}")

üîÑ ƒêang x·ª≠ l√Ω ViMedNLI - train...
‚úÖ Xong! Gi·ªØ l·∫°i 11232/11232 d√≤ng. L∆∞u t·∫°i: ./processed_data_3labels/ViMedNLI/train.csv
   Ph√¢n b·ªë nh√£n: {'Entailment': 3744, 'Intrinsic-Hal': 3744, 'Extrinsic-Hal': 3744}

üîÑ ƒêang x·ª≠ l√Ω ViMedNLI - dev...
‚úÖ Xong! Gi·ªØ l·∫°i 1395/1395 d√≤ng. L∆∞u t·∫°i: ./processed_data_3labels/ViMedNLI/dev.csv
   Ph√¢n b·ªë nh√£n: {'Entailment': 465, 'Intrinsic-Hal': 465, 'Extrinsic-Hal': 465}

üîÑ ƒêang x·ª≠ l√Ω ViMedNLI - test...
‚úÖ Xong! Gi·ªØ l·∫°i 1422/1422 d√≤ng. L∆∞u t·∫°i: ./processed_data_3labels/ViMedNLI/test.csv
   Ph√¢n b·ªë nh√£n: {'Entailment': 474, 'Intrinsic-Hal': 474, 'Extrinsic-Hal': 474}

üîÑ ƒêang x·ª≠ l√Ω ViANLI - train...
‚úÖ Xong! Gi·ªØ l·∫°i 8012/8012 d√≤ng. L∆∞u t·∫°i: ./processed_data_3labels/ViANLI/train.csv
   Ph√¢n b·ªë nh√£n: {'Extrinsic-Hal': 2924, 'Entailment': 2615, 'Intrinsic-Hal': 2473}

üîÑ ƒêang x·ª≠ l√Ω ViANLI - dev...
‚úÖ Xong! Gi·ªØ l·∫°i 1000/1000 d√≤ng. L∆∞u t·∫°i: ./processed_data_3labels/ViANLI