# Data Split: Train / Dev / Test

================================================================================
PURPOSE: Split dataset into Train/Dev/Test sets to prevent data leakage
================================================================================

This notebook performs the initial data split that is critical for preventing
data leakage in the experimental pipeline. The dataset is divided into three
non-overlapping sets:

- **Train**: Used for training models (70% of data)
- **Dev**: Used for model selection, feature selection, and hyperparameter
  tuning (15% of data)
- **Test**: Used **ONLY** in the final evaluation notebook (15% of data). This
  set is never used for training, model selection, or any development decisions.

**CRITICAL**: The test set is separated FIRST and will ONLY be accessed in the
final evaluation notebook (05_final_evaluation.ipynb). This ensures fair and
unbiased evaluation according to competition rules.


In [None]:
# ============================================================================
# SETUP: Repository Clone, Drive Mount, and Path Configuration
# ============================================================================
# This cell performs minimal setup required for the notebook to run:
# 1. Clones repository from GitHub (if not already present)
# 2. Mounts Google Drive for persistent data storage
# 3. Configures Python paths and initializes StorageManager

import shutil
import os
import subprocess
import time
import requests
import zipfile
import sys
from pathlib import Path
from google.colab import drive

# Repository configuration
repo_dir = '/content/semeval-context-tree-modular'
repo_url = 'https://github.com/EonTechie/semeval-context-tree-modular.git'
zip_url = 'https://github.com/EonTechie/semeval-context-tree-modular/archive/refs/heads/main.zip'

# Obtain the repository (skipped entirely when repo_dir already exists).
# Strategy: try `git clone` up to max_retries times; if every attempt fails,
# fall back to downloading the main-branch ZIP archive and extracting it.
if not os.path.exists(repo_dir):
    print("Cloning repository from GitHub...")
    max_retries = 2
    clone_success = False

    for attempt in range(max_retries):
        failure_reason = None
        try:
            result = subprocess.run(
                # Explicit target directory so the checkout always lands at
                # repo_dir, the same path the existence check above uses.
                ['git', 'clone', repo_url, repo_dir],
                cwd='/content',
                capture_output=True,
                text=True,
                timeout=60
            )
            if result.returncode == 0:
                print("Repository cloned successfully via git")
                clone_success = True
                break
            failure_reason = result.stderr.strip() or f"exit code {result.returncode}"
        except Exception as e:
            # Deliberately broad: any failure here (timeout, git missing, ...)
            # is recoverable because the ZIP fallback below can still succeed.
            failure_reason = str(e)

        # Report why the attempt failed instead of swallowing it silently.
        print(f"  git clone attempt {attempt + 1}/{max_retries} failed: {failure_reason}")

        # An interrupted clone (e.g. killed on timeout) can leave a partial
        # checkout behind; remove it so neither the retry nor the ZIP
        # fallback's rename trips over an existing directory.
        shutil.rmtree(repo_dir, ignore_errors=True)

        if attempt < max_retries - 1:
            time.sleep(3)

    # Fallback: Download as ZIP if git clone fails
    if not clone_success:
        print("Git clone failed. Downloading repository as ZIP archive...")
        zip_path = '/tmp/repo.zip'
        extracted_dir = '/content/semeval-context-tree-modular-main'
        try:
            # Stream the archive to disk in chunks; the context manager
            # releases the HTTP connection even if writing fails midway.
            with requests.get(zip_url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall('/content')
            # Fail loudly if the archive did not contain the expected
            # top-level folder, rather than reporting a false success.
            if not os.path.isdir(extracted_dir):
                raise FileNotFoundError(
                    f"expected directory {extracted_dir} not found after extraction"
                )
            os.rename(extracted_dir, repo_dir)
            print("Repository downloaded and extracted successfully")
        except Exception as e:
            raise RuntimeError(f"Failed to obtain repository: {e}") from e
        finally:
            # Always clean up the temporary archive, on success or failure.
            if os.path.exists(zip_path):
                os.remove(zip_path)

# Mount Google Drive (best-effort).
# NOTE(review): the original treated any exception as "already mounted" —
# confirm against the google.colab docs whether mount() actually raises in
# that case when force_remount=False.
try:
    drive.mount('/content/drive', force_remount=False)
except Exception as e:
    # Stay best-effort (do not abort the notebook), but do not hide a real
    # failure: if the Drive root is absent, the data path configured below
    # under /content/drive/MyDrive would not be backed by Google Drive.
    if not os.path.isdir('/content/drive/MyDrive'):
        print(f"WARNING: Google Drive mount failed: {e}")
        print("  /content/drive/MyDrive is not available; saved data may not persist.")

# Path configuration: repository checkout and the Drive-backed data folder.
BASE_PATH = Path('/content/semeval-context-tree-modular')
DATA_PATH = Path('/content/drive/MyDrive/semeval_data')

# Put the repository root first on sys.path so the `src` package resolves.
sys.path.insert(0, str(BASE_PATH))

# StorageManager lives inside the repository, so it can only be imported
# after the sys.path change above.
from src.storage.manager import StorageManager

# The repository root is passed as both the base path and the GitHub path.
_storage_kwargs = {
    'base_path': str(BASE_PATH),
    'data_path': str(DATA_PATH),
    'github_path': str(BASE_PATH),
}
storage = StorageManager(**_storage_kwargs)

# Summary of the resolved locations.
print(
    "Setup complete",
    f"  Repository: {BASE_PATH}",
    f"  Data storage: {DATA_PATH}",
    sep="\n",
)


In [None]:
# ============================================================================
# LOAD DATASET
# ============================================================================
# Requests the dataset "ailsntua/QEvasion" through the project's loader and
# keeps its 'train' split as the raw data for the rest of this notebook.
# Per the notebook author, this is the QEvasion dataset of question-answer
# pairs with clarity and evasion labels, fetched from the HuggingFace Hub.

from src.data.loader import load_dataset

dataset = load_dataset(dataset_name="ailsntua/QEvasion")
train_raw = dataset['train']

# Report the sample count first, then the available feature/column names.
_num_samples = len(train_raw)
print(f"Dataset loaded: {_num_samples} samples")
_feature_names = list(train_raw.features.keys())
print(f"Dataset features: {_feature_names}")


In [None]:
# ============================================================================
# SPLIT DATASET INTO TRAIN / DEV / TEST
# ============================================================================
# Delegates to the project's split_dataset helper with a fixed seed, then
# reports each split's size as a share of the raw dataset.
# Per the notebook author, the split is stratified and the test set is carved
# out first so it is never touched during model development.
# NOTE(review): the intended 70/15/15 outcome assumes dev_ratio is relative to
# the full dataset, not to what remains after removing the test set — confirm
# against split_dataset; the percentages printed below will show it.

from src.data.splitter import split_dataset

train_ds, dev_ds, test_ds = split_dataset(
    dataset=train_raw,
    test_ratio=0.15,  # held out for the final evaluation only
    dev_ratio=0.15,   # used for model/feature selection
    seed=42,          # fixed so the split is reproducible
)

print("Dataset split completed:")
for _label, _split in (("Train", train_ds), ("Dev", dev_ds), ("Test", test_ds)):
    # Share of the raw dataset, as a percentage.
    _share = len(_split) / len(train_raw) * 100
    print(f"  {_label}: {len(_split)} samples ({_share:.1f}%)")


In [None]:
# ============================================================================
# SAVE SPLITS TO PERSISTENT STORAGE
# ============================================================================
# Hands the three splits to the StorageManager created in the setup cell so
# that subsequent notebooks can load them, then prints the size of each split.

storage.save_splits(train_ds, dev_ds, test_ds)

print("Splits saved to persistent storage")
for _label, _split in (("Train", train_ds), ("Dev", dev_ds), ("Test", test_ds)):
    print(f"  {_label}: {len(_split)} samples")

# Reminder that the test split is reserved for the final evaluation notebook.
print("\nIMPORTANT: Test set will ONLY be used in final evaluation notebook")
print("           (05_final_evaluation.ipynb). Do not use it for development!")
