In [1]:
import random
from pathlib import Path
import sys
import argparse # Keep argparse for potential script usage later
import os # Needed for os.makedirs

# --- Configuration ---
# Parallel corpus: line i of the source file pairs with line i of the target file.
SRC_INPUT_FILE = Path("cleaned_parallel_phones.txt")
TGT_INPUT_FILE = Path("cleaned_parallel_hindi.txt")

# Where the split files are written, and the prefix that precedes
# "_train" / "_val" / "_test" in their names. The prefix is taken from the
# source input file name without its extension ("cleaned_parallel_phones").
OUTPUT_DIR = Path("cleaned_data_split")
OUTPUT_PREFIX = SRC_INPUT_FILE.stem

# Fraction of the shuffled pairs assigned to each split.
# The three values are expected to add up to 1.0.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

# Fixed seed so the shuffle (and therefore the split) is reproducible.
RANDOM_SEED = 42

# Text encoding used for both reading the inputs and writing the outputs.
ENCODING = "utf-8"
# --- End Configuration ---

def _read_stripped_lines(path, encoding):
    """Return every line of *path* with leading/trailing whitespace removed."""
    with open(path, 'r', encoding=encoding) as handle:
        return [line.strip() for line in handle]


def _compute_split_sizes(n_total, train_ratio, val_ratio, test_ratio):
    """Return ``(n_train, n_val, n_test)`` summing to *n_total*, or ``None``.

    Validation and test sizes are the truncated products of *n_total* and
    their ratios, bumped to 1 when the ratio is non-zero but truncation gave 0
    (only if there is more than one sample). Training receives the remainder.
    If that remainder is not positive and a training share was requested,
    samples are taken back from the larger of val/test until training has one.
    ``None`` means no training sample can be allocated.
    """
    n_test = int(n_total * test_ratio)
    n_val = int(n_total * val_ratio)
    if val_ratio > 0 and n_val == 0 and n_total > 1:
        n_val = 1
    if test_ratio > 0 and n_test == 0 and n_total > 1:
        n_test = 1
    n_train = n_total - n_val - n_test

    if n_train <= 0:
        if train_ratio <= 0:
            # No training share was requested, so there is nothing to adjust.
            return None
        print("Warning: Calculated train size is zero or negative. Adjusting splits slightly.")
        while n_train < 1 and (n_val > 0 or n_test > 0):
            # Shrink the larger held-out split first; ties are taken from val.
            if n_val >= n_test:
                n_val -= 1
            else:
                n_test -= 1
            n_train += 1
        if n_train < 1:
            return None

    return n_train, n_val, n_test


def _write_split_files(pairs, split_name, src_out_path, tgt_out_path, encoding):
    """Write one split as two aligned files; return True on success.

    An empty split is skipped (nothing is written, and files already present
    at the output paths are left as they are) and counts as success.
    """
    if not pairs:
        print(f"Skipping write for '{split_name}' as it has 0 samples.")
        return True

    print(f"Writing {len(pairs)} lines for '{split_name}' split...")
    print(f"  Source: {src_out_path}")
    print(f"  Target: {tgt_out_path}")
    try:
        with open(src_out_path, 'w', encoding=encoding) as fs, \
             open(tgt_out_path, 'w', encoding=encoding) as ft:
            for src_line, tgt_line in pairs:
                fs.write(src_line + '\n')
                ft.write(tgt_line + '\n')
    except (OSError, UnicodeError, LookupError) as e:
        print(f"Error writing {split_name} files: {e}")
        return False
    return True


def split_parallel_data(
    src_file: Path,
    tgt_file: Path,
    output_dir: Path,
    output_prefix: str,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42,
    encoding: str = 'utf-8',
    src_suffix: str = 'phones',
    tgt_suffix: str = 'hindi',
) -> bool:
    """Shuffle two line-aligned files and split them into train/val/test sets.

    Line *i* of ``src_file`` is paired with line *i* of ``tgt_file``. The
    pairs are shuffled with a private ``random.Random(seed)`` instance (the
    module-level random generator is not touched), sliced into three splits
    and written to ``output_dir`` as
    ``{output_prefix}_{split}_{src_suffix}.txt`` and
    ``{output_prefix}_{split}_{tgt_suffix}.txt``.

    Args:
        src_file: Path of the source-side text file.
        tgt_file: Path of the target-side text file; must have the same
            number of lines as ``src_file``.
        output_dir: Directory for the split files; created if missing.
        output_prefix: Leading part of every output file name.
        train_ratio: Fraction of pairs for training. Training receives
            whatever is left after the val and test sizes are computed.
        val_ratio: Fraction of pairs for validation.
        test_ratio: Fraction of pairs for testing.
        seed: Seed for the shuffle, for reproducible splits.
        encoding: Text encoding for reading and writing.
        src_suffix: File-name suffix for source-side outputs.
        tgt_suffix: File-name suffix for target-side outputs.

    Returns:
        True if every non-empty split was written; False on any validation,
        read, or write problem (a message describing it is printed).
    """
    src_file = Path(src_file)
    tgt_file = Path(tgt_file)
    output_dir = Path(output_dir)

    print("--- Starting Data Split ---")
    print(f"Source Input: {src_file}")
    print(f"Target Input: {tgt_file}")
    print(f"Output Directory: {output_dir}")
    print(f"Output Prefix: {output_prefix}")
    print(f"Ratios: Train={train_ratio:.2f}, Val={val_ratio:.2f}, Test={test_ratio:.2f}")

    # --- Validate Inputs ---
    if not src_file.is_file():
        print(f"Error: Source file not found: {src_file}")
        return False
    if not tgt_file.is_file():
        print(f"Error: Target file not found: {tgt_file}")
        return False
    if not all(0.0 <= ratio <= 1.0 for ratio in (train_ratio, val_ratio, test_ratio)):
        print("Error: Ratios must be between 0.0 and 1.0.")
        return False
    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > 1e-9:  # tolerate float rounding in the sum
        print(f"Error: Ratios do not sum to 1.0 (sum={ratio_sum:.3f})")
        return False

    # --- Create Output Directory ---
    try:
        print(f"Ensuring output directory exists: {output_dir}")
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory {output_dir}: {e}")
        return False

    # --- Read Files ---
    try:
        print("Reading source file...")
        src_lines = _read_stripped_lines(src_file, encoding)
        print("Reading target file...")
        tgt_lines = _read_stripped_lines(tgt_file, encoding)
    except (OSError, UnicodeError, LookupError) as e:
        print(f"Error reading input files: {e}")
        return False

    # --- Verify Line Counts ---
    if len(src_lines) != len(tgt_lines):
        print(f"Error: Line count mismatch! Source has {len(src_lines)}, Target has {len(tgt_lines)}")
        return False

    n_total = len(src_lines)
    if n_total == 0:
        print("Error: Input files are empty.")
        return False
    print(f"Total lines read: {n_total}")

    # --- Combine, Shuffle ---
    print("Combining and shuffling data...")
    combined = list(zip(src_lines, tgt_lines))
    # A dedicated generator gives the same permutation as seeding the global
    # one, without disturbing random state used elsewhere in the process.
    random.Random(seed).shuffle(combined)

    # --- Calculate Split Sizes ---
    sizes = _compute_split_sizes(n_total, train_ratio, val_ratio, test_ratio)
    if sizes is None:
        print("Error: Cannot allocate even one sample for training. Check ratios and total lines.")
        return False
    n_train, n_val, n_test = sizes
    print(f"Calculated sizes: Train={n_train}, Val={n_val}, Test={n_test}")

    # --- Slice Data ---
    splits = (
        ("train", combined[:n_train]),
        ("val", combined[n_train:n_train + n_val]),
        ("test", combined[n_train + n_val:]),
    )

    # --- Write Output Files ---
    print("\nWriting output files...")
    success = True
    for split_name, pairs in splits:
        src_out_path = output_dir / f"{output_prefix}_{split_name}_{src_suffix}.txt"
        tgt_out_path = output_dir / f"{output_prefix}_{split_name}_{tgt_suffix}.txt"
        # Keep going after a failure so the remaining splits are still attempted.
        if not _write_split_files(pairs, split_name, src_out_path, tgt_out_path, encoding):
            success = False

    if success:
        print(f"\n--- Data Split Successful (Files saved in '{output_dir}') ---")
        return True
    print("\n--- Data Split Failed ---")
    return False

# --- Main Execution Block (for Notebook) ---
if __name__ == "__main__":  # Runs when executed directly; skipped on import
    # Collect the configuration constants defined at the top into one
    # keyword mapping, then hand them to the splitter in a single call.
    split_settings = {
        "src_file": SRC_INPUT_FILE,
        "tgt_file": TGT_INPUT_FILE,
        "output_dir": OUTPUT_DIR,
        "output_prefix": OUTPUT_PREFIX,
        "train_ratio": TRAIN_RATIO,
        "val_ratio": VAL_RATIO,
        "test_ratio": TEST_RATIO,
        "seed": RANDOM_SEED,
        "encoding": ENCODING,
    }
    split_parallel_data(**split_settings)
    print("\nScript finished.")

--- Starting Data Split ---
Source Input: cleaned_parallel_phones.txt
Target Input: cleaned_parallel_hindi.txt
Output Directory: cleaned_data_split
Output Prefix: cleaned_parallel_phones
Ratios: Train=0.80, Val=0.10, Test=0.10
Ensuring output directory exists: cleaned_data_split
Reading source file...
Reading target file...
Total lines read: 9171
Combining and shuffling data...
Calculated sizes: Train=7337, Val=917, Test=917

Writing output files...
Writing 7337 lines for 'train' split...
  Source: cleaned_data_split\cleaned_parallel_phones_train_phones.txt
  Target: cleaned_data_split\cleaned_parallel_phones_train_hindi.txt
Writing 917 lines for 'val' split...
  Source: cleaned_data_split\cleaned_parallel_phones_val_phones.txt
  Target: cleaned_data_split\cleaned_parallel_phones_val_hindi.txt
Writing 917 lines for 'test' split...
  Source: cleaned_data_split\cleaned_parallel_phones_test_phones.txt
  Target: cleaned_data_split\cleaned_parallel_phones_test_hindi.txt

--- Data Split Succ