# 02 - Data Preprocessing

This notebook prepares the SoQG dataset for FLAN-T5 fine-tuning.

## Objectives
- Clean and deduplicate the dataset
- Format inputs with instruction-style prompts
- Tokenize with T5Tokenizer
- Load the provided train/val/test splits and remove any overlap between them
- Save processed datasets to disk

## Input Format for FLAN-T5
```
Generate a Socratic question: {question_type}: {context}
```

## Target Format
```
{socratic_question}
```

## 1. Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import T5Tokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm

# --- Reproducibility ---------------------------------------------------
SEED = 42
np.random.seed(SEED)

# --- Model / prompt settings -------------------------------------------
MODEL_NAME = "google/flan-t5-base"
TASK_PREFIX = "Generate a Socratic question"
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 128

# --- Filesystem layout (relative to the notebook's working directory) --
RAW_DIR = Path("../datasets/raw/soqg/data/soqg_dataset")
PROCESSED_DIR = Path("../datasets/processed")
# Create the output directory up front so later save cells cannot fail on it.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

## 2. Load Raw Data

In [None]:
# The training split is stored as three CSV chunks; read each one (first
# column is the index) and stack them into a single frame.
train_chunk_files = ["train_chunk_I.csv", "train_chunk_II.csv", "train_chunk_III.csv"]
train_I, train_II, train_III = [
    pd.read_csv(RAW_DIR / chunk_file, index_col=0) for chunk_file in train_chunk_files
]
train_df = pd.concat([train_I, train_II, train_III], axis=0, ignore_index=True)

# Validation and test each live in a single file.
val_df, test_df = [
    pd.read_csv(RAW_DIR / split_file, index_col=0) for split_file in ("val.csv", "test.csv")
]

print(f"Raw sizes - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

## 3. Text Cleaning Functions

In [None]:
def clean_text(text):
    """Return ``text`` as a whitespace-normalized string.

    Missing values (``None`` / NaN) become the empty string. Any other value
    is converted with ``str()``, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is trimmed.
    """
    if pd.isna(text):
        return ""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

def extract_parts(input_text):
    """Split ``"<type>: <context>"`` at the first colon.

    Returns a ``(question_type, context)`` tuple. The type is stripped and
    lower-cased, the context is stripped. When the text contains no colon the
    type is ``None`` and the text is returned unchanged as the context.
    """
    if ':' in input_text:
        # Only the first colon separates the type; later colons belong to the context.
        head, _, tail = input_text.partition(':')
        return head.strip().lower(), tail.strip()
    return None, input_text

def format_input(question_type, context):
    """Build the instruction-style prompt: ``<TASK_PREFIX>: <question_type>: <context>``."""
    typed_context = f"{question_type}: {context}"
    return f"{TASK_PREFIX}: {typed_context}"

# Question-type labels accepted by the cleaning pipeline; rows whose parsed
# type is not in this set are dropped.
VALID_TYPES = set((
    'clarification',
    'assumptions',
    'reasons_evidence',
    'implication_consequences',
    'alternate_viewpoints_perspectives',
))

## 4. Apply Cleaning Pipeline

In [None]:
def process_dataframe(df, name="dataset", min_context_words=10, min_target_words=3):
    """Apply the full cleaning pipeline to one split and return a new frame.

    Steps, in order:
      1. Whitespace-normalize ``input`` and ``target`` with ``clean_text``.
      2. Drop rows whose input or target is empty after cleaning.
      3. Split ``input`` into ``question_type`` / ``context`` with ``extract_parts``.
      4. Keep only rows whose type is in ``VALID_TYPES``.
      5. Drop rows with fewer than ``min_context_words`` context words or
         fewer than ``min_target_words`` target words (whitespace-delimited).
      6. Drop duplicate ``(context, target)`` pairs, keeping the first.
      7. Add ``formatted_input`` built by ``format_input``.

    Args:
        df: Frame with at least ``input`` and ``target`` columns. Not modified.
        name: Label used in the printed retention summary.
        min_context_words: Minimum number of words required in the context.
        min_target_words: Minimum number of words required in the target.

    Returns:
        The cleaned frame with the extra columns ``question_type``,
        ``context`` and ``formatted_input``. It may be empty.
    """
    df = df.copy()
    original_len = len(df)

    df['input'] = df['input'].apply(clean_text)
    df['target'] = df['target'].apply(clean_text)

    # Copy after filtering so the column assignments below write to an
    # independent frame rather than to a slice of the previous one.
    non_empty = (df['input'].str.len() > 0) & (df['target'].str.len() > 0)
    df = df[non_empty].copy()

    # Parse every input once into plain Python lists. Unlike building a
    # pd.Series per row this is fast and also works when no rows are left.
    parsed = [extract_parts(text) for text in df['input']]
    df['question_type'] = [q_type for q_type, _ in parsed]
    df['context'] = [context for _, context in parsed]

    df = df[df['question_type'].isin(VALID_TYPES)]

    df = df[df['context'].str.split().str.len() >= min_context_words]
    df = df[df['target'].str.split().str.len() >= min_target_words]

    # drop_duplicates returns a new frame, so adding a column afterwards is safe.
    df = df.drop_duplicates(subset=['context', 'target'])

    df['formatted_input'] = [
        format_input(q_type, context)
        for q_type, context in zip(df['question_type'], df['context'])
    ]

    # Guard the percentage against an empty input split.
    retained_pct = 100 * len(df) / original_len if original_len else 0.0
    print(f"{name}: {original_len} -> {len(df)} ({retained_pct:.1f}% retained)")
    return df

# Clean each split in turn (train, then val, then test); every call prints
# its own retention summary.
train_clean, val_clean, test_clean = [
    process_dataframe(raw_split, split_label)
    for raw_split, split_label in (
        (train_df, "Train"),
        (val_df, "Val"),
        (test_df, "Test"),
    )
]

## 5. Verify No Data Leakage

Critical: Ensure no overlap between train/val/test sets.

In [None]:
# Collect the distinct contexts of each split; leakage is measured on the
# context text, regardless of question type or target.
train_contexts = set(train_clean['context'])
val_contexts = set(val_clean['context'])
test_contexts = set(test_clean['context'])

train_val_overlap = len(train_contexts & val_contexts)
train_test_overlap = len(train_contexts & test_contexts)
val_test_overlap = len(val_contexts & test_contexts)

print(f"Train-Val overlap: {train_val_overlap}")
print(f"Train-Test overlap: {train_test_overlap}")
print(f"Val-Test overlap: {val_test_overlap}")

# Any pairwise overlap counts as leakage, including val/test.
if train_val_overlap > 0 or train_test_overlap > 0 or val_test_overlap > 0:
    print("\n⚠️ WARNING: Data leakage detected! Removing overlapping samples...")
    # Priority is train > val > test: train is left untouched, val loses
    # contexts seen in train, test loses contexts seen in train or in the
    # (already filtered) val split.
    val_clean = val_clean[~val_clean['context'].isin(train_contexts)]
    held_out_contexts = train_contexts | set(val_clean['context'])
    test_clean = test_clean[~test_clean['context'].isin(held_out_contexts)]
    print(f"After removal - Val: {len(val_clean)}, Test: {len(test_clean)}")

# Post-condition: the three splits must now be pairwise disjoint on context.
final_val_contexts = set(val_clean['context'])
final_test_contexts = set(test_clean['context'])
assert not (train_contexts & final_val_contexts), "train/val contexts still overlap"
assert not (train_contexts & final_test_contexts), "train/test contexts still overlap"
assert not (final_val_contexts & final_test_contexts), "val/test contexts still overlap"

## 6. Initialize Tokenizer

In [None]:
# Load the tokenizer that matches the checkpoint named in MODEL_NAME and
# report its basic limits.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print(
    f"Tokenizer vocab size: {tokenizer.vocab_size}",
    f"Model max length: {tokenizer.model_max_length}",
    sep="\n",
)

## 7. Analyze Token Lengths

Verify our max lengths (512 source, 128 target) are appropriate.

In [None]:
# Measure untruncated token lengths on (at most) 1000 random training rows.
sample = train_clean.sample(min(1000, len(train_clean)), random_state=SEED)


def _token_count(text):
    """Number of token ids the tokenizer produces for ``text`` without truncation."""
    return len(tokenizer(text, truncation=False)['input_ids'])


def _print_length_stats(title, lengths, limit):
    """Print summary statistics for ``lengths`` and the share exceeding ``limit``."""
    print(f"\n{title} token lengths:")
    if len(lengths) == 0:
        # np.percentile / np.max raise on empty input; report and move on.
        print("  (no samples)")
        return
    lengths = np.asarray(lengths)
    print(f"  Mean: {np.mean(lengths):.1f}")
    print(f"  Median: {np.median(lengths):.1f}")
    print(f"  95th percentile: {np.percentile(lengths, 95):.1f}")
    print(f"  Max: {np.max(lengths)}")
    print(f"  % over {limit}: {100*np.mean(lengths > limit):.1f}%")


input_lengths = []
target_lengths = []

# Iterate the two columns directly instead of building a Series per row.
for formatted_input, target in tqdm(
    zip(sample['formatted_input'], sample['target']),
    total=len(sample),
    desc="Tokenizing samples",
):
    input_lengths.append(_token_count(formatted_input))
    target_lengths.append(_token_count(target))

_print_length_stats("Input", input_lengths, MAX_SOURCE_LENGTH)
_print_length_stats("Target", target_lengths, MAX_TARGET_LENGTH)

## 8. Create Tokenization Function

In [None]:
def tokenize_function(examples, mask_label_padding=False):
    """Tokenize inputs and targets for Seq2Seq training.

    Args:
        examples: Mapping with ``formatted_input`` and ``target`` entries
            (a batch of strings when used with ``Dataset.map(batched=True)``).
        mask_label_padding: When True, padding positions in ``labels`` are
            replaced by ``-100``. Pass it through ``Dataset.map`` with
            ``fn_kwargs={"mask_label_padding": True}``. Defaults to False,
            which keeps the pad token id in the labels.

    Returns:
        The tokenizer output for the inputs (padded/truncated to
        ``MAX_SOURCE_LENGTH``) with an added ``labels`` entry holding the
        target token ids (padded/truncated to ``MAX_TARGET_LENGTH``).

    Note:
        Hugging Face seq2seq models only ignore label positions equal to
        ``-100`` when computing the loss. With the default setting the labels
        still contain the pad token id, so the training code must mask them
        itself (TODO: confirm the training notebook does so) or this function
        should be called with ``mask_label_padding=True``. Labels containing
        ``-100`` must have those entries filtered out before being passed to
        ``tokenizer.decode``.
    """
    model_inputs = tokenizer(
        examples['formatted_input'],
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        padding='max_length'
    )

    labels = tokenizer(
        examples['target'],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding='max_length'
    )

    label_ids = labels['input_ids']
    if mask_label_padding:
        pad_id = tokenizer.pad_token_id

        def _mask(sequence):
            # Swap padding ids for the loss-ignore index, keep real tokens.
            return [-100 if token_id == pad_id else token_id for token_id in sequence]

        # A batch yields a list of sequences; a single example yields one flat list.
        is_batch = len(label_ids) > 0 and isinstance(label_ids[0], (list, tuple))
        label_ids = [_mask(seq) for seq in label_ids] if is_batch else _mask(label_ids)

    model_inputs['labels'] = label_ids
    return model_inputs

## 9. Convert to HuggingFace Datasets

In [None]:
# Only these columns are carried into the HuggingFace datasets.
export_columns = ['formatted_input', 'target', 'question_type']

# Reset the index so from_pandas sees a plain 0..n-1 range.
train_data = train_clean.loc[:, export_columns].reset_index(drop=True)
val_data = val_clean.loc[:, export_columns].reset_index(drop=True)
test_data = test_clean.loc[:, export_columns].reset_index(drop=True)

train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(frame) for frame in (train_data, val_data, test_data)
]

print(
    f"Train: {len(train_dataset)}",
    f"Val: {len(val_dataset)}",
    f"Test: {len(test_dataset)}",
    sep="\n",
)

## 10. Apply Tokenization

In [None]:
# Raw text columns are dropped once they have been turned into token ids.
text_columns = ('formatted_input', 'target')

train_tokenized = train_dataset.map(
    tokenize_function,
    desc="Tokenizing train",
    batched=True,
    remove_columns=list(text_columns),
)
val_tokenized = val_dataset.map(
    tokenize_function,
    desc="Tokenizing val",
    batched=True,
    remove_columns=list(text_columns),
)
test_tokenized = test_dataset.map(
    tokenize_function,
    desc="Tokenizing test",
    batched=True,
    remove_columns=list(text_columns),
)

print(f"\nTokenized features: {train_tokenized.features}")

## 11. Verify Tokenization

In [None]:
# Round-trip one training example through the tokenizer for a visual check.
sample_idx = 0
tokenized_row = train_tokenized[sample_idx]
raw_row = train_data.iloc[sample_idx]

sample_input_ids = tokenized_row['input_ids']
sample_labels = tokenized_row['labels']

decoded_input = tokenizer.decode(sample_input_ids, skip_special_tokens=True)
decoded_target = tokenizer.decode(sample_labels, skip_special_tokens=True)

print("Original input:")
print(raw_row['formatted_input'][:200], "...")
print("\nDecoded input:")
print(decoded_input[:200], "...")
print("\nOriginal target:")
print(raw_row['target'])
print("\nDecoded target:")
print(decoded_target)

## 12. Save Processed Datasets

In [None]:
# Bundle the three tokenized splits and persist them in one directory.
tokenized_dir = PROCESSED_DIR / "soqg_tokenized"

tokenized_splits = {
    'train': train_tokenized,
    'validation': val_tokenized,
    'test': test_tokenized,
}
dataset_dict = DatasetDict(tokenized_splits)

dataset_dict.save_to_disk(tokenized_dir)
print(f"Saved tokenized dataset to {tokenized_dir}")

In [None]:
# Write each cleaned split next to the tokenized dataset, without the index.
clean_frames = {
    "train_clean.csv": train_clean,
    "val_clean.csv": val_clean,
    "test_clean.csv": test_clean,
}
for csv_name, clean_frame in clean_frames.items():
    clean_frame.to_csv(PROCESSED_DIR / csv_name, index=False)
print("Saved clean CSVs for reference.")

## 13. Save Question Type Distribution

In [None]:
# Count training rows per question type and store the counts as CSV.
type_dist = train_clean['question_type'].value_counts()
type_dist_path = PROCESSED_DIR / "question_type_distribution.csv"
type_dist.to_csv(type_dist_path)
print("Question type distribution:", type_dist, sep="\n")

## 14. Generate Preprocessing Report

In [None]:
def _retention_pct(clean, raw):
    """Format ``len(clean) / len(raw)`` as a percentage, or 'n/a' for an empty raw split."""
    if len(raw) == 0:
        return "n/a"
    return f"{100*len(clean)/len(raw):.1f}%"


# Series.to_markdown needs the optional `tabulate` package; fall back to a
# plain-text table so the report is still produced without it.
try:
    type_dist_table = type_dist.to_markdown()
except ImportError:
    type_dist_table = type_dist.to_string()

report = f"""
# Preprocessing Report

## Configuration
- Model: {MODEL_NAME}
- Max source length: {MAX_SOURCE_LENGTH}
- Max target length: {MAX_TARGET_LENGTH}
- Random seed: {SEED}

## Dataset Sizes
| Split | Raw | Clean | Retention |
|-------|-----|-------|----------|
| Train | {len(train_df)} | {len(train_clean)} | {_retention_pct(train_clean, train_df)} |
| Val | {len(val_df)} | {len(val_clean)} | {_retention_pct(val_clean, val_df)} |
| Test | {len(test_df)} | {len(test_clean)} | {_retention_pct(test_clean, test_df)} |

## Question Type Distribution (Train)
{type_dist_table}

## Cleaning Steps Applied
1. Removed empty inputs/targets
2. Filtered invalid question types
3. Removed contexts < 10 words
4. Removed questions < 3 words
5. Deduplicated by (context, target) pair
6. Verified no train/test overlap

## Output Files
- `soqg_tokenized/`: HuggingFace Dataset (ready for training)
- `train_clean.csv`, `val_clean.csv`, `test_clean.csv`: Clean CSVs
- `question_type_distribution.csv`: Type counts
"""

# Explicit encoding so the file is written identically on every platform.
with open(PROCESSED_DIR / "preprocessing_report.md", "w", encoding="utf-8") as f:
    f.write(report)

print(report)

## 15. Next Steps

The preprocessed data is ready for training. Proceed to:
1. **03_training.ipynb** - Fine-tune FLAN-T5 on the tokenized dataset
2. Use `datasets.load_from_disk("../datasets/processed/soqg_tokenized")` to load