# 02 — Data Preprocessing

FLAN-T5 (Wei et al., 2022) is fine-tuned using instruction-style prompts that state the task in natural language. Each training example is cast into the format:

```
Input:  "Generate a Socratic question for this context: {question_type}: {context}"
Target: "[Question] {question text}"
```

The question-type prefix (e.g., `reasons_evidence:`) is carried verbatim from the SocratiQ input column and acts as a conditioning signal for the decoder. A custom `[Question]` token is added to the tokenizer vocabulary to serve as a decoder start cue, following Ang et al. (2023, §3.1).

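No padding is applied at this stage: sequences are stored at their natural (truncated) length. At training time, `DataCollatorForSeq2Seq` pads each batch dynamically and sets the padded label positions to -100 so the cross-entropy loss ignores them.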

**Important:** All T5-family models share the same SentencePiece vocabulary, so this tokenized dataset is used by all three training configurations (FLAN-T5-small, FLAN-T5-base, T5-base).

In [None]:
import os

from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

# Project root on Drive; created on first run, left untouched if it already exists.
DRIVE_ROOT = "/content/drive/MyDrive/socratic-path"
os.makedirs(DRIVE_ROOT, exist_ok=True)
print(f"Google Drive mounted at: {DRIVE_ROOT}")

In [None]:
# Install the libraries this notebook depends on (-q keeps pip's output quiet).
%pip install -q transformers datasets sentencepiece

In [None]:
import re
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

In [None]:
# Destination on Drive for the cloned SocratiQ repository.
clone_target = Path(DRIVE_ROOT) / "datasets/raw/soqg"

# The repository's `data/` folder is used as the marker that a clone already exists.
if (clone_target / "data").exists():
    print(f"Dataset already present at {clone_target}")
else:
    print(f"Cloning SocratiQ dataset to {clone_target} ...")
    result = subprocess.run(
        ["git", "clone", "https://github.com/NUS-IDS/eacl23_soqg.git", str(clone_target)],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Stop here: every later cell reads files from this clone, so continuing
        # would only fail further down with a less informative error.
        raise RuntimeError(f"Clone error: {result.stderr}")
    print("Clone complete.")

In [None]:
# Input and output locations on Drive.
DATA_DIR = Path(DRIVE_ROOT, "datasets/raw/soqg/data/soqg_dataset")
OUTPUT_DIR = Path(DRIVE_ROOT, "datasets/processed")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Tokenizer checkpoint and the truncation limits (in tokens) applied when tokenizing.
TOKENIZER_NAME = "google/flan-t5-base"
MAX_SOURCE_LENGTH = 400
MAX_TARGET_LENGTH = 80

# Text prepended to every model input and every target, respectively.
INSTRUCTION_PREFIX = "Generate a Socratic question for this context: "
TARGET_PREFIX = "[Question] "

# Seed used wherever this notebook samples data.
RANDOM_SEED = 42

print(f"Data directory:   {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

In [None]:
# The training split is stored as three CSV chunks that are concatenated below.
train_files = ["train_chunk_I.csv", "train_chunk_II.csv", "train_chunk_III.csv"]

# Report missing chunks explicitly rather than silently building a partial training set.
missing_train_files = [f for f in train_files if not (DATA_DIR / f).exists()]
if len(missing_train_files) == len(train_files):
    raise FileNotFoundError(
        f"No training chunks found in {DATA_DIR}; expected: {train_files}"
    )
if missing_train_files:
    print(f"WARNING: skipping missing training chunks: {missing_train_files}")

# index_col=0 makes pandas read the first CSV column as the row index, not as data.
train_chunks = [
    pd.read_csv(DATA_DIR / f, index_col=0)
    for f in train_files
    if f not in missing_train_files
]
train_df = pd.concat(train_chunks, axis=0, ignore_index=True)

valid_df = pd.read_csv(DATA_DIR / "valid.csv", index_col=0)
test_df  = pd.read_csv(DATA_DIR / "test.csv",  index_col=0)

print(f"Raw sizes -> Train: {len(train_df)}, Valid: {len(valid_df)}, Test: {len(test_df)}")

In [None]:
def clean_text(text):
    """Normalise whitespace in a single text value.

    Every run of whitespace is collapsed to one space and leading/trailing
    whitespace is removed. Any value that is not a ``str`` yields ``""``.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text).strip()
    return ""

# Normalise both text columns of every split; the frames are modified in place.
for split_name, df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
    for text_column in ('input', 'target'):
        df[text_column] = df[text_column].apply(clean_text)
    print(f"Cleaned {split_name}: {len(df)} rows")

In [None]:
def filter_empty_rows(df, split_name):
    """Drop too-short and duplicated examples from one split.

    A row is kept only if its ``input`` is longer than 10 characters and its
    ``target`` is longer than 5 characters. Exact ``(input, target)``
    duplicates are then removed and the index is renumbered from zero.
    A before/after summary labelled with ``split_name`` is printed.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    rows_before = len(df)

    long_enough = (df['input'].str.len() > 10) & (df['target'].str.len() > 5)
    kept = (
        df[long_enough]
        .drop_duplicates(subset=['input', 'target'])
        .reset_index(drop=True)
    )

    rows_after = len(kept)
    print(f"{split_name}: {rows_before} -> {rows_after} ({rows_before - rows_after} removed)")
    return kept

# Filter the three splits in train/valid/test order, rebinding each name to its filtered frame.
train_df, valid_df, test_df = (
    filter_empty_rows(split_frame, split_label)
    for split_frame, split_label in (
        (train_df, "Train"),
        (valid_df, "Valid"),
        (test_df, "Test"),
    )
)

## Input Formatting

Each example is reformatted as an instruction prompt. `INSTRUCTION_PREFIX` is prepended to every input as the task description, and `TARGET_PREFIX` places the `[Question]` sentinel token at the start of every target sequence, matching the training convention in Ang et al. (2023).

In [None]:
def format_for_flan_t5(row):
    """Build the prompt/target pair for one example.

    ``row`` must provide ``'input'`` and ``'target'`` entries. The returned
    Series holds the prefixed texts (``input_text``, ``target_text``) together
    with the unmodified originals (``original_input``, ``original_target``).
    """
    raw_input, raw_target = row['input'], row['target']
    fields = {
        'input_text': INSTRUCTION_PREFIX + raw_input,
        'target_text': TARGET_PREFIX + raw_target,
        'original_input': raw_input,
        'original_target': raw_target,
    }
    return pd.Series(fields)

# Apply the row formatter to each split (axis=1 passes one row at a time).
train_formatted, valid_formatted, test_formatted = (
    split_frame.apply(format_for_flan_t5, axis=1)
    for split_frame in (train_df, valid_df, test_df)
)

print(f"Formatted -> Train: {len(train_formatted)}, Valid: {len(valid_formatted)}, Test: {len(test_formatted)}")

In [None]:
# Only the two formatted text columns are carried into the HuggingFace datasets.
text_columns = ['input_text', 'target_text']

train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(formatted_frame[text_columns])
    for formatted_frame in (train_formatted, valid_formatted, test_formatted)
)

print('HuggingFace datasets created:')
print(f'  Train: {len(train_dataset):,}')
print(f'  Valid: {len(valid_dataset):,}')
print(f'  Test:  {len(test_dataset):,}')

## Tokenizer Setup

The `[Question]` token is added to the tokenizer vocabulary (extending it from 32,100 to 32,101 tokens). All T5-family models (FLAN-T5-small, FLAN-T5-base, T5-base) share the same SentencePiece vocabulary, so this single tokenizer is reused across all training configurations.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Register the target sentinel as an additional vocabulary entry.
question_token = "[Question]"
tokenizer.add_tokens([question_token])
question_token_id = tokenizer.convert_tokens_to_ids(question_token)

print(f"Vocabulary size: {len(tokenizer)}")
print(f"[Question] token ID: {question_token_id}")

In [None]:
def count_tokens(text):
    """Return how many token ids the tokenizer produces for ``text``, special tokens included."""
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    return len(token_ids)

# Measure token lengths on a seeded random sample of at most 5000 training rows.
sample_size = min(5000, len(train_formatted))
sample_df = train_formatted.sample(sample_size, random_state=RANDOM_SEED).copy()

for text_column, count_column in (
    ('input_text', 'input_tokens'),
    ('target_text', 'target_tokens'),
):
    sample_df[count_column] = sample_df[text_column].apply(count_tokens)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One histogram per column: (column, bin count, bar colour, truncation limit, title).
histogram_specs = (
    ('input_tokens', 50, 'steelblue', MAX_SOURCE_LENGTH, 'Input Token Distribution'),
    ('target_tokens', 30, 'coral', MAX_TARGET_LENGTH, 'Target Token Distribution'),
)
for ax, (count_column, n_bins, bar_colour, limit, title) in zip(axes, histogram_specs):
    ax.hist(sample_df[count_column], bins=n_bins, color=bar_colour, alpha=0.7, edgecolor='black')
    # The dashed red line marks the truncation limit used when tokenizing.
    ax.axvline(limit, color='red', linestyle='--', linewidth=2, label=f'Max: {limit}')
    ax.set_title(title)
    ax.set_xlabel('Token Count')
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

# Number of sampled examples whose token count exceeds each limit.
truncated_inputs = (sample_df['input_tokens'] > MAX_SOURCE_LENGTH).sum()
truncated_targets = (sample_df['target_tokens'] > MAX_TARGET_LENGTH).sum()
print(f"\nInputs exceeding {MAX_SOURCE_LENGTH} tokens: {truncated_inputs} ({truncated_inputs/sample_size*100:.1f}%)")
print(f"Targets exceeding {MAX_TARGET_LENGTH} tokens: {truncated_targets} ({truncated_targets/sample_size*100:.1f}%)")

In [None]:
# Label value that the loss is meant to ignore at padded positions.
# NOTE(review): nothing in the visible notebook code references this constant;
# sequences are tokenized without padding below.
LABEL_PAD_TOKEN_ID = -100


def tokenize_function(examples):
    """Tokenize a batch of formatted examples.

    ``examples`` provides ``"input_text"`` and ``"target_text"``. Inputs are
    truncated to ``MAX_SOURCE_LENGTH`` tokens and targets to
    ``MAX_TARGET_LENGTH`` tokens; no padding is applied. The target token ids
    are attached to the input encoding under ``"labels"``, and that encoding
    is returned.
    """
    shared_options = dict(truncation=True, padding=False)

    encoded_sources = tokenizer(
        examples["input_text"],
        max_length=MAX_SOURCE_LENGTH,
        **shared_options,
    )
    encoded_targets = tokenizer(
        examples["target_text"],
        max_length=MAX_TARGET_LENGTH,
        **shared_options,
    )

    encoded_sources["labels"] = encoded_targets["input_ids"]
    return encoded_sources

In [None]:
def _tokenize_split(dataset, split_label):
    """Tokenize one split in batches, dropping the raw text columns from the result."""
    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['input_text', 'target_text'],
        desc=f"Tokenizing {split_label}",
    )


train_tokenized = _tokenize_split(train_dataset, "train")
valid_tokenized = _tokenize_split(valid_dataset, "valid")
test_tokenized = _tokenize_split(test_dataset, "test")

print("Tokenized dataset features:")
print(train_tokenized.features)

In [None]:
dataset_path = OUTPUT_DIR / "soqg_tokenized"

# Bundle the three tokenized splits under their split names and write them to disk.
split_names = ('train', 'validation', 'test')
split_datasets = (train_tokenized, valid_tokenized, test_tokenized)
tokenized_dataset = DatasetDict(dict(zip(split_names, split_datasets)))

tokenized_dataset.save_to_disk(str(dataset_path))
print(f"Tokenized dataset saved to: {dataset_path}")

In [None]:
# Write the formatted text frames as one Parquet file per split.
parquet_outputs = (
    (train_formatted, "train_formatted.parquet"),
    (valid_formatted, "valid_formatted.parquet"),
    (test_formatted, "test_formatted.parquet"),
)
for formatted_frame, parquet_name in parquet_outputs:
    formatted_frame.to_parquet(OUTPUT_DIR / parquet_name)
print("Saved formatted DataFrames as Parquet files.")

# Save the tokenizer, including the added [Question] token, next to the data.
tokenizer_dir = OUTPUT_DIR / "tokenizer"
tokenizer.save_pretrained(tokenizer_dir)
print(f"Saved tokenizer with [Question] token to: {tokenizer_dir}")

In [None]:
from datasets import load_from_disk

# Round-trip check: reload the saved dataset and inspect its first training example.
loaded_dataset = load_from_disk(str(dataset_path))
print("Loaded dataset splits:", loaded_dataset)

sample = loaded_dataset['train'][0]
sample_input_ids = sample['input_ids']
sample_labels = sample['labels']
print(f"\nSample input_ids length: {len(sample_input_ids)}")
print(f"Sample labels length:    {len(sample_labels)}")

# Decode both id sequences back to text to confirm the formatting survived.
decoded_input, decoded_target = (
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in (sample_input_ids, sample_labels)
)

print("\nDecoded input:")
print(decoded_input[:200] + "...")
print("\nDecoded target:")
print(decoded_target)

## Summary

The preprocessing pipeline produces four artefacts:

1. **Tokenized `DatasetDict`** (`soqg_tokenized/`) — variable-length sequences (no padding), ready for training with `DataCollatorForSeq2Seq` which handles dynamic padding at batch time.
2. **Formatted Parquet files** — human-readable text for inspection and evaluation scripts.
3. **Extended tokenizer** (vocab size 32,101 including `[Question]`) — shared across all model configurations.
4. **Data quality report** — token-length histograms and truncation statistics, displayed inline in this notebook (not written to disk).