# Notebook 1: Data Augmentation & Preparation
Purpose:
1. Load AG News training data.
2. Preprocess (tokenize) original data, keeping original text.
3. **Run Cleanlab** to identify likely mislabeled examples in the training set and drop them.
4. **Save the cleaned, tokenized ORIGINAL data.**
5. Apply efficient, batched Contextual Augmentation to the CLEANED data.
6. Tokenize the augmented text.
7. Cast labels to ClassLabel type.
8. **Save the tokenized AUGMENTED (from cleaned) dataset.**

In [2]:
# Start from an empty output directory: recursively and unconditionally delete
# everything the shell glob matches under /kaggle/working. Destructive -- any
# artifacts a previous run left there are lost.
!rm -rf /kaggle/working/*

In [3]:
import os
import time
import random
import torch
from datasets import load_dataset, Dataset, concatenate_datasets, ClassLabel, load_from_disk
from transformers import RobertaTokenizer, pipeline
import traceback
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import cleanlab
import numpy as np
import gc

In [4]:
# --- Configuration ---
# Everything a reader might want to tune lives in this cell.

# Seed for Python's `random` module. The contextual augmentation step chooses the
# words to mask with `random`, so without a fixed seed every run would write a
# different augmented dataset. Re-run this cell before re-running the
# augmentation cell if you need to reproduce the same masks.
SEED = 42
random.seed(SEED)

base_model_name = 'roberta-base'  # checkpoint used for the fill-mask pipeline and the tokenizer
dataset_name = 'ag_news'
train_split = 'train'

# Output path within Kaggle's writable directory
processed_data_dir = "/kaggle/working/processed_data"
# Two separate save locations: one for the cleaned originals, one for the
# augmented examples generated from them.
cleaned_original_save_path = os.path.join(processed_data_dir, "cleaned_tokenized_original_agnews")
tokenized_augmented_save_path = os.path.join(processed_data_dir, "tokenized_augmented_from_cleaned_agnews")

# Create directory if it doesn't exist
os.makedirs(processed_data_dir, exist_ok=True)

# Augmentation settings
AUGMENTATION_MASK_PROBABILITY = 0.2  # per-word chance of being selected for masking
AUGMENTATION_MAP_BATCH_SIZE = 256    # rows handed to the augmentation function per Dataset.map batch
AUGMENTATION_TOP_K = 1               # forwarded as top_k to the fill-mask pipeline

# Tokenization settings
TOKENIZER_MAX_LENGTH = 512  # max_length passed to the tokenizer together with truncation=True

In [5]:
# --- GPU Check and Pipeline Initialization ---
# The pipeline takes a device index: 0 selects the first CUDA GPU, -1 the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
if pipeline_device < 0:
    print("WARNING: CUDA (GPU) is not available. Pipeline will run on CPU (will be slow).")
else:
    print(f"INFO: CUDA available. Initializing pipeline on device {pipeline_device}.")
    print(f"INFO: GPU Name: {torch.cuda.get_device_name(0)}")

# Build the fill-mask pipeline; model and tokenizer both come from base_model_name.
# mask_token is kept as a global because the augmentation function reads it.
try:
    fill_mask_pipeline = pipeline(
        task="fill-mask",
        model=base_model_name,
        tokenizer=base_model_name,
        device=pipeline_device,
    )
    print(f"INFO: Initialized fill-mask pipeline on device: {fill_mask_pipeline.device}")
    mask_token = fill_mask_pipeline.tokenizer.mask_token
    print(f"INFO: Mask token: {mask_token}")
except Exception as init_error:
    # Report, then re-raise: nothing downstream can run without the pipeline.
    print(f"ERROR: Failed to initialize fill-mask pipeline: {init_error}")
    raise init_error

INFO: CUDA available. Initializing pipeline on device 0.
INFO: GPU Name: Tesla P100-PCIE-16GB


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


INFO: Initialized fill-mask pipeline on device: cuda:0
INFO: Mask token: <mask>


In [6]:
# --- Load Tokenizer ---
# The tokenizer is loaded from the same checkpoint name as the fill-mask pipeline.
print(f"INFO: Loading tokenizer: {base_model_name}")
try:
    tokenizer = RobertaTokenizer.from_pretrained(base_model_name)
except Exception as tokenizer_error:
    # Surface the checkpoint name in the log before propagating the failure.
    print(f"ERROR: Failed to load tokenizer {base_model_name}: {tokenizer_error}")
    raise tokenizer_error

INFO: Loading tokenizer: roberta-base


In [7]:
# --- Load Original Dataset ---
# Loads the configured split and logs its size and feature schema.
print(f"INFO: Loading dataset '{dataset_name}', split '{train_split}'...")
try:
    original_dataset = load_dataset(dataset_name, split=train_split)
    print(f"INFO: Original dataset loaded with {len(original_dataset)} examples.")
    print(f"INFO: Original features: {original_dataset.features}")
except Exception as dataset_error:
    # Log and propagate; every later cell depends on original_dataset.
    print(f"ERROR: Failed to load dataset: {dataset_error}")
    raise dataset_error

INFO: Loading dataset 'ag_news', split 'train'...


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

INFO: Original dataset loaded with 120000 examples.
INFO: Original features: {'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}


In [8]:
# --- Initial Preprocessing (Tokenize + Keep Original Text) ---
def preprocess_initial(examples):
    """Tokenize a batch of examples and carry the raw text along.

    Args:
        examples: batch mapping with a 'text' entry holding the raw strings.

    Returns:
        The tokenizer's output for those strings (truncated to
        TOKENIZER_MAX_LENGTH, not padded) with an extra 'orig_text' entry
        holding the untouched input strings.
    """
    raw_texts = examples['text']
    encoded = tokenizer(
        raw_texts,
        truncation=True,
        padding=False,
        max_length=TOKENIZER_MAX_LENGTH,
    )
    # Keep the untokenized text so later steps can work on the raw strings.
    encoded['orig_text'] = raw_texts
    return encoded

print("INFO: Starting initial tokenization...")
# os.cpu_count() is documented to return None when the count cannot be
# determined; fall back to a single core instead of failing on `None - 2`.
num_cpus = os.cpu_count() or 1
# Leave two cores free when possible, but always use at least one worker.
# num_proc_initial is reused later as n_jobs for the cross-validation step.
num_proc_initial = max(1, num_cpus - 2)
try:
    # This dataset contains all original examples, tokenized + orig_text
    train_dataset_processed = original_dataset.map(
        preprocess_initial, batched=True, num_proc=num_proc_initial, remove_columns=['text']
    )
    # Rename 'label' to 'labels' so the column name matches what later cells read.
    if 'label' in train_dataset_processed.column_names:
        train_dataset_processed = train_dataset_processed.rename_column("label", "labels")
    print("INFO: Initial tokenization complete.")
    print(f"INFO: Processed train dataset columns: {train_dataset_processed.column_names}")
except Exception as e:
    print(f"ERROR: Error during initial preprocessing: {e}")
    raise e

try:
    # Store feature info from the full processed dataset before cleaning; it is
    # used later to cast the 'labels' column of the saved datasets.
    original_labels_feature = train_dataset_processed.features['labels']
    if not isinstance(original_labels_feature, ClassLabel):
        print(f"WARNING: Original 'labels' feature is not ClassLabel: {original_labels_feature}.")
    else:
        print(f"INFO: Stored original ClassLabel features: {original_labels_feature}")
except KeyError:
    print("ERROR: 'labels' column not found after preprocessing.")
    raise KeyError("'labels' column missing.")

INFO: Starting initial tokenization...


Map (num_proc=2):   0%|          | 0/120000 [00:00<?, ? examples/s]

INFO: Initial tokenization complete.
INFO: Processed train dataset columns: ['labels', 'input_ids', 'attention_mask', 'orig_text']
INFO: Stored original ClassLabel features: ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)


In [9]:
# --- Cleanlab: Generate Embeddings ---
# Pull the raw texts and labels out of the processed dataset, then embed the
# texts with a sentence-transformer model.
print("INFO: Preparing data for Cleanlab...")
try:
    train_texts = train_dataset_processed["orig_text"]
    train_labels_np = np.array(train_dataset_processed["labels"])
    print(f"INFO: Using {len(train_texts)} training texts for cleanlab analysis.")
except KeyError as missing_column:
    print(f"ERROR: Missing 'orig_text' or 'labels' column: {missing_column}")
    raise

print("INFO: Loading Sentence Transformer model for embeddings...")
try:
    # Encode on the GPU when one is present, otherwise on the CPU.
    if torch.cuda.is_available():
        sbert_device = 'cuda'
    else:
        sbert_device = 'cpu'
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=sbert_device)
    print(f"INFO: Encoding training texts on {sbert_device}...")
    train_embeddings = sbert_model.encode(
        train_texts,
        batch_size=128,
        show_progress_bar=True,
    )
    print(f"INFO: Generated embeddings of shape: {train_embeddings.shape}")
except Exception as embedding_error:
    print(f"ERROR: Failed to generate sentence embeddings: {embedding_error}")
    raise

INFO: Preparing data for Cleanlab...
INFO: Using 120000 training texts for cleanlab analysis.
INFO: Loading Sentence Transformer model for embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO: Encoding training texts on cuda...


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

INFO: Generated embeddings of shape: (120000, 384)


In [10]:
# --- Cleanlab: Get Out-of-Sample Predicted Probabilities (on CPU) ---
# Each example's class probabilities come from a logistic-regression model
# fitted on the other stratified folds of the embeddings.
print("INFO: Getting out-of-sample predicted probabilities via cross-validation...")
try:
    num_cv_folds = 5
    classifier = LogisticRegression(solver='liblinear', random_state=42)
    pred_probs = cross_val_predict(
        estimator=classifier,
        X=train_embeddings,
        y=train_labels_np,
        cv=StratifiedKFold(n_splits=num_cv_folds, shuffle=True, random_state=42),
        n_jobs=num_proc_initial,
        method='predict_proba',
    )
    print(f"INFO: Generated out-of-sample probabilities of shape: {pred_probs.shape}")
except Exception as cv_error:
    print(f"ERROR: Failed during cross-validation prediction: {cv_error}")
    raise

INFO: Getting out-of-sample predicted probabilities via cross-validation...
INFO: Generated out-of-sample probabilities of shape: (120000, 4)


In [11]:
# --- Cleanlab: Find Label Issues ---
# Ask cleanlab for the indices of likely label errors and keep every other row.
# If cleanlab fails, the notebook deliberately continues with the full dataset.
print("INFO: Running cleanlab to find label issues...")
all_indices = np.arange(len(train_labels_np))
try:
    # With return_indices_ranked_by set, the result is an array of row indices.
    label_issues_info = cleanlab.filter.find_label_issues(
        labels=train_labels_np, pred_probs=pred_probs, return_indices_ranked_by='self_confidence'
    )
    num_issues_found = len(label_issues_info)
    print(f"INFO: Cleanlab found {num_issues_found} potential label issues.")
    if num_issues_found > 0:
        print(f"INFO: Example issue indices: {label_issues_info[:10]}...")
    issue_indices = label_issues_info
    indices_to_keep = np.setdiff1d(all_indices, issue_indices, assume_unique=True)
    print(f"INFO: Identified {len(indices_to_keep)} examples with likely clean labels.")
except Exception as e:
    print(f"ERROR: Failed during cleanlab issue finding: {e}")
    # Show where it failed; the message alone is rarely enough to debug this.
    traceback.print_exc()
    print("WARNING: Proceeding without cleaning due to error.")
    # Bind every name the success path binds. The cleanup cell deletes
    # label_issues_info by name, so leaving it undefined here would turn this
    # best-effort fallback into a NameError one cell later.
    label_issues_info = np.array([], dtype=int)
    issue_indices = label_issues_info
    indices_to_keep = all_indices

INFO: Running cleanlab to find label issues...
INFO: Cleanlab found 5168 potential label issues.
INFO: Example issue indices: [ 67035  66633 107558  19150  63316   4307  96061 101562  44984  47030]...
INFO: Identified 114832 examples with likely clean labels.


In [12]:
# --- Filter Dataset ---
print(f"INFO: Creating cleaned dataset by selecting {len(indices_to_keep)} examples...")
# This dataset contains the CLEANED ORIGINAL examples, tokenized, with orig_text
cleaned_train_dataset_processed = train_dataset_processed.select(indices_to_keep)
print(f"INFO: Cleaned training dataset size: {len(cleaned_train_dataset_processed)}")

# Clean up memory from cleanlab step.
# sbert_model and classifier are released too: neither is referenced again in
# this notebook, and torch.cuda.empty_cache() can only hand back memory that no
# live object still holds, so the embedding model must be dropped first.
# globals().pop(..., None) is used instead of `del` so that a name that was never
# bound (e.g. after an earlier cell failed part-way) does not raise NameError.
for _stale_name in (
    "train_embeddings", "pred_probs", "label_issues_info",
    "train_labels_np", "train_texts", "sbert_model", "classifier",
):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

INFO: Creating cleaned dataset by selecting 114832 examples...
INFO: Cleaned training dataset size: 114832


In [13]:
# Persist the cleaned originals with only the model-ready columns.
print(f"INFO: Saving CLEANED, TOKENIZED ORIGINAL data to {cleaned_original_save_path}...")
try:
    required_columns_save = ['input_ids', 'attention_mask', 'labels']
    # Refuse to save if any of the expected columns is absent.
    if any(col not in cleaned_train_dataset_processed.column_names for col in required_columns_save):
        raise KeyError(f"Cleaned dataset missing columns for saving. Found: {cleaned_train_dataset_processed.column_names}")

    # Drop everything else (e.g. orig_text), then make sure 'labels' carries the
    # label feature captured from the full dataset before saving.
    cleaned_to_save = (
        cleaned_train_dataset_processed
        .select_columns(required_columns_save)
        .cast_column('labels', original_labels_feature)
    )

    cleaned_to_save.save_to_disk(cleaned_original_save_path)
    print("INFO: Cleaned, tokenized original data saved successfully.")
except Exception as save_error:
    print(f"ERROR: Error saving cleaned original data: {save_error}")
    traceback.print_exc()
    raise save_error

INFO: Saving CLEANED, TOKENIZED ORIGINAL data to /kaggle/working/processed_data/cleaned_tokenized_original_agnews...


Casting the dataset:   0%|          | 0/114832 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/114832 [00:00<?, ? examples/s]

INFO: Cleaned, tokenized original data saved successfully.


In [14]:
# --- Define Optimized Batched Contextual Augmentation Function ---
# Upper bound on how many words are masked in a single text.
AUGMENTATION_MAX_MASKS_PER_TEXT = 5
# Random picks tried when the probabilistic pass masked nothing.
AUGMENTATION_FALLBACK_ATTEMPTS = 5
# Masked texts per forward pass inside the pipeline. Without an explicit
# batch_size the pipeline processes the list one text at a time.
AUGMENTATION_PIPELINE_BATCH_SIZE = 32


def _is_maskable(word):
    """Return True for words longer than one character that do not start with '<'."""
    return len(word) > 1 and not word.startswith('<')


def _choose_mask_positions(words):
    """Pick which word positions of one text to mask; returns a sorted list (may be empty)."""
    positions = []
    candidate_order = list(range(len(words)))
    random.shuffle(candidate_order)
    for idx in candidate_order:
        # Each word is selected with AUGMENTATION_MASK_PROBABILITY, up to the cap.
        if random.random() < AUGMENTATION_MASK_PROBABILITY and len(positions) < AUGMENTATION_MAX_MASKS_PER_TEXT:
            if _is_maskable(words[idx]):
                positions.append(idx)
    if not positions:
        # Nothing was selected: try a few random words so most texts get one mask.
        for _ in range(AUGMENTATION_FALLBACK_ATTEMPTS):
            idx = random.randint(0, len(words) - 1)
            if _is_maskable(words[idx]):
                positions.append(idx)
                break
    return sorted(positions)


def _nesting_depth(obj):
    """Count how many non-empty list levels wrap the first leaf of obj."""
    depth = 0
    while isinstance(obj, list) and obj:
        depth += 1
        obj = obj[0]
    return depth


def _per_text_results(raw_results, mask_counts):
    """Return the pipeline output as one entry per masked text.

    When the fill-mask pipeline is given a list holding a single text it returns
    that text's result directly instead of a one-element list. Detect that shape
    (by nesting depth, so an already wrapped result is left alone) and re-wrap it.
    """
    if len(mask_counts) == 1:
        # One mask -> flat list of candidates; several masks -> one list per mask.
        unwrapped_depth = 1 if mask_counts[0] == 1 else 2
        if _nesting_depth(raw_results) == unwrapped_depth:
            return [raw_results]
    return raw_results


def _top_tokens(predictions, n_masks):
    """Return the best replacement string for each mask, or None if the shape is unexpected."""
    if not isinstance(predictions, list) or not predictions:
        return None
    if isinstance(predictions[0], dict):
        groups = [predictions]  # single mask: a flat list of candidate dicts
    elif isinstance(predictions[0], list):
        groups = predictions    # several masks: one candidate list per mask
    else:
        return None
    if len(groups) != n_masks:
        return None
    tokens = []
    for group in groups:
        best = group[0] if group else None
        token = (best.get('token_str') or "") if isinstance(best, dict) else ""
        tokens.append(token.strip())
    return tokens


def augment_contextual_batch_batched(examples):
    """Create one contextually augmented text per input text.

    Randomly chosen words are replaced by the mask token, all masked texts of the
    batch go through the fill-mask pipeline in one call, and each mask is replaced
    by the pipeline's top prediction. Successfully augmented texts are rebuilt by
    joining their words with single spaces.

    Args:
        examples: batch mapping with an 'orig_text' entry (list of strings).

    Returns:
        {"text_augmented": [...]} with one string per input. A text is returned
        unchanged when it has no maskable word, when its predictions have an
        unexpected shape, or (for the whole batch) when the pipeline call fails.
    """
    input_texts = examples["orig_text"]
    if len(input_texts) == 0:
        return {"text_augmented": []}

    # 1. Prepare masked inputs: (row index, words, sorted mask positions).
    jobs = []
    for row, text in enumerate(input_texts):
        words = text.split()
        if not words:
            continue
        positions = _choose_mask_positions(words)
        if positions:
            jobs.append((row, words, positions))
    if not jobs:
        return {"text_augmented": input_texts}

    masked_texts = []
    for _, words, positions in jobs:
        masked_words = list(words)
        for pos in positions:
            masked_words[pos] = mask_token
        masked_texts.append(" ".join(masked_words))

    # 2. Single pipeline call for the whole batch.
    try:
        raw_results = fill_mask_pipeline(
            masked_texts, top_k=AUGMENTATION_TOP_K, batch_size=AUGMENTATION_PIPELINE_BATCH_SIZE
        )
    except Exception as e:
        print(f"ERROR: Fill-Mask Pipeline error: {e}. Returning originals.")
        return {"text_augmented": input_texts}
    pipeline_results = _per_text_results(raw_results, [len(positions) for _, _, positions in jobs])

    # 3. Process results. Start from the originals so every fallback path
    #    returns the untouched input text.
    final_augmented_texts = list(input_texts)
    if len(pipeline_results) != len(jobs):
        print("ERROR: Mismatch #masked vs #results. Reverting batch.")
        return {"text_augmented": final_augmented_texts}
    for (row, words, positions), predictions in zip(jobs, pipeline_results):
        tokens = _top_tokens(predictions, len(positions))
        if tokens is None:
            print(f"WARNING: Mask/pred mismatch item {row}. Using original.")
            continue
        rebuilt_words = list(words)
        # Predictions come in the order the masks appear, which matches the
        # sorted positions. An empty prediction keeps the original word.
        for pos, token in zip(positions, tokens):
            if token:
                rebuilt_words[pos] = token
        final_augmented_texts[row] = " ".join(rebuilt_words)
    return {"text_augmented": final_augmented_texts}

print("INFO: Batched contextual augmentation function defined.")

INFO: Batched contextual augmentation function defined.


In [15]:
# --- Apply ONLY Batched Contextual Augmentation (ON CLEANED DATA) ---
# Runs the augmentation function over the cleaned dataset in a single process
# and reports how long the mapping took.
print("INFO: Starting batched contextual augmentation mapping for CLEANED data (using GPU)...")
start_time_ctx = time.time()
try:
    augmented_train_dataset_features = cleaned_train_dataset_processed.map(
        function=augment_contextual_batch_batched,
        num_proc=1,  # Must be 1 for GPU safety
        batch_size=AUGMENTATION_MAP_BATCH_SIZE,
        batched=True,
    )
    end_time_ctx = time.time()
    print(f"INFO: Contextual augmentation mapping finished in {end_time_ctx - start_time_ctx:.2f} seconds.")
    print(f"INFO: Columns after map: {augmented_train_dataset_features.column_names}")
except Exception as mapping_error:
    print(f"ERROR: Error during contextual augmentation mapping: {mapping_error}")
    traceback.print_exc()
    raise mapping_error

INFO: Starting batched contextual augmentation mapping for CLEANED data (using GPU)...


Map:   0%|          | 0/114832 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


INFO: Contextual augmentation mapping finished in 1527.38 seconds.
INFO: Columns after map: ['labels', 'input_ids', 'attention_mask', 'orig_text', 'text_augmented']


In [16]:
# --- Create Augmented-Only Dataset ---
# Build a fresh two-column dataset: the augmented text (stored as 'text') and
# the labels of the cleaned originals it was generated from.
print("INFO: Creating dataset from augmented text...")
try:
    required_cols_after_map = ['labels', 'text_augmented']
    if any(col not in augmented_train_dataset_features.column_names for col in required_cols_after_map):
        raise KeyError(f"Map output missing required columns. Found: {augmented_train_dataset_features.column_names}")

    augmented_only_dataset = Dataset.from_dict(
        dict(
            text=augmented_train_dataset_features["text_augmented"],
            labels=augmented_train_dataset_features["labels"],
        )
    )
    print("INFO: Augmented-only dataset created.")
except KeyError as column_error:
    # Missing-column failures get their own message.
    print(f"ERROR: Column error creating final augmented dataset: {column_error}")
    raise column_error
except Exception as build_error:
    print(f"ERROR: Error creating dataset from dict: {build_error}")
    raise build_error

INFO: Creating dataset from augmented text...
INFO: Augmented-only dataset created.


In [17]:
# --- Tokenize Augmented Text ---
def preprocess_augmented_text(examples):
    """Tokenize the 'text' entry of a batch.

    Args:
        examples: batch mapping with a 'text' entry holding the augmented strings.

    Returns:
        The tokenizer's output, truncated to TOKENIZER_MAX_LENGTH and not padded.
    """
    augmented_texts = examples['text']
    return tokenizer(
        augmented_texts,
        truncation=True,
        padding=False,
        max_length=TOKENIZER_MAX_LENGTH,
    )

print("INFO: Tokenizing augmented text dataset...")
start_time_tok = time.time()
# os.cpu_count() may return None when the count cannot be determined; treat that
# as one core. Leave two cores free when possible, never fewer than one worker.
num_proc_tokenize = max(1, (os.cpu_count() or 1) - 2)
try:
    # Replace the raw 'text' column with the tokenizer output.
    tokenized_augmented_dataset = augmented_only_dataset.map(
        preprocess_augmented_text, batched=True, remove_columns=["text"], num_proc=num_proc_tokenize
    )
    end_time_tok = time.time()
    print(f"INFO: Tokenization of augmented text complete in {end_time_tok - start_time_tok:.2f} seconds.")
    print(f"INFO: Tokenized augmented dataset columns: {tokenized_augmented_dataset.column_names}")
except Exception as e:
    print(f"ERROR: Error during augmented text tokenization: {e}")
    raise e

INFO: Tokenizing augmented text dataset...


Map (num_proc=2):   0%|          | 0/114832 [00:00<?, ? examples/s]

INFO: Tokenization of augmented text complete in 27.10 seconds.
INFO: Tokenized augmented dataset columns: ['labels', 'input_ids', 'attention_mask']


In [18]:
# --- Cast Labels Column for Augmented Data---
# Give the augmented dataset's 'labels' column the label feature captured from
# the original dataset.
print("INFO: Casting labels column type for augmented data...")
try:
    # Guard against running this cell before the feature was captured.
    if 'original_labels_feature' not in globals():
        raise NameError("original_labels_feature not defined.")
    tokenized_augmented_dataset = tokenized_augmented_dataset.cast_column(
        column='labels',
        feature=original_labels_feature,
    )
    print(f"INFO: Augmented dataset labels feature after cast: {tokenized_augmented_dataset.features['labels']}")
except NameError as missing_name:
    print(f"ERROR: Original label features variable not found: {missing_name}")
    raise
except Exception as cast_error:
    print(f"ERROR: Error casting labels column: {cast_error}")
    raise cast_error

INFO: Casting labels column type for augmented data...


Casting the dataset:   0%|          | 0/114832 [00:00<?, ? examples/s]

INFO: Augmented dataset labels feature after cast: ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)


In [19]:
# --- Save Processed AUGMENTED Dataset ---
# Write the tokenized augmented dataset to disk and list both output locations.
print(f"INFO: Saving tokenized AUGMENTED (from cleaned) data to {tokenized_augmented_save_path}...")
try:
    # Keep exactly the model-ready columns, in this order, before saving.
    final_columns = ['input_ids', 'attention_mask', 'labels']
    tokenized_augmented_dataset = tokenized_augmented_dataset.select_columns(final_columns)
    tokenized_augmented_dataset.save_to_disk(tokenized_augmented_save_path)
    print("INFO: Tokenized augmented dataset saved successfully.")
    print(
        "INFO: Output files should contain TWO directories now:\n"
        f"      1. {cleaned_original_save_path}\n"
        f"      2. {tokenized_augmented_save_path}"
    )
except Exception as save_error:
    print(f"ERROR: Error saving tokenized augmented dataset to disk: {save_error}")
    raise save_error

INFO: Saving tokenized AUGMENTED (from cleaned) data to /kaggle/working/processed_data/tokenized_augmented_from_cleaned_agnews...


Saving the dataset (0/1 shards):   0%|          | 0/114832 [00:00<?, ? examples/s]

INFO: Tokenized augmented dataset saved successfully.
INFO: Output files should contain TWO directories now:
      1. /kaggle/working/processed_data/cleaned_tokenized_original_agnews
      2. /kaggle/working/processed_data/tokenized_augmented_from_cleaned_agnews
