# Time to slice and dice

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

This notebook demonstrates advanced dataset manipulation techniques including filtering, mapping, and processing with the Datasets library.

In [None]:
# Install required packages for dataset manipulation
# Note: Using uv pip for faster package installation
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Download and extract a dataset for drug reviews analysis
# This dataset contains patient reviews for various medications
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [None]:
# Read both raw files through the generic "csv" loader of the datasets library.
# - data_files maps each split name to the file that holds that split
# - delimiter="\t" (the tab character) marks the files as tab-separated
from datasets import load_dataset

data_files = {
    "train": "drugsComTrain_raw.tsv",
    "test": "drugsComTest_raw.tsv",
}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
# Build a 1000-example sample of the training split for quick experiments.
# shuffle(seed=42) randomizes the row order reproducibly; select(range(1000))
# then keeps the first 1000 rows of the shuffled split.
drug_sample = (
    drug_dataset["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
# Show the first three sampled examples
drug_sample[:3]

In [None]:
# Data validation: the "Unnamed: 0" column should hold one distinct ID per row.
# For every split, compare the row count with the number of distinct IDs;
# the assert raises AssertionError as soon as a split contains a duplicate.
for split in drug_dataset:
    num_rows = len(drug_dataset[split])
    num_unique_ids = len(drug_dataset[split].unique("Unnamed: 0"))
    assert num_rows == num_unique_ids

In [None]:
# Give the ID column a descriptive name: "Unnamed: 0" becomes "patient_id".
# The rename is applied to every split, and the result is displayed below.
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")
drug_dataset

In [None]:
# Helper that standardizes condition names by lowercasing them
def lowercase_condition(example):
    """Return an update dict holding the lowercased "condition" of one example.

    Args:
        example: mapping with a string under the "condition" key.

    Returns:
        A dict with the single key "condition" mapped to the lowercased value.

    Raises:
        AttributeError: if example["condition"] is None (it has no .lower()).
    """
    lowered = example["condition"].lower()
    return {"condition": lowered}

# Attempt to apply the function - this fails on purpose because some rows
# have a None condition, and None has no .lower() method.
# The expected AttributeError is caught and displayed so that the notebook
# still runs to completion under "Restart & Run All".
try:
    drug_dataset.map(lowercase_condition)
except AttributeError as error:
    print(f"{type(error).__name__}: {error}")

In [None]:
# Predicate used to drop rows whose condition is missing
def filter_nones(x):
    """Tell whether an example has a condition value.

    Args:
        x: mapping with a "condition" key.

    Returns:
        True when x["condition"] is anything other than None, False otherwise.
    """
    condition = x["condition"]
    return condition is not None

In [None]:
# Demonstration of lambda functions
# A lambda is an anonymous, single-expression function; here it is defined
# and called in the same expression
# Example 1: Square a number (3 * 3 evaluates to 9)
(lambda x: x * x)(3)

In [None]:
# Example 2: Calculate a triangle's area from its base and height
# Lambda functions can take multiple arguments, separated by commas
# (0.5 * 4 * 8 evaluates to 16.0)
(lambda base, height: 0.5 * base * height)(4, 8)

In [None]:
# Drop every row whose condition is None, in all splits.
# The predicate is written inline as a lambda that returns True for rows to keep.
drug_dataset = drug_dataset.filter(
    lambda example: example["condition"] is not None
)

In [None]:
# With the None conditions filtered out, the lowercase transformation can run
# map() applies lowercase_condition to every example in every split
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked by looking at the first three training conditions
drug_dataset["train"]["condition"][:3]

In [None]:
# New feature: the number of whitespace-separated words in each review
def compute_review_length(example):
    """Count the words of one review.

    Args:
        example: mapping with a string under the "review" key.

    Returns:
        A dict with the single key "review_length" mapped to the number of
        whitespace-separated tokens in the review.
    """
    words = example["review"].split()
    return {"review_length": len(words)}

In [None]:
# Apply the review length computation to all examples
# The dict returned by compute_review_length adds a "review_length" column
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example, which now includes review_length
drug_dataset["train"][0]

In [None]:
# Sort the training split by review length and show the first three rows
# of the sorted result, i.e. the shortest reviews
# This helps identify data quality issues or outliers
drug_dataset["train"].sort("review_length")[:3]

In [None]:
# Keep only reviews longer than 30 words (reviews of 30 words or fewer are dropped)
# Short reviews typically don't contain enough information for analysis
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
# Print the number of rows left in each split
print(drug_dataset.num_rows)

In [None]:
# Demonstrate HTML entity decoding with the standard-library html module
# Drug reviews often contain HTML entities that need to be converted
import html

# "&#039;" is the HTML character reference for an apostrophe
text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
# Decode HTML entities in every review, one example at a time.
# The returned dict replaces the "review" column with the cleaned text.
drug_dataset = drug_dataset.map(
    lambda example: {"review": html.unescape(example["review"])}
)

In [None]:
# Alternative: decode the reviews in batched mode for better performance.
# With batched=True the function receives a batch whose "review" entry is a
# list of strings, so it must return a list of decoded strings.
new_drug_dataset = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch["review"]))},
    batched=True,
)

In [None]:
# Set up tokenization for the reviews
# Load the tokenizer of the "bert-base-cased" checkpoint; it converts review
# text to tokens for model training
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the "review" field with the notebook-level `tokenizer`.

    Args:
        examples: mapping whose "review" entry holds the text(s) to tokenize.

    Returns:
        Whatever `tokenizer` returns when called with truncation enabled.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [None]:
# Apply tokenization to the entire dataset using batched processing
# %time reports how long the statement takes, which shows the speed of the
# fast tokenizer
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

In [None]:
# Compare with slow tokenizer using multiprocessing
# Demonstrates the speed advantage of fast tokenizers
# num_proc=8 uses 8 CPU cores for parallel processing
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)

tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

In [None]:
# Tokenize long texts without discarding what exceeds the length limit
def tokenize_and_split(examples):
    """Tokenize the "review" field, splitting long texts into several chunks.

    The notebook-level `tokenizer` is called with truncation at
    max_length=128 and return_overflowing_tokens=True, so text beyond the
    limit comes back as additional sequences instead of being dropped.

    Args:
        examples: mapping whose "review" entry holds the text(s) to tokenize.

    Returns:
        The tokenizer output, unchanged.
    """
    reviews = examples["review"]
    encoded_chunks = tokenizer(
        reviews, truncation=True, max_length=128, return_overflowing_tokens=True
    )
    return encoded_chunks

In [None]:
# Try the function on the first training example to see how a long text is split.
# The displayed list holds the length of each returned sequence of input IDs.
result = tokenize_and_split(drug_dataset["train"][0])
list(map(len, result["input_ids"]))

In [None]:
# Apply the splitting tokenizer to the dataset - this fails on purpose.
# A review that overflows max_length yields several rows, so the tokenizer
# output ends up with more rows than the untouched original columns of the
# same batch.
# The resulting error (expected to be pyarrow's ArrowInvalid, which is a
# ValueError subclass) is caught and displayed so that the notebook still
# runs to completion under "Restart & Run All".
try:
    tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

In [None]:
# Solution: Remove the original columns when tokenizing
# remove_columns is given every column name of the training split, so only the
# tokenizer output remains; this avoids the length mismatch but loses the
# original data
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

In [None]:
# Check the effect of tokenization on dataset size
# Displays (tokenized row count, original row count) for the training split
# The dataset grows because long texts are split into multiple examples
len(tokenized_dataset["train"]), len(drug_dataset["train"])

In [None]:
# Better solution: keep the original columns while still splitting long texts
def tokenize_and_split(examples):
    """Tokenize reviews into chunks and repeat the original fields per chunk.

    The tokenizer output contains "overflow_to_sample_mapping", which gives,
    for each produced chunk, the index of the input example it came from.
    That mapping is removed from the output and used to duplicate every
    original column value once per chunk, so all columns end up equally long.

    Args:
        examples: batch mapping each column name to a list of values; must
            include "review".

    Returns:
        The tokenizer output extended with the realigned original columns.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Source-example index of every chunk
    sample_map = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[source_idx] for source_idx in sample_map]
    return encoded

In [None]:
# Apply the improved tokenization function
# No remove_columns is needed here: the function returns the original columns
# realigned to the chunks, so all data is preserved while handling long sequences
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

In [None]:
# Switch the dataset's output format to pandas for analysis
# Note: set_format changes drug_dataset in place (nothing is reassigned here)
drug_dataset.set_format("pandas")

In [None]:
# Access the first three training rows
# With the pandas format active, the slice is returned in pandas format
drug_dataset["train"][:3]

In [None]:
# Extract the full training set as a pandas DataFrame
# The [:] slice selects every row; this enables full pandas functionality
train_df = drug_dataset["train"][:]

In [None]:
# Analyze the condition frequency distribution of the training split.
# value_counts() counts how often each condition appears, which helps
# understand the class distribution in the dataset.
# rename_axis() labels the index (the condition values) and
# reset_index(name=...) labels the counts, so the resulting columns are always
# "condition" and "frequency". Renaming by the old default labels
# ("index" / "condition") produces wrong columns on pandas >= 2.0, where
# value_counts() names its result "count" and keeps the column name on the index.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

In [None]:
# Convert the pandas DataFrame of frequencies back to a Dataset
# This enables us to use Datasets library features on it again
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

In [None]:
# Reset the output format back to the standard Dataset format
# This undoes the earlier set_format("pandas") call, again in place
drug_dataset.reset_format()

In [None]:
# Create proper train/validation/test splits
# The original training split is divided 80% / 20% (seed=42 for reproducibility);
# the 20% part becomes the validation set and the original test split is kept as is
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

In [None]:
# Save the processed dataset to disk for future use
# All splits are written under the "drug-reviews" path, from which they can
# be reloaded later
drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
# Reload the dataset from the "drug-reviews" path to verify that saving worked
# This demonstrates how to load previously saved datasets
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

In [None]:
# Export each split to its own JSONL file, named drug-reviews-<split>.jsonl
# JSONL (JSON Lines) format is convenient for streaming and processing
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

In [None]:
# Inspect the exported JSONL file format by printing its first line
# Each line contains one JSON object representing a single example
!head -n 1 drug-reviews-train.jsonl

In [None]:
# Load the exported JSONL files back in to verify the export.
# Each split name is mapped to the file written for it, then the "json"
# loader reads all three files into one dataset object.
data_files = {
    split_name: f"drug-reviews-{split_name}.jsonl"
    for split_name in ("train", "validation", "test")
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)