## Week 3 - Tokenizer

### 1. Load dataset

In [None]:
from torch.utils.data import Dataset
from pathlib import Path
import pandas as pd
from PIL import Image
from typing import Tuple
from torchtune.modules.tokenizers._utils import BaseTokenizer
import torch

class FoodDataset(Dataset):
    """Image/title pairs read from the food recipe CSV.

    Each item is a ``(image, token_ids)`` pair: the image file is loaded as
    RGB and passed through ``transform``; the title is passed through
    ``tokenizer.encode`` and wrapped in an integer tensor.
    """

    def __init__(
            self,
            data_path: str,
            tokenizer: BaseTokenizer,
            transform: torch.nn.Sequential,
        ):
        """Load the CSV found under ``data_path`` and keep the usable rows.

        Args:
            data_path: Folder holding the CSV file and the
                ``Food Images/Food Images`` image directory.
            tokenizer: Object whose ``encode(text)`` returns a list of ids.
            transform: Callable applied to every loaded RGB image.
        """
        data_path = Path(data_path)
        self.tokenizer = tokenizer
        self.transform = transform

        # Define df
        self.df = pd.read_csv(data_path / 'Food Ingredients and Recipe Dataset with Image Name Mapping.csv')
        # Keep only the title and image name
        self.df = self.df[['Title', 'Image_Name']]
        # Remove rows with invalid 'Image_Name' entries (e.g., '#NAME?')
        self.df = self.df[self.df['Image_Name'] != '#NAME?']
        # Drop rows with missing values (the CSV contains a few)
        self.df = self.df.dropna()

        # Define image_path
        self.images_folder = data_path / 'Food Images/Food Images'

        print(f'Loaded {len(self.df)} samples')

    def __len__(self) -> int:
        """Return the number of rows kept after filtering."""
        return len(self.df)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(transformed_image, token_ids)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        title = row['Title']
        # The CSV stores image names without extension, so '.jpg' is appended
        img_name = row['Image_Name'] + '.jpg'
        img_path = self.images_folder / img_name

        return self.process_image(img_path), self.process_text(title)

    def process_image(self, img_path: Path) -> torch.Tensor:
        """Open ``img_path``, convert it to RGB and apply ``self.transform``."""
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block exits.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        return self.transform(image)

    def process_text(self, text: str) -> torch.Tensor:
        """Encode ``text`` with the tokenizer and return the ids as a long tensor."""
        # torch.Tensor(list) would yield float32; token ids are indices, so
        # build an integer (int64) tensor explicitly.
        return torch.tensor(self.tokenizer.encode(text), dtype=torch.long)

### 2. Tokenizers

#### 2.1. Character Tokenizer
Splits text into individual **characters**, including letters, punctuation, and spaces.

Input:  "Hello!" --> Tokens: ['H', 'e', 'l', 'l', 'o', '!']

In [40]:
from typing import List, Dict, Any

class CharacterTokenizer:
    """Character-level tokenizer producing fixed-length id sequences.

    ``encode`` wraps the text in ``<SOS>``/``<EOS>``, truncates it so the
    result never exceeds ``text_max_len`` and right-pads with ``<PAD>``.
    Characters missing from the vocabulary are mapped to ``<PAD>`` (there is
    no dedicated unknown token) and therefore disappear on ``decode``.
    """

    def __init__(self, dataset_titles: List[str] = None, text_max_len: int = 201):
        """Build the character vocabulary.

        Args:
            dataset_titles: Optional texts; any character they contain that
                is not in the base list is appended (in sorted order).
            text_max_len: Length of every sequence returned by ``encode``.
        """
        self.sos_token = '<SOS>'
        self.eos_token = '<EOS>'
        self.pad_token = '<PAD>'

        # Original character list.
        # NOTE: the digit '8' is absent from this base list; it is only
        # covered when it occurs in ``dataset_titles``. The list is left
        # untouched so that existing token indices stay stable.
        self.chars = [
            self.sos_token, self.eos_token, self.pad_token, '\n', ' ', '!', '"', '#', '%', '&', "'", '(', ')', '+', ',',
            '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '9', ':', ';', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
            'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
            'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
            '\x92', '\x96', '\xa0', '®', 'Á', 'É', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
            'ñ', 'ò', 'ó', 'ô', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ō', 'ơ', '̀', '́', '̃', '̉', 'С', 'и', 'к', 'н', 'р', 'ы',
            '\u2009', '–', '—', '‘', '’', '“', '”', '강', '개', '닭', '된', '장', '전', '정', '찌', '파'
        ]

        # Optionally add new characters from dataset
        if dataset_titles:
            extra_chars = set(''.join(dataset_titles)) - set(self.chars)
            self.chars += sorted(extra_chars)

        self.idx2char = dict(enumerate(self.chars))
        self.char2idx = {v: k for k, v in enumerate(self.chars)}
        self.text_max_len = text_max_len

    def encode(self, text: str, **kwargs: Any) -> List[int]:
        """Return ``text`` as ``<SOS> chars <EOS>`` ids, padded to ``text_max_len``.

        Text that does not fit is cut so that the closing ``<EOS>`` is kept.
        Extra keyword arguments are accepted and ignored.
        """
        pad_id = self.char2idx[self.pad_token]
        # Two slots are reserved for <SOS> and <EOS>; without this cut a long
        # text would yield a sequence longer than text_max_len.
        room = max(self.text_max_len - 2, 0)
        encoded = [self.char2idx[self.sos_token]]
        encoded += [self.char2idx.get(char, pad_id) for char in text[:room]]
        encoded.append(self.char2idx[self.eos_token])
        encoded += [pad_id] * (self.text_max_len - len(encoded))
        return encoded

    def decode(self, tokens: List[int], **kwargs: Any) -> str:
        """Turn ids back into text, skipping special tokens and unknown ids."""
        # Built once per call instead of once per token.
        special_ids = {
            self.char2idx[self.sos_token],
            self.char2idx[self.eos_token],
            self.char2idx[self.pad_token],
        }
        return ''.join(
            self.idx2char.get(token, '') for token in tokens if token not in special_ids
        )

    def batch_decode(self, batch_tokens: List[List[int]], **kwargs: Any) -> List[str]:
        """Decode every id sequence in ``batch_tokens``."""
        return [self.decode(tokens) for tokens in batch_tokens]

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.chars)


#### 2.2. Word Tokenizer
Splits text into **full words**, typically by whitespace and punctuation.

Input: "Hello world!" --> Tokens: ['Hello', 'world!'] (the implementation below splits on whitespace only, so punctuation stays attached to the word)

In [55]:
from typing import List

class WordTokenizer:
    """Whitespace word tokenizer producing fixed-length id sequences.

    The vocabulary always starts with ``<SOS>``, ``<EOS>``, ``<PAD>`` and
    ``<UNK>`` (ids 0-3); words found by ``build_vocab`` follow in sorted
    order. Words missing from the vocabulary are encoded as ``<UNK>``.
    """

    def __init__(self, texts: List[str] = None, text_max_len: int = 50):
        """Create the tokenizer and, if ``texts`` is given, build its vocabulary.

        Args:
            texts: Optional corpus used to collect the vocabulary.
            text_max_len: Length of every sequence returned by ``encode``.
        """
        self.sos_token = "<SOS>"
        self.eos_token = "<EOS>"
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.text_max_len = text_max_len

        self.vocab = [self.sos_token, self.eos_token, self.pad_token, self.unk_token]
        # Index the special tokens right away so encode()/decode() work even
        # when no corpus was supplied (everything then maps to <UNK>).
        self._reindex()

        if texts:
            self.build_vocab(texts)

    def _reindex(self):
        """Rebuild both lookup tables from ``self.vocab``."""
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

    def build_vocab(self, texts: List[str]):
        """Add the whitespace-separated words of ``texts`` to the vocabulary.

        Words already present (including the special tokens) are skipped, so
        calling this more than once never creates duplicate entries.
        """
        words = set()
        for text in texts:
            words.update(text.split())
        self.vocab += sorted(words - set(self.vocab))
        self._reindex()

    def encode(self, text: str) -> List[int]:
        """Return ``text`` as ``<SOS> words <EOS>`` ids, padded to ``text_max_len``."""
        # Two slots are reserved for <SOS>/<EOS>; clamp at zero so a tiny
        # text_max_len cannot turn into a negative slice bound.
        room = max(self.text_max_len - 2, 0)
        tokens = [self.sos_token] + text.split()[:room] + [self.eos_token]
        unk_id = self.word2idx[self.unk_token]
        token_ids = [self.word2idx.get(token, unk_id) for token in tokens]
        token_ids += [self.word2idx[self.pad_token]] * (self.text_max_len - len(token_ids))
        return token_ids

    def decode(self, tokens: List[int]) -> str:
        """Join the words for ``tokens`` with spaces, skipping SOS/EOS/PAD.

        Ids outside the vocabulary are rendered as ``<UNK>``.
        """
        # Built once per call rather than once per token.
        skipped = {
            self.word2idx[self.sos_token],
            self.word2idx[self.eos_token],
            self.word2idx[self.pad_token],
        }
        return " ".join(
            self.idx2word.get(token, self.unk_token)
            for token in tokens
            if token not in skipped
        )

    def batch_decode(self, batch_tokens: List[List[int]]) -> List[str]:
        """Decode every id sequence in ``batch_tokens``."""
        return [self.decode(tokens) for tokens in batch_tokens]

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.vocab)

#### 2.3. WordPiece tokenizer (BERT)
Splits words into **subword units**, using a learned vocabulary of common pieces. Used in models like BERT and DistilBERT.

Input: "Unbelievable" --> Tokens: ['un', '##bel', '##iev', '##able']

In [80]:
from transformers import AutoTokenizer
from typing import List

class WordPieceTokenizer:
    """Thin adapter around a pretrained Hugging Face tokenizer.

    Exposes the same ``encode`` / ``decode`` / ``batch_decode`` / ``len()``
    surface as the other tokenizers in this file and forwards every call to
    the tokenizer loaded through ``AutoTokenizer``.
    """

    def __init__(self, pretrained_model_name="bert-base-cased", text_max_len=50):
        """Load ``pretrained_model_name`` and remember the target length."""
        self.text_max_len = text_max_len
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

    def encode(self, text: str) -> List[int]:
        """Encode ``text`` with special tokens, truncating and padding to ``text_max_len``."""
        options = {
            "add_special_tokens": True,
            "max_length": self.text_max_len,
            "truncation": True,
            "padding": "max_length",
        }
        return self.tokenizer.encode(text, **options)

    def decode(self, tokens: List[int]) -> str:
        """Decode ``tokens`` to text with special tokens skipped."""
        decoded = self.tokenizer.decode(tokens, skip_special_tokens=True)
        return decoded

    def batch_decode(self, batch_tokens: List[List[int]]) -> List[str]:
        """Decode several id sequences in one call, special tokens skipped."""
        decoded_batch = self.tokenizer.batch_decode(batch_tokens, skip_special_tokens=True)
        return decoded_batch

    def __len__(self):
        """Return the ``vocab_size`` reported by the wrapped tokenizer."""
        wrapped = self.tokenizer
        return wrapped.vocab_size
    
def clean_decoded_text(text: str) -> str:
    """Tidy spacing in decoded text.

    Removes the spaces around a spaced hyphen (" - ") and a spaced right
    single quote (" ’ "), then collapses every run of whitespace into a
    single space and strips both ends.
    """
    # Applied in this order: hyphen first, then the curly apostrophe.
    for spaced, tight in ((" - ", "-"), (" ’ ", "’")):
        text = text.replace(spaced, tight)
    return " ".join(text.split())

### 3. Test tokenizers

#### 3.1. Load dataset

In [81]:
from torchvision.transforms import v2
from torch import nn

# Folder that holds the CSV file and the image directory (machine-specific).
DATA_PATH = '/Users/Usuario/Documents/MCV/C5/week3/archive'

# Per-channel mean / std handed to v2.Normalize.
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)

# Image pipeline, applied in this order: to image tensor -> float32 scaled
# values -> 224x224 resize -> normalisation.
_image_steps = [
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Resize((224, 224), antialias=True),
    v2.Normalize(_NORM_MEAN, _NORM_STD),
]
transform = nn.Sequential(*_image_steps)

# Character tokenizer with its built-in vocabulary and default length.
tokenizer = CharacterTokenizer()

dataset = FoodDataset(
    data_path=DATA_PATH,
    tokenizer=tokenizer,
    transform=transform,
)

Loaded 13466 samples


In [None]:
import random

# Pick three random row positions and look up their titles
sample_rows = random.sample(range(len(dataset)), 3)
sample_texts = [dataset.df["Title"].iloc[row] for row in sample_rows]

# Every non-null title, used to build the vocabularies
title_column = dataset.df["Title"].dropna()
all_titles = title_column.tolist()

# One tokenizer of each kind, built from the full title list where applicable
char_tokenizer = CharacterTokenizer(dataset_titles=all_titles, text_max_len=50)
word_tokenizer = WordTokenizer(texts=all_titles, text_max_len=15)
wordpiece_tokenizer = WordPieceTokenizer(pretrained_model_name="bert-base-cased", text_max_len=15)

# Round-trip each sample through every tokenizer and print ids + decoded text
for text in sample_texts:
    print(f"\nOriginal Text: {text}")

    # Character-level
    char_ids = char_tokenizer.encode(text)
    char_text = char_tokenizer.decode(char_ids)
    print(f"Char Encoded: {char_ids}")
    print(f"Char Decoded: {char_text}")

    # Word-level
    word_ids = word_tokenizer.encode(text)
    word_text = word_tokenizer.decode(word_ids)
    print(f"Word Encoded: {word_ids}")
    print(f"Word Decoded: {word_text}")

    # WordPiece-level (decoded text is passed through clean_decoded_text)
    wp_ids = wordpiece_tokenizer.encode(text)
    wp_text = clean_decoded_text(wordpiece_tokenizer.decode(wp_ids))
    print(f"WordPiece Encoded: {wp_ids}")
    print(f"WordPiece Decoded: {wp_text}")


Original Text: Grilled Fennel-Rubbed Triple-Cut Pork Chops
Char Encoded: [0, 35, 72, 63, 66, 66, 59, 58, 4, 34, 59, 68, 68, 59, 66, 15, 46, 75, 56, 56, 59, 58, 4, 48, 72, 63, 70, 66, 59, 15, 31, 75, 74, 4, 44, 69, 72, 65, 4, 31, 62, 69, 70, 73, 1, 2, 2, 2, 2, 2]
Char Decoded: Grilled Fennel-Rubbed Triple-Cut Pork Chops
Word Encoded: [0, 3326, 2808, 7160, 5440, 1864, 1, 2, 2, 2, 2, 2, 2, 2, 2]
Word Decoded: Grilled Fennel-Rubbed Triple-Cut Pork Chops
WordPiece Encoded: [101, 144, 26327, 1181, 27868, 8967, 118, 155, 10354, 4774, 9457, 118, 15411, 18959, 102]
WordPiece Decoded: Grilled Fennel-Rubbed Triple-Cut Po

Original Text: Cumin-Scented Stir-Fried Beef with Celery
Char Encoded: [0, 31, 75, 67, 63, 68, 15, 47, 57, 59, 68, 74, 59, 58, 4, 47, 74, 63, 72, 15, 34, 72, 63, 59, 58, 4, 30, 59, 59, 60, 4, 77, 63, 74, 62, 4, 31, 59, 66, 59, 72, 79, 1, 2, 2, 2, 2, 2, 2, 2]
Char Decoded: Cumin-Scented Stir-Fried Beef with Celery
Word Encoded: [0, 2350, 6645, 857, 7680, 1572, 1, 2, 2, 2, 2, 2, 

## 4. Evaluate

In [83]:
import evaluate
from typing import List
import nltk

class Metric():
    """Bundle of BLEU, ROUGE and METEOR loaded through ``evaluate``."""

    # (path checked with nltk.data.find, package id passed to nltk.download).
    # nltk.data.find expects the category prefix ("corpora/", "tokenizers/");
    # a bare package name is never found, which forces a download every time.
    _NLTK_RESOURCES = (
        ('corpora/wordnet', 'wordnet'),
        ('tokenizers/punkt', 'punkt'),
        ('corpora/omw-1.4', 'omw-1.4'),
    )

    def __init__(self):
        """Make sure the NLTK data is present, then load the three metrics."""
        self._download_nltk_resources()
        self.bleu = evaluate.load('bleu')
        self.rouge = evaluate.load('rouge')
        self.meteor = evaluate.load('meteor')

    def _download_nltk_resources(self):
        """Download NLTK resources quietly, skipping those already installed."""
        for resource_path, package_id in self._NLTK_RESOURCES:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                # Not found locally (or not locatable under this path): fall
                # back to the downloader, which is a no-op when the package
                # is already up to date.
                nltk.download(package_id, quiet=True)

    def compute_metrics(self, ground_truth: List[str], prediction: List[str]):
        """Score ``prediction`` against ``ground_truth`` with all three metrics.

        Args:
            ground_truth: Reference texts, aligned with ``prediction``; they
                are forwarded unchanged as ``references``.
            prediction: Generated texts, forwarded unchanged as ``predictions``.

        Returns:
            Dict with keys ``'bleu'``, ``'rouge'`` and ``'meteor'``, each
            holding the raw result returned by the corresponding ``compute``.
        """
        res_b = self.bleu.compute(predictions=prediction, references=ground_truth)
        res_r = self.rouge.compute(predictions=prediction, references=ground_truth)
        res_m = self.meteor.compute(predictions=prediction, references=ground_truth)

        return {'bleu': res_b, 'rouge': res_r, 'meteor': res_m}


#### Check longest titles to define text_max_len

In [None]:
# Non-null titles, shared by both searches below
title_series = dataset.df['Title'].dropna()

# Longest title measured in characters
longest_title = max(title_series, key=len)
n_chars = len(longest_title)
print(f"Longest Title ({n_chars} chars):\n{longest_title}")

# Longest title measured in whitespace-separated words
longest_by_words = max(title_series, key=lambda title: len(title.split()))
n_words = len(longest_by_words.split())
print(f"\nLongest Title ({n_words} words):\n{longest_by_words}")


Longest Title (112 chars):
Hummus-Crusted Alaskan Wild King Salmon Over a Bed of French Beans, Red Onion, and Cucumber Salad with Lemon Oil

Longest Title (19 words):
Hummus-Crusted Alaskan Wild King Salmon Over a Bed of French Beans, Red Onion, and Cucumber Salad with Lemon Oil


In [None]:
import random
from metrics import Metric 

# Thirty random titles serve as the evaluation sample
sample_rows = random.sample(range(len(dataset)), 30)
sample_texts = [dataset.df["Title"].iloc[row] for row in sample_rows]
all_titles = dataset.df["Title"].dropna().tolist()

# Tokenizers, with their own maximum lengths
char_tokenizer = CharacterTokenizer(dataset_titles=all_titles, text_max_len=130)
word_tokenizer = WordTokenizer(texts=all_titles, text_max_len=25)
wordpiece_tokenizer = WordPieceTokenizer(pretrained_model_name="bert-base-cased", text_max_len=40)

metric = Metric()

# References: one single-element list per sample
gt = [[text] for text in sample_texts]

# Predictions: each title after an encode -> decode round trip
char_preds = [
    char_tokenizer.decode(char_tokenizer.encode(text)) for text in sample_texts
]
word_preds = [
    word_tokenizer.decode(word_tokenizer.encode(text)) for text in sample_texts
]
# WordPiece output is additionally passed through clean_decoded_text
wp_preds = [
    clean_decoded_text(wordpiece_tokenizer.decode(wordpiece_tokenizer.encode(text)))
    for text in sample_texts
]

# -- Compute metrics --
char_scores = metric.compute_metrics(gt, char_preds)
print("\nCharacter Tokenizer Metrics:")
print(char_scores)

word_scores = metric.compute_metrics(gt, word_preds)
print("\nWord Tokenizer Metrics:")
print(word_scores)

wp_scores = metric.compute_metrics(gt, wp_preds)
print("\nWordPiece Tokenizer Metrics:")
print(wp_scores)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Character Tokenizer Metrics:
{'bleu': {'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 161, 'reference_length': 161}, 'rouge': {'rouge1': 1.0, 'rouge2': 0.9666666666666667, 'rougeL': 1.0, 'rougeLsum': 1.0}, 'meteor': {'meteor': 0.9684720182678573}}

Word Tokenizer Metrics:
{'bleu': {'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 161, 'reference_length': 161}, 'rouge': {'rouge1': 1.0, 'rouge2': 0.9666666666666667, 'rougeL': 1.0, 'rougeLsum': 1.0}, 'meteor': {'meteor': 0.9684720182678573}}

WordPiece Tokenizer Metrics:
{'bleu': {'bleu': 0.9656954349207257, 'precisions': [0.9815950920245399, 0.9699248120300752, 0.9615384615384616, 0.95], 'brevity_penalty': 1.0, 'length_ratio': 1.0124223602484472, 'translation_length': 163, 'reference_length': 161}, 'rouge': {'rouge1': 1.0, 'rouge2': 0.9666666666666667, 'rougeL': 1.0, 'rougeLsum': 1.0}, 'meteor': {'meteo