### 1. Tokenization & Preprocessing
Q: Write a function to tokenize a given text using a pre-trained BERT tokenizer and return the input IDs and attention mask.

Expected Follow-ups:
  - How does padding and truncation work in tokenization?
  - Modify the function to support batch processing.

In [1]:
# WordPiece Tokenization: BERT uses WordPiece tokenization, which splits words into subwords. 
# For example, the word "clockwork" can be split into "clock" and "##work"3. 
# The ## indicates that the token is a subword and not a complete word.

from transformers import BertTokenizer

def tokenize_text(text, model_name="bert-base-uncased"):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    return inputs["input_ids"], inputs["attention_mask"]

# Example
text = "Transformers are powerful for NLP tasks!"
tokens, masks = tokenize_text(text)
print(tokens.shape)
print(masks.shape)

torch.Size([1, 128])
torch.Size([1, 128])


In [2]:
tokens

tensor([[  101, 19081,  2024,  3928,  2005, 17953,  2361,  8518,   999,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

### 2. BERT EMbeddings

In [None]:
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertModel.from_pretrained("bert-base-cased")

custom_text = "
You are welcome to utilize any text of your choice."
encoded_input = tokenizer(custom_text, return_tensors='tf')
output_embeddings = model(encoded_input)

- If you want token-level embeddings (each word's contextual representation), use: 
- token_embeddings = output_embeddings.last_hidden_state  # Shape: (1, seq_len, 768)
----
- If you need a sentence-level embedding (single vector representation), use:
- sentence_embedding = output_embeddings.pooler_output  # Shape: (1, 768)

# 2. Finetune BERT for Text Classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
# Source : https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
# Code : https://www.kaggle.com/code/sugataghosh/e-commerce-text-classification-tf-idf-word2vec#Text-Normalization 

data = pd.read_csv('data/ecommerceDataset.csv', names = ['category','description'], header = None)
# print(data['category'].value_counts())

data.dropna(inplace = True) # Dropping observations with missing values
data.drop_duplicates(inplace = True) # Dropping duplicate observations
data.reset_index(drop = True, inplace = True) # Resetting index

# Manual encoding of labels
label_dict = {'Electronics': 0, 'Household': 1, 'Books': 2, 'Clothing & Accessories': 3}
data = data.replace({'category': label_dict})

print(data.shape)

# Feature-target split
X, y = data['description'].values, data['category'].values
# Train-test split (from complete data)
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y.tolist(), test_size = 0.2, random_state = 40)

data.head()

(27802, 2)


  data = data.replace({'category': label_dict})


Unnamed: 0,category,description
0,1,Paper Plane Design Framed Wall Hanging Motivat...
1,1,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,1,SAF 'UV Textured Modern Art Print Framed' Pain...
3,1,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,1,Incredible Gifts India Wooden Happy Birthday U...


In [21]:
from datasets import Dataset
# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Tokenize datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/22241 [00:00<?, ? examples/s]

Map:   0%|          | 0/5561 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

for param in model.distilbert.parameters():
    param.requires_grad = False  # Freeze all DistilBERT layers

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    # weight_decay=0.01,
    load_best_model_at_end=True,)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Train model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6186,0.495248


TrainOutput(global_step=1391, training_loss=0.7375717656720379, metrics={'train_runtime': 705.5358, 'train_samples_per_second': 31.524, 'train_steps_per_second': 1.972, 'total_flos': 2946312496361472.0, 'train_loss': 0.7375717656720379, 'epoch': 1.0})

In [10]:
# Prediction
import torch

# Load model and move to GPU if available
device = 'cpu'
model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-1391").to(device).eval()

def predict(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        return torch.argmax(logits, dim=-1).cpu().numpy()

In [11]:
predict(['hello, electrocics TV'])

array([2])

# 4. Finetune T5 LLM for Text Classification (LoRA - PEFT)

In [2]:
# Source : https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
# Code : https://www.kaggle.com/code/sugataghosh/e-commerce-text-classification-tf-idf-word2vec#Text-Normalization 

data = pd.read_csv('data/ecommerceDataset.csv', names = ['category','description'], header = None)
# print(data['category'].value_counts())

data.dropna(inplace = True) # Dropping observations with missing values
data.drop_duplicates(inplace = True) # Dropping duplicate observations
data.reset_index(drop = True, inplace = True) # Resetting index

# Manual encoding of labels
# label_dict = {'Electronics': 0, 'Household': 1, 'Books': 2, 'Clothing & Accessories': 3}
# data = data.replace({'category': label_dict})

print(data.shape)

# Feature-target split
X, y = data['description'].values, data['category'].values
# Train-test split (from complete data)
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y.tolist(), test_size = 0.2, random_state = 40)

data.head()

(27802, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
from datasets import Dataset
# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

In [11]:
# Define a preprocessing function
def preprocess_function(examples):
    inputs = ["classify category: " + text for text in examples["text"]]
    targets = examples["label"]
    return {"input_texts": inputs, "target_texts": targets}

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Tokenize dataset
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_texts"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(examples["target_texts"], padding="max_length", truncation=True, max_length=8)

    # Replace padding tokens (1) with -100 for loss calculation in T5
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in seq] for seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/22241 [00:00<?, ? examples/s]

Map:   0%|          | 0/5561 [00:00<?, ? examples/s]

Map:   0%|          | 0/22241 [00:00<?, ? examples/s]

Map:   0%|          | 0/5561 [00:00<?, ? examples/s]

In [14]:
# Load the base model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # For sequence-to-sequence tasks
    inference_mode=False, 
    r=8,  # Low-rank parameter
    lora_alpha=16,
    lora_dropout=0.1
)

# Wrap model with LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [16]:
# train_dataset[0]

In [17]:
training_args = TrainingArguments(
    output_dir="./t5-sentiment-lora",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

ValueError: too many dimensions 'str'

In [None]:
def predict_sentiment(text):
    input_text = "classify category: " + text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        output = model.generate(**inputs)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the model
print(predict_sentiment("This movie was amazing!"))
print(predict_sentiment("The food was terrible."))