# AI Workshop - Lab 2-2: Intent Classification

In this lab, we'll build a model to classify customer intents from text messages. We'll use the Hugging Face Transformers library to fine-tune a pre-trained model on a dataset of customer messages and intents.

In [None]:
# Install (or upgrade, via -U) the libraries this lab uses; -q keeps pip's output quiet.
!pip install -Uq datasets transformers accelerate evaluate

In [None]:
from datasets import load_dataset

In [None]:
# Read the local parquet files into one dataset object with a "train" and a
# "test" split.
intents = load_dataset(
    "parquet",
    data_files={
        "train": "data/customer_intent_train.parquet",
        "test": "data/customer_intent_test.parquet",
    },
)

In [None]:
# Show the train split (the notebook renders the value of a cell's last expression).
intents['train']

In [None]:
from huggingface_hub import HfApi

api = HfApi()
# Create the dataset repository on the Hugging Face Hub that the data is
# pushed to. exist_ok=True keeps this cell safe to re-run: without it,
# create_repo raises an error when the repository already exists.
api.create_repo(
    repo_id="alexwaolson/customer-intents",
    repo_type="dataset",
    exist_ok=True,
)

In [None]:
# Upload every split of the dataset to the Hub repository.
intents.push_to_hub(repo_id="alexwaolson/customer-intents")

In [None]:
from collections import Counter

# Count how many training examples carry each intent label, to check how
# balanced the classes are.
Counter(intents["train"]["label"])

In [None]:
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint whose tokenizer we load.
model_name = "t5-small"
# AutoTokenizer resolves the matching tokenizer class from the checkpoint
# name, so no model-specific tokenizer class has to be named here.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize a sample sentence and display the raw tokenizer output.
tokenizer('Hello, how are you?')

In [None]:
def show_tokenization(tokenizer, text):
    """Print how *tokenizer* splits *text* into tokens.

    The original text is printed first, followed by one line per token ID
    showing the decoded piece (padded to 10 characters) and the ID itself.

    Args:
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry
            and exposing a ``decode`` method that accepts a list of IDs.
        text: The sentence to tokenize.
    """
    print(f'Original text: {text}')
    encoded = tokenizer(text, truncation=True)
    for token_id in encoded['input_ids']:
        # Decode each ID on its own so every token gets its own output row.
        piece = tokenizer.decode([token_id])
        print(f'{piece:10} -> {token_id}')

# Try it out: replace the placeholder with any sentence to see its tokens.
show_tokenization(tokenizer=tokenizer, text='your sentence here')

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"message"`` field of *examples*.

    Uses the notebook-level ``tokenizer`` with truncation enabled and returns
    its output unchanged.
    """
    messages = examples["message"]
    return tokenizer(messages, truncation=True)

In [None]:
# Run the tokenizer over every split, handing examples to
# preprocess_function in batches rather than one at a time.
tokenized_intents = intents.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Show the train split after tokenization to see the columns the tokenizer added.
tokenized_intents['train']

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of a batch with the tokenizer's padding
# settings and returns PyTorch tensors ("pt").
data_collator = DataCollatorWithPadding(
    return_tensors="pt",
    tokenizer=tokenizer,
)

In [None]:
import evaluate

# Load the accuracy metric; compute_metrics uses it during evaluation.
accuracy = evaluate.load(path="accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (the model outputs, or a
            tuple whose first element is assumed to be the logits) and
            ``label_ids`` (the reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes.
    """
    raw_outputs = eval_pred.predictions

    # Some models return a tuple of outputs; only the first entry is used,
    # on the assumption that it holds the logits.
    logits = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs

    # The predicted class is the index of the largest score along axis 1.
    predicted_classes = np.argmax(np.array(logits), axis=1)

    return accuracy.compute(
        predictions=predicted_classes,
        references=eval_pred.label_ids,
    )


In [None]:
# Build the two lookup tables between label strings and integer ids; ids
# follow the order in which unique('label') returns the training labels.
id2label = dict(enumerate(intents['train'].unique('label')))
label2id = {label: i for i, label in id2label.items()}

def encode_label(example):
    """Replace the string label of *example* with its integer id.

    The id comes from the notebook-level ``label2id`` mapping; a label that
    is missing from that mapping raises ``KeyError``. The example is modified
    in place and also returned.
    """
    label_name = example['label']
    example['label'] = label2id[label_name]
    return example

# Swap the string labels for integer ids in every split, one example at a time.
tokenized_intents = tokenized_intents.map(function=encode_label)

In [None]:
# Show the train split again now that the labels have been integer-encoded.
tokenized_intents['train']

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pre-trained checkpoint with a sequence-classification head.
# - Reuse model_name so the model always matches the checkpoint the
#   tokenizer was loaded from.
# - Take the number of classes from the label mapping instead of a
#   hard-coded count, so the head stays in sync with id2label/label2id if
#   the dataset's label set changes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written; an existing directory is overwritten.
    output_dir='model',
    overwrite_output_dir=True,
    # Training length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Step-based evaluation, with the eval and save intervals both set to
    # 10 steps.
    eval_strategy="steps",
    eval_steps=10,
    save_steps=10,
    # Logging destination and interval.
    logging_dir='logs',
    logging_steps=10,
)

# Wire the model, data, batching and metric together for training.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_intents['test'],
    train_dataset=tokenized_intents['train'],
)

In [None]:
# Start fine-tuning with the configuration given to the Trainer.
trainer.train()