In [1]:
!pip install transformers datasets accelerate sentencepiece
!pip install evaluate scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import numpy as np
import torch

# 1. Load and prepare the dataset
def load_and_prepare_data():
    # Load the dataset
    df = pd.read_csv("hf://datasets/Tobi-Bueck/customer-support-tickets/dataset-tickets-multi-lang-4-20k.csv")

    # Combine subject and body for full ticket text
    df['full_text'] = df['subject'] + ' ' + df['body']

    # Combine all tag columns into a single list of tags
    tag_columns = [f'tag_{i}' for i in range(1, 9)]
    df['all_tags'] = df[tag_columns].apply(
        lambda row: [tag for tag in row if pd.notna(tag)], axis=1
    )

    # Get all unique tags
    all_tags = list(set(tag for tags in df['all_tags'] for tag in tags))

    # Create multi-label encoding
    for tag in all_tags:
        df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Convert to Hugging Face dataset format
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return train_dataset, test_dataset, all_tags

# 2. Zero-shot classification
def zero_shot_classifier(ticket_text, categories):
    classifier = pipeline("zero-shot-classification",
                        model="facebook/bart-large-mnli")
    result = classifier(ticket_text, categories, multi_label=True)
    return list(zip(result["labels"][:3], result["scores"][:3]))

# 3. Few-shot classification (corrected approach)
def few_shot_classifier(ticket_text, categories, model, tokenizer, top_k=3):
    # Create prompt with examples
    prompt = f"""Classify this support ticket into one or more categories:

Example 1:
Ticket: "I can't log in to my account"
Categories: login, authentication

Example 2:
Ticket: "The app crashes when I open the settings"
Categories: bug, mobile

Example 3:
Ticket: "How do I cancel my subscription?"
Categories: billing, account

Ticket to classify: "{ticket_text}"
Possible categories: {', '.join(categories)}

Return the top {top_k} most relevant categories, comma-separated:"""

    # Tokenize and get predictions
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

    # Get logits for all possible categories
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities
    probs = torch.sigmoid(logits)[0]

    # Get top k predictions
    top_probs, top_indices = torch.topk(probs, k=top_k)

    # Map indices to tags
    results = [(categories[idx], prob.item()) for idx, prob in zip(top_indices, top_probs)]

    return results

# 4. Fine-tuned model approach
def train_fine_tuned_model(train_dataset, test_dataset, categories):
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        return tokenizer(examples["full_text"], padding="max_length", truncation=True)

    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(categories),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
    )

    trainer.train()
    return model, tokenizer

# Main execution
def main():
    # Load data
    train_data, test_data, all_tags = load_and_prepare_data()

    # Example ticket
    sample_ticket = train_data[0]['full_text']
    print(f"\nSample ticket text: {sample_ticket[:200]}...\n")
    print(f"Actual tags: {train_data[0]['all_tags']}\n")

    print("=== Zero-Shot Classification ===")
    zero_shot_results = zero_shot_classifier(sample_ticket, all_tags)
    print("Top 3 tags:", zero_shot_results)

    print("\n=== Few-Shot Classification ===")
    # Initialize a fresh model for few-shot (not fine-tuned yet)
    few_shot_model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(all_tags),
        problem_type="multi_label_classification"
    )
    few_shot_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    few_shot_results = few_shot_classifier(
        sample_ticket,
        all_tags,
        few_shot_model,
        few_shot_tokenizer
    )
    print("Top 3 tags:", few_shot_results)

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  df[tag] = df['all_tags'].apply(lambda x: 1 if tag in x else 0)
  d


Sample ticket text: Advice on Integrating External Tools Customer Support, I am inquiring about integrating third-party tools into our project management SaaS. Could you provide some guidance on this matter? I would grea...

Actual tags: ['Feature', 'Documentation', 'Feedback', 'Tech Support']

=== Zero-Shot Classification ===


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Top 3 tags: [('Customer Support', 0.9990885257720947), ('Customer Service', 0.996067464351654), ('SaaS', 0.995919406414032)]

=== Few-Shot Classification ===


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Top 3 tags: [('MongoDB4.4', 0.5770582556724548), ('Nuendo', 0.5690120458602905), ('Promotion,rabatte,Diienstleistung', 0.5670577883720398)]


In [29]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute metrics for multi-label classification"""
    logits, labels = eval_pred
    predictions = (logits > 0).astype(float)  # Threshold at 0 for multi-label

    # Calculate metrics
    f1 = f1_score(labels, predictions, average="micro", zero_division=0)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average="micro", zero_division=0)
    recall = recall_score(labels, predictions, average="micro", zero_division=0)

    return {
        "f1": f1,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }

def train_fine_tuned_model(train_dataset, test_dataset, all_tags):
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(batch):
        texts = [str(text) for text in batch["full_text"]]
        tokenized = tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        labels = [[1.0 if tag in tags else 0.0 for tag in all_tags] for tags in batch["all_tags"]]
        tokenized["labels"] = torch.tensor(labels, dtype=torch.float32)
        return tokenized

    tokenized_train = train_dataset.map(tokenize_function, batched=True, batch_size=32)
    tokenized_test = test_dataset.map(tokenize_function, batched=True, batch_size=32)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(all_tags),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir="./fine_tuned_model",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",  # Now matches our computed metric
        greater_is_better=True,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,  # Add our metrics function
    )

    trainer.train()
    trainer.save_model("./fine_tuned_model/best_model")
    tokenizer.save_pretrained("./fine_tuned_model/best_model")

    return model, tokenizer

# Example usage:
# Load data
train_data, test_data, all_tags = load_and_prepare_data()
trained_model, trained_tokenizer = train_fine_tuned_model(train_data, test_data, all_tags)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.0118,0.010493,0.387164,0.021,0.697605,0.267932
2,0.0091,0.008361,0.564732,0.05225,0.775898,0.443917
3,0.0084,0.007989,0.584736,0.05325,0.789315,0.464376


In [36]:
def test_multiple_examples(model, tokenizer, all_tags, examples=None):
    """Test multiple support ticket examples with the trained model"""
    if examples is None:
        examples = [
            "I can't login to my account - getting error messages",
            "The mobile app crashes when I try to view my profile",
            "How do I reset my password?",
            "Payment failed but my card was charged",
            "Feature request: dark mode for the dashboard",
            "The website is very slow today"
        ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    print("\n" + "="*50)
    print("Testing Multiple Support Tickets")
    print("="*50)

    for text in examples:
        # Tokenize and move to device
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)

        # Get probabilities and top 3 tags
        probs = torch.sigmoid(outputs.logits)[0].cpu()
        top_probs, top_indices = torch.topk(probs, 3)

        # Format output
        print(f"\nTicket: '{text}'")
        print("Predicted Tags:")
        for i, (idx, prob) in enumerate(zip(top_indices, top_probs), 1):
            print(f"{i}. {all_tags[idx]} ({prob:.1%})")
        print("-"*50)



# Test multiple examples
test_multiple_examples(trained_model, trained_tokenizer, all_tags)

# You can also test with your own examples:
custom_examples = [
        "Where can I download my invoice?",
        "The search function returns wrong results",
        "Two-factor authentication not working"
]
test_multiple_examples(trained_model, trained_tokenizer, all_tags, custom_examples)


Testing Multiple Support Tickets

Ticket: 'I can't login to my account - getting error messages'
Predicted Tags:
1. Billing (61.0%)
2. Resolution (41.6%)
3. Payment (31.6%)
--------------------------------------------------

Ticket: 'The mobile app crashes when I try to view my profile'
Predicted Tags:
1. Technical (84.4%)
2. Resolution (47.8%)
3. Bug (44.3%)
--------------------------------------------------

Ticket: 'How do I reset my password?'
Predicted Tags:
1. Documentation (58.8%)
2. Technical (50.9%)
3. Security (45.1%)
--------------------------------------------------

Ticket: 'Payment failed but my card was charged'
Predicted Tags:
1. Billing (67.4%)
2. Payment (41.6%)
3. Feedback (23.9%)
--------------------------------------------------

Ticket: 'Feature request: dark mode for the dashboard'
Predicted Tags:
1. Technical (50.0%)
2. Performance (40.6%)
3. Documentation (39.4%)
--------------------------------------------------

Ticket: 'The website is very slow today'
Predi