In [1]:

!pip install transformers datasets torch scikit-learn pandas


Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-1


# 1. Load Dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Correct path in Kaggle
file_path = "/kaggle/input/tickets/all_tickets_processed_improved_v3.csv"

# Load dataset
data = pd.read_csv(file_path) #convert dataset to dataframe 

# Train/test split 80% train 20% test
X_train, X_test, y_train, y_test = train_test_split(
    data['Document'], data['Topic_group'],
    test_size=0.2, random_state=42 #fixes the “random split” so it’s always the same rows in train/test every time you run.
)

# Encode labels : the transformer model doesn’t understand text labels
le = LabelEncoder() # map strings ↔ numbers
y_train_enc = le.fit_transform(y_train) # ["Billing", "Technical", "Account", "Billing"] --> [1, 2, 0, 1]
y_test_enc = le.transform(y_test)

# Documents → vectorized (feature matrix)

# Labels → encoded (target array)

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Example X_train:", X_train.iloc[0])
print("Example y_train:", y_train_enc[0])



Train size: 38269
Test size: 9568
Example X_train: suspect message do not open block user tuesday november user tuesday november dear find attached revert regards
Example y_train: 0


# 2. Tokenizer & Model


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(le.classes_)
)

# Tokenize
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

2025-10-02 14:19:39.986275: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759414780.306713      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759414780.401390      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. PyTorch Dataset

In [4]:
import torch

class TicketDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = TicketDataset(train_encodings, list(y_train_enc))
test_dataset = TicketDataset(test_encodings, list(y_test_enc))

# 4. Metrics

In [5]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# 5. Trainer Setup

In [6]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,  
    metric_for_best_model="f1",
    weight_decay=0.01,
    learning_rate=5e-5,
    warmup_steps=100,
    logging_dir="./logs",
    report_to="none", 
)

# use smaller max length
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

train_dataset = TicketDataset(train_encodings, list(y_train_enc))
test_dataset = TicketDataset(test_encodings, list(y_test_enc))


data_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# 6. Early Stopping

In [7]:
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # add here
)


# 7. Train

In [8]:
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.64,0.453056,0.853679,0.854062
2,0.2999,0.371725,0.875314,0.875332
3,0.1765,0.397509,0.881584,0.881508




TrainOutput(global_step=3588, training_loss=0.3721255341766936, metrics={'train_runtime': 1196.5063, 'train_samples_per_second': 95.952, 'train_steps_per_second': 2.999, 'total_flos': 3802452983949312.0, 'train_loss': 0.3721255341766936, 'epoch': 3.0})

# 8. Save model + label encoder

In [9]:
model.save_pretrained("./transformer_model")
tokenizer.save_pretrained("./transformer_model")
import joblib
joblib.dump(le, "./transformer_model/label_encoder.pkl")

['./transformer_model/label_encoder.pkl']