# DistilBERT Intent Classifier
Fine-tune a DistilBERT model using HuggingFace Transformers for intent classification.

In [2]:
import inspect
import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Wall-clock reference point; the final cell reports elapsed time from here.
start_time = time.time()

# Read the raw dataset and attach an integer `label` column derived from the
# string `intent` column. The fitted encoder is kept so that class names can
# be recovered later from `label_encoder.classes_`.
label_encoder = LabelEncoder()
df = pd.read_csv("intent_dataset.csv").assign(
    label=lambda frame: label_encoder.fit_transform(frame['intent'])
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


RuntimeError: Failed to import transformers.models.distilbert.modeling_distilbert because of the following error (look up to see its traceback):
module 'torch' has no attribute 'compiler'

In [None]:
# Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def _as_hf_dataset(encodings, labels):
    """Bundle tokenizer output and integer labels into a HuggingFace Dataset.

    Only `input_ids` and `attention_mask` are taken from `encodings`;
    `labels` is a pandas Series that is converted to a plain list.
    """
    columns = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = labels.tolist()
    return HFDataset.from_dict(columns)


# Each split is tokenized in a single call, so padding=True pads to the
# longest sequence within that split.
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(list(X_train), **tokenize_kwargs)
test_encodings = tokenizer(list(X_test), **tokenize_kwargs)

train_dataset = _as_hf_dataset(train_encodings, y_train)
test_dataset = _as_hf_dataset(test_encodings, y_test)


In [None]:
# Model setup
# One output logit per intent class known to the fitted label encoder.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# Newer transformers releases renamed TrainingArguments(evaluation_strategy=...)
# to eval_strategy and Trainer(tokenizer=...) to processing_class, and the old
# spellings are deprecated/removed. Older releases only know the old names.
# Choose the keyword from the installed signature so the cell runs on both.
training_args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_args_params else "evaluation_strategy"
)

trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    **{eval_strategy_key: "epoch"},
)

# NOTE(review): eval_dataset is the held-out test split, so the checkpoint
# chosen by load_best_model_at_end is selected using test data. Consider a
# separate validation split if the reported test metrics must be unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    **{tokenizer_key: tokenizer},
)

trainer.train()


In [None]:
# Predictions and evaluation
# `predictions` holds the raw per-class scores; argmax picks the predicted class id.
preds = trainer.predict(test_dataset).predictions
y_pred = np.argmax(preds, axis=1)

# LabelEncoder assigns ids 0..n_classes-1 in the order of `classes_`. Pin that
# full label set explicitly: otherwise scikit-learn infers labels from the
# values present in y_test/y_pred, and a class missing from the test split
# makes `target_names` / `display_labels` mismatch in length (ValueError) or
# silently line up with the wrong rows.
class_ids = np.arange(len(label_encoder.classes_))

print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
)

cm = confusion_matrix(y_test, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(xticks_rotation=45)
plt.title("Confusion Matrix")
plt.show()


In [None]:
# Save best model and label encoder
# Single source of truth for the output directory. The default is the original
# location; set the BERT_CHECKPOINT_DIR environment variable to write elsewhere
# (e.g. when running on another machine) without editing the notebook.
checkpoint_dir = os.environ.get(
    "BERT_CHECKPOINT_DIR",
    "C:/Users/Anirudh/Desktop/nlp_project/checkpoints/bert",
)
os.makedirs(checkpoint_dir, exist_ok=True)

# Model weights/config and tokenizer files go side by side in the same
# directory, together with the pickled label encoder.
model.save_pretrained(checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)

# The fitted encoder is stored so predicted class ids can be mapped back to
# intent names later.
joblib.dump(label_encoder, os.path.join(checkpoint_dir, "label_encoder.pkl"))
print("Model and tokenizer saved.")


In [None]:
# Runtime logging
# Elapsed wall-clock time since `start_time` was recorded in the first cell,
# so the figure covers everything executed in between, not just trainer.train().
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Training completed in {elapsed_seconds:.2f} seconds")
