<a href="https://colab.research.google.com/github/BishwaKandel/CollegeRS/blob/Rabi/intent_classification_mBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intent Classification using mBERT
This notebook demonstrates how to fine-tune multilingual BERT (mBERT, `bert-base-multilingual-cased`) from Hugging Face Transformers as an intent classification model.

In [3]:
!pip install transformers datasets scikit-learn torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:

import inspect

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [2]:
import pandas as pd

# Each training example is written as an (utterance, intent) pair so that a text and its
# label always sit side by side; the pairs are unzipped into two parallel columns below.
intent_examples = [
    # Greetings
    ("hi", "welcomegreeting"),
    ("hello", "welcomegreeting"),
    ("bye", "endgreeting"),
    ("thank you", "thankgreet"),
    ("thanks", "thankgreet"),
    # General college information
    ("what is the college address", "address"),
    ("where is the college located", "address"),
    ("how many students are in the college", "student_strength"),
    ("what is the student strength", "student_strength"),
    # Civil engineering
    ("tell me about civil engineering", "civil_engineering"),
    ("how many seats in civil", "civil_intake"),
    ("fee for civil engineering", "civil_fee"),
    ("duration of civil course", "civil_duration"),
    ("criteria for civil admission", "civil_criteria"),
    # Computer engineering
    ("tell me about computer engineering", "computer_engineering"),
    ("how many seats in computer", "computer_intake"),
    ("fee for computer engineering", "computer_fee"),
    ("duration of computer course", "computer_duration"),
    ("criteria for computer admission", "computer_criteria"),
    # Electrical engineering
    ("tell me about electrical engineering", "electrical_engineering"),
    ("how many seats in electrical", "electrical_intake"),
    ("fee for electrical engineering", "electrical_fee"),
    ("duration of electrical course", "electrical_duration"),
    ("criteria for electrical admission", "electrical_criteria"),
    # AI and Data Science
    ("tell me about AI and Data Science", "ai&ds_engineering"),
    ("how many seats in ai and ds", "ai&ds_intake"),
    ("fee for ai and ds", "ai&ds_fee"),
    ("duration of ai and ds course", "ai&ds_duration"),
    ("criteria for ai and ds admission", "ai&ds_criteria"),
    # Mechanical engineering
    ("tell me about mechanical engineering", "mechanical_engineering"),
    ("how many seats in mechanical", "mechanical_intake"),
    ("fee for mechanical engineering", "mechanical_fee"),
    ("duration of mechanical course", "mechanical_duration"),
    ("criteria for mechanical admission", "mechanical_criteria"),
    # Admissions, scholarships, contact
    ("what is the admission process", "admission_enquiry"),
    ("when is the last date of admission", "admission_enquiry"),
    ("do you offer scholarships", "scholorship"),
    ("how to apply for scholarships", "scholorship"),
    ("what is the contact number of the college", "contact"),
    ("how to reach the college", "address"),
    # Courses, results, naming, visits
    ("what departments are available", "course"),
    ("list all courses offered", "course"),
    ("what is the pass to fail ratio", "pass_fail_ratio"),
    ("placement ratio in college", "pass_fail_ratio"),
    ("college name please", "college_name"),
    ("what is the name of the engineering college", "college_name"),
    ("can I visit the campus", "campus_visit"),
    ("what are the visiting hours", "visit_hours"),
    ("when is the best time to visit the college", "visit_hours"),
]

# Unzip the pairs into the column-oriented mapping used to build the DataFrame.
texts, labels = zip(*intent_examples)
data = {"text": list(texts), "label": list(labels)}

# Persist the examples as CSV (without the index column) and preview the first rows.
df = pd.DataFrame(data)
df.to_csv("intent_data.csv", index=False)
df.head()


Unnamed: 0,text,label
0,hi,welcomegreeting
1,hello,welcomegreeting
2,bye,endgreeting
3,thank you,thankgreet
4,thanks,thankgreet


In [4]:

# Load the intent examples and map every intent name to an integer class id.
df = pd.read_csv("intent_data.csv")
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Give the Dataset only the text and the integer ids, exposed under the name `labels`.
# Passing the whole frame would leave the string `label` column in place: Trainer treats a
# column called `label` as the target and discards `label_encoded`, so batching would try
# to turn intent names into a tensor and fail.
dataset = Dataset.from_pandas(
    df[['text', 'label_encoded']].rename(columns={'label_encoded': 'labels'}),
    preserve_index=False,
)

model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(example):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated but deliberately not padded here: padding is left to
    batch-collation time so each batch is only padded to its own longest sequence.

    Args:
        example: Mapping with a ``text`` entry (a list of strings when mapped with
            ``batched=True``).

    Returns:
        The tokenizer output for the given text.
    """
    return tokenizer(example['text'], truncation=True)


dataset = dataset.map(preprocess, batched=True)

# Fixed seed so the train/test split is the same on every run.
# NOTE(review): most intents have only one or two examples, so the held-out 20% will contain
# intents that never appear in the training split; treat the reported metrics accordingly.
dataset = dataset.train_test_split(test_size=0.2, seed=42)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [None]:

# One output class per distinct intent seen by the label encoder.
num_intents = len(label_encoder.classes_)

# Load the pretrained checkpoint with a sequence-classification head of that size.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_intents)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:

def compute_metrics(eval_pred):
    """Score evaluation predictions with accuracy and weighted F1.

    Args:
        eval_pred: Pair of ``(logits, labels)``; the predicted class for each row is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }


In [None]:

# Recent transformers releases renamed `evaluation_strategy` to `eval_strategy`. Because the
# install above is not pinned, pick whichever keyword the installed version accepts so this
# cell works on both older and newer releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; training loss is logged every 10 steps.
training_args = TrainingArguments(
    output_dir="./intent_model",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},
)


In [None]:

# Newer transformers releases take the tokenizer through `processing_class`; the older
# `tokenizer` keyword is deprecated. Choose the keyword supported by the installed version
# so the cell keeps working with an unpinned install.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Passing the tokenizer lets the Trainer pad each batch at collation time and store the
# tokenizer alongside its checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)
trainer.train()


In [None]:

# Write the model weights and the tokenizer files into one directory so the pair can be
# reloaded together later.
save_dir = "intent_classifier_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:

from transformers import pipeline

# Reload the saved model and tokenizer from disk as a text-classification pipeline.
saved_dir = "intent_classifier_model"
clf = pipeline("text-classification", model=saved_dir, tokenizer=saved_dir)

test_sentences = ["hi", "tell me about the college", "ai and ds fees", "bye"]
for sentence in test_sentences:
    top_prediction = clf(sentence)[0]
    # The parsing assumes the pipeline label ends in "_<class id>"; that trailing number is
    # mapped back to the intent name via the label encoder fitted earlier in this notebook.
    class_id = int(top_prediction['label'].rsplit("_", 1)[-1])
    intent_name = label_encoder.inverse_transform([class_id])[0]
    print(f"Input: {sentence} --> Predicted Intent: {intent_name}")
