In [4]:
# Colab-only step: mount the user's Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [7]:
# Read the labelled intent queries from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="intent_dataset.csv")

# Cell output: number of examples per intent class (checks class balance).
df.loc[:, 'Intent Label'].value_counts()

Intent Label
Precise/Urgent Intent       134
Feedback/Opinion Intent     134
Navigational Intent         128
Confirmation Intent         127
Curious Intent              119
Comparative Intent          111
Support/Help Intent         111
Emotional Support Intent    107
Name: count, dtype: int64

In [None]:
# Hold out 20% of the queries for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["User Query"],
    df["Intent Label"],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training queries only, then reuse the
# fitted vectorizer for the held-out queries.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a linear SVM on the TF-IDF features (fit() returns the estimator itself).
model = LinearSVC().fit(X_train_tfidf, y_train)

# Per-class precision / recall / F1 on the held-out queries.
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)

                          precision    recall  f1-score   support

      Comparative Intent       0.91      0.91      0.91        22
     Confirmation Intent       0.93      0.84      0.88        31
          Curious Intent       0.95      0.83      0.88        23
Emotional Support Intent       0.90      0.95      0.92        19
 Feedback/Opinion Intent       0.84      0.91      0.88        23
     Navigational Intent       0.86      0.82      0.84        22
   Precise/Urgent Intent       0.83      0.86      0.85        35
     Support/Help Intent       0.78      0.90      0.84        20

                accuracy                           0.87       195
               macro avg       0.88      0.88      0.87       195
            weighted avg       0.88      0.87      0.87       195



In [9]:
# Manual testing using custom prompt: push one hand-written query through the
# TF-IDF vectorizer and the SVM defined in the earlier cell, then print the label.
new_query = "I want to order a cheeseburger."
new_query_tfidf = vectorizer.transform([new_query])  # wrapped in a list: one document
predicted_intent = model.predict(new_query_tfidf)  # array with a single predicted label
print(f"Predicted Intent: {predicted_intent[0]}")

Predicted Intent: Support/Help Intent


In [3]:
pip install pandas numpy scikit-learn torch transformers nltk

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy
  Downloading numpy-2.2.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting torch
  Downloading torch-2.6.0-cp313-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Col

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Download required NLTK data (tokenizer models, stop-word list, WordNet)
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

class TextPreprocessor:
    """Turn a raw query into a space-joined string of lowercase content words.

    The pipeline is: lowercase -> strip every character that is not an ASCII
    letter or whitespace -> NLTK word tokenisation -> drop English stop words
    and tokens of two characters or fewer.
    """

    def __init__(self):
        # Build the stop-word set once; set membership keeps filtering cheap.
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Clean a single query.

        Args:
            text: The raw query. ``None`` and float NaN (how pandas represents
                a missing cell) are treated as "no text"; any other non-string
                value is converted with ``str()``.

        Returns:
            str: The surviving tokens joined by single spaces; ``''`` when the
            input is missing or nothing survives filtering.
        """
        # Series.apply passes NaN for empty cells; calling .lower() on it would
        # raise AttributeError, so map missing values to an empty string.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and numbers (deleted, not replaced by a
        # space, so "don't" becomes "dont")
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and short words
        tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]

        return ' '.join(tokens)

class IntentDataset(Dataset):
    """Torch dataset that tokenises one query per item for intent classification.

    Args:
        texts: Iterable of query strings (list, NumPy array or pandas Series).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.
        max_length: Every item is padded/truncated to this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise into lists so __getitem__ is strictly positional. A
        # pandas Series coming out of train_test_split keeps its shuffled
        # index, and series[idx] would then be a label lookup (KeyError or the
        # wrong row) instead of "the idx-th sample".
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened to 1-D)
            and ``label`` (scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it
            # away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class BERTIntentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear intent head.

    Args:
        n_classes: Number of intent classes (size of the output logits).
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained_name: Hugging Face checkpoint to load the encoder from.
            Defaults to ``'bert-base-uncased'``.
    """

    def __init__(self, n_classes, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        # Attribute names are kept as-is so previously saved state_dicts
        # still load.
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Read the width from the loaded config instead of hard-coding 768 so
        # checkpoints with a different hidden size work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.

        Returns:
            Tensor of logits with ``n_classes`` entries per input row.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # pooler_output is the encoder's pooled sentence representation
        # (the second element of the output tuple).
        pooled_output = outputs.pooler_output
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

def train_bert_model(model, train_loader, val_loader, device, epochs=10, lr=2e-5):
    """Fine-tune ``model`` with AdamW + cross-entropy and report per-epoch metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        val_loader: Iterable of batches with the same structure, used only
            for evaluation.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.

    Returns:
        list[dict]: One entry per epoch with keys ``'epoch'``,
        ``'train_loss'``, ``'val_loss'`` (mean per batch) and
        ``'val_accuracy'`` (fraction in [0, 1]). A metric is NaN when the
        corresponding loader produced no batches.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    history = []
    nan = float('nan')

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        train_batches = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_batches += 1

        # Validation
        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_batches += 1

                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Batches are counted while iterating so an empty loader yields NaN
        # metrics instead of a ZeroDivisionError.
        mean_train_loss = train_loss / train_batches if train_batches else nan
        mean_val_loss = val_loss / val_batches if val_batches else nan
        val_accuracy_pct = 100 * correct / total if total else nan

        history.append({
            'epoch': epoch + 1,
            'train_loss': mean_train_loss,
            'val_loss': mean_val_loss,
            'val_accuracy': correct / total if total else nan,
        })

        print(f'Epoch {epoch+1}:')
        print(f'Training Loss: {mean_train_loss:.4f}')
        print(f'Validation Loss: {mean_val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy_pct:.2f}%\n')

    return history

# Load and preprocess data
df = pd.read_csv("intent_dataset.csv")
preprocessor = TextPreprocessor()
df['Processed_Query'] = df['User Query'].apply(preprocessor.preprocess)

# Encode labels (string intent names -> integer class ids)
label_encoder = LabelEncoder()
df['Label_Encoded'] = label_encoder.fit_transform(df['Intent Label'])

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split data into training and validation sets.
# test_size and random_state are kept at 0.2 / 42 so the held-out rows stay
# comparable with any other model evaluated on the same seeded split.
X_train, X_val, y_train, y_val = train_test_split(df['Processed_Query'], df['Label_Encoded'], test_size=0.2, random_state=42)

# Seed torch before the loaders and the model are created so batch shuffling,
# dropout and the classifier-head initialisation are repeatable between runs
# (full bit-for-bit determinism on GPU backends is not guaranteed).
torch.manual_seed(42)

# BERT model
# .values hands plain NumPy arrays to the dataset so indexing is positional.
train_dataset = IntentDataset(X_train.values, y_train.values, tokenizer)
val_dataset = IntentDataset(X_val.values, y_val.values, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Pick the fastest available backend: CUDA (e.g. Colab GPU), then Apple MPS,
# then CPU. Previously CUDA machines silently fell back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

bert_model = BERTIntentClassifier(len(label_encoder.classes_)).to(device)
train_bert_model(bert_model, train_loader, val_loader, device)

# Evaluate BERT model: accuracy of the trained classifier on the validation loader
bert_model.eval()
correct, total = 0, 0
with torch.no_grad():  # inference only, no gradients needed
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = bert_model(input_ids, attention_mask)
        # Index of the largest logit per row is the predicted class id.
        predicted = torch.max(outputs, 1)[1]
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

bert_score = correct / total

# Print final results
print("\nFinal Results:")
print(f"BERT Model Accuracy: {bert_score:.4f}")

[nltk_data] Downloading package punkt to /Users/vorad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/vorad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/vorad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/vorad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Epoch 1:
Training Loss: 2.0620
Validation Loss: 1.9170
Validation Accuracy: 35.90%

Epoch 2:
Training Loss: 1.6180
Validation Loss: 1.3041
Validation Accuracy: 69.23%

Epoch 3:
Training Loss: 0.9987
Validation Loss: 0.8999
Validation Accuracy: 76.41%

Epoch 4:
Training Loss: 0.5260
Validation Loss: 0.7549
Validation Accuracy: 81.03%

Epoch 5:
Training Loss: 0.2788
Validation Loss: 0.6429
Validation Accuracy: 84.62%

Epoch 6:
Training Loss: 0.1490
Validation Loss: 0.6789
Validation Accuracy: 84.10%

Epoch 7:
Training Loss: 0.0843
Validation Loss: 0.7167
Validation Accuracy: 84.10%

Epoch 8:
Training Loss: 0.0605
Validation Loss: 0.7143
Validation Accuracy: 83.59%

Epoch 9:
Training Loss: 0.0440
Validation Loss: 0.7043
Validation Accuracy: 85.64%

Epoch 10:
Training Loss: 0.0300
Validation Loss: 0.7152
Validation Accuracy: 84.62%


Final Results:
BERT Model Accuracy: 0.8462


In [6]:
import joblib
import torch

# Persist only the learned weights (state_dict), not the module object itself.
model_weights = bert_model.state_dict()
torch.save(model_weights, "bert_intent_classifier.pth")

# Persist the fitted LabelEncoder so class ids can be mapped back to intent
# names later. Kept as the last expression so the written file list is shown.
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [35]:
import torch
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint (downloaded or
# read from the local Hugging Face cache by from_pretrained).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function to make predictions
def predict_intent(model, text, tokenizer, label_encoder, device, max_length=128, preprocess=None):
    """Predict the intent label for a single piece of text.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits.
        text: The query to classify.
        tokenizer: Tokenizer exposing ``encode_plus``.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to intent labels.
        device: Device the encoded tensors are moved to; ``model`` is expected
            to already live there.
        max_length: Pad/truncate length for tokenisation. Should match the
            value used when the model was trained.
        preprocess: Optional callable applied to ``text`` before tokenisation.
            Pass the same cleaning function that was applied to the training
            data so inference sees the same kind of input; ``None`` leaves the
            text untouched.

    Returns:
        The predicted intent label as produced by ``label_encoder``.
    """
    model.eval()  # Set the model to evaluation mode (disables dropout)

    if preprocess is not None:
        text = preprocess(text)

    # Tokenize input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Run inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)

    # Convert predicted label index to intent label
    predicted_label = label_encoder.inverse_transform([predicted.item()])[0]

    return predicted_label



In [36]:
# Example test prompt
test_prompt = "Where can I find cheese?"

# The classifier was fine-tuned on TextPreprocessor output ('Processed_Query'),
# so the prompt gets the same cleaning before prediction; feeding the raw
# sentence would show the model input it never saw during training.
predicted_intent = predict_intent(bert_model, preprocessor.preprocess(test_prompt), tokenizer, label_encoder, device)

print(f"Predicted Intent: {predicted_intent}")

Predicted Intent: Support/Help Intent
