In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled intent data; the code below reads its 'text' and 'label' columns.
df = pd.read_csv('/content/data_train.csv')

# Build the label vocabulary from the FULL label column and sort it.
# - Iterating a bare set() of strings yields an order that can change between
#   kernel sessions (string hashing is randomised per process), which would make
#   the class ids of a saved checkpoint meaningless after a restart.
# - Using only the training split would raise KeyError below for any label that
#   happens to land exclusively in the validation split.
label_to_id = {label: idx for idx, label in enumerate(sorted(set(df['label'].tolist())))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# 80/20 train/validation split with a fixed seed so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

# Replace label names with their integer class ids.
train_labels = [label_to_id[label] for label in train_labels]
val_labels = [label_to_id[label] for label in val_labels]


In [None]:
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same options: truncate to at most 128
# tokens and pad each batch of texts to a common length.
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
import torch
from torch.utils.data import Dataset

class IntentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (must support ``.items()``).
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick example ``idx`` out of every encoded field and convert it to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored under the 'labels' key alongside the features.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and integer labels in a torch Dataset.
train_dataset = IntentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IntentDataset(encodings=val_encodings, labels=val_labels)


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Start from the pretrained encoder with a fresh classification head sized to the
# number of intents. The label names are stored in the model config (id2label /
# label2id) so that the checkpoint written by save_pretrained() is self-describing
# and class ids can be mapped back to intent names after reloading.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# restore the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Log the training loss every epoch. Without this the step-based default
    # interval is never reached on a run this short (144 optimisation steps in
    # total), and the "Training Loss" column only ever shows "No log".
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    # Keep only the most recent checkpoints on disk instead of one full copy of
    # the model per epoch; the best checkpoint is still retained for
    # load_best_model_at_end.
    save_total_limit=2,
)

# The Trainer owns the training/evaluation loop for the model and both datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,0.63157
2,No log,0.401687
3,No log,0.288441
4,No log,0.214875
5,No log,0.15149
6,No log,0.10259
7,No log,0.066076
8,No log,0.061331
9,No log,0.055772
10,No log,0.042579


TrainOutput(global_step=144, training_loss=0.20593655109405518, metrics={'train_runtime': 632.8321, 'train_samples_per_second': 1.82, 'train_steps_per_second': 0.228, 'total_flos': 7104126062592.0, 'train_loss': 0.20593655109405518, 'epoch': 12.0})

In [None]:
# Run one evaluation pass with the Trainer (it was given val_dataset as its
# eval_dataset) and print the returned metrics dictionary.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


Evaluation results: {'eval_loss': 0.03598760813474655, 'eval_runtime': 1.9758, 'eval_samples_per_second': 12.653, 'eval_steps_per_second': 2.025, 'epoch': 12.0}


In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory
# so the pair can be reloaded together.
export_dir = './intent_classifier'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


('./intent_classifier/tokenizer_config.json',
 './intent_classifier/special_tokens_map.json',
 './intent_classifier/vocab.txt',
 './intent_classifier/added_tokens.json')

In [None]:
import torch

def predict_intent(text):
    """Classify ``text`` and return the predicted intent label.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id_to_label``.

    Args:
        text: The utterance to classify.

    Returns:
        The label that ``id_to_label`` associates with the highest-scoring class.
    """
    # Inference must run with dropout disabled.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Move the inputs to wherever the model lives; after training on an
    # accelerator the model is no longer on the CPU and a mismatch would raise.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return id_to_label[predicted_class]

# Smoke-test the classifier on a handful of hand-written utterances.
sample_utterances = [
    "Can you tell me the price of this product?",
    "Hello, I need some assistance!",
    "Show me items under $50.",
    "Could you describe this product?",
    "What is the status of my order?",
]
for utterance in sample_utterances:
    print(predict_intent(utterance))


price inquiry
greeting
price range inquiry
description
description


In [None]:

def predict_intent(text):
    """Classify ``text`` and return ``(label, confidence)``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id_to_label``.
    NOTE: this definition replaces any earlier ``predict_intent`` in the
    notebook that returned only the label.

    Args:
        text: The utterance to classify.

    Returns:
        A tuple of the predicted label and the softmax probability (a float)
        assigned to that class.
    """
    # Inference must run with dropout disabled.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Move the inputs to wherever the model lives; after training on an
    # accelerator the model is no longer on the CPU and a mismatch would raise.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    # Softmax turns the logits into a probability distribution over classes.
    probabilities = torch.softmax(outputs.logits, dim=1)
    confidence = probabilities[0][predicted_class].item()
    return id_to_label[predicted_class], confidence

# Print the (label, confidence) pair for a handful of hand-written utterances.
sample_utterances = [
    "Can you tell me the price of this product?",
    "Hello, I need some assistance!",
    "Show me items under $50.",
    "Could you describe this product?",
    "What happen with my account",
]
for utterance in sample_utterances:
    print(predict_intent(utterance))

('price inquiry', 0.977112889289856)
('greeting', 0.9901940226554871)
('price range inquiry', 0.9703112840652466)
('description', 0.9706349968910217)
('description', 0.9010961055755615)


In [None]:

from google.colab import files
import shutil

# Zip the saved model directory into intent_classifier.zip in the working directory.
shutil.make_archive(
    base_name='intent_classifier',
    format='zip',
    root_dir='./intent_classifier',
)

# Ask the Colab front end to download the archive to the local machine.
files.download('intent_classifier.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>