In [1]:
import json
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# JSON file
file_path = 'MultiWoZ_data/combined_validate.json'

with open(file_path) as f:
    data = json.load(f)

utterances = []
intents = []

for dialogue in data:
    for turn in dialogue['turns']:
        if turn['speaker'] == 'USER':  
            for frame in turn.get("frames",[]):
                active_intent= frame["state"]["active_intent"]

                if active_intent != "NONE":
                    utterances.append(turn["utterance"])
                    intents.append(active_intent)

df_validated = pd.DataFrame({'utterance': utterances, 'intent': intents})


In [2]:
# JSON file
file_path = 'MultiWoZ_data/combined_test.json'


with open(file_path) as f:
    data = json.load(f)


utterances = []
intents = []

for dialogue in data:
    for turn in dialogue['turns']:
        if turn['speaker'] == 'USER':  
            for frame in turn.get("frames",[]):
                service=frame.get("service")
                active_intent= frame["state"]["active_intent"]

                if active_intent != "NONE":
                    utterances.append(turn["utterance"])
                    intents.append(active_intent)


df_test = pd.DataFrame({'utterance': utterances, 'intent': intents})

In [3]:
# JSON file
file_path = 'MultiWoZ_data/combined_train.json'


with open(file_path) as f:
    data = json.load(f)


utterances = []
intents = []

for dialogue in data:
    for turn in dialogue['turns']:
        if turn['speaker'] == 'USER':  
            for frame in turn.get("frames",[]):
                service=frame.get("service")
                active_intent= frame["state"]["active_intent"]

                if active_intent != "NONE":
                    utterances.append(turn["utterance"])
                    intents.append(active_intent)

df_train = pd.DataFrame({'utterance': utterances, 'intent': intents})

In [4]:

all_intents = pd.concat([df_train['intent'], df_validated['intent'], df_test['intent']]).unique()
intent_labels = {intent: idx for idx, intent in enumerate(all_intents)}

# Map labels according to the unified dictionary
df_train['label'] = df_train['intent'].map(intent_labels)
df_validated['label'] = df_validated['intent'].map(intent_labels)
df_test['label'] = df_test['intent'].map(intent_labels) 



train_labels = df_train['label'].tolist()
validate_labels = df_validated['label'].tolist()
test_labels = df_test['label'].tolist() 


In [5]:
# Quick check to ensure no labels are missing or incorrectly mapped
assert df_train['label'].isna().sum() == 0, "Missing labels in training data"
assert df_validated['label'].isna().sum() == 0, "Missing labels in validation data"
assert df_test['label'].isna().sum() == 0, "Missing labels in test data"


In [6]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='BERT_cache_folder')

def encode_data(tokenizer, texts, max_length=128):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

# Encode 
encoded_inputs_train = encode_data(tokenizer, df_train['utterance'].tolist())
encoded_inputs_validate = encode_data(tokenizer, df_validated['utterance'].tolist())

encoded_inputs_test = encode_data(tokenizer, df_test['utterance'].tolist())


In [7]:
# Check to ensure alignment
assert len(encoded_inputs_train['input_ids']) == len(train_labels), "Mismatch in training data and labels count."
assert len(encoded_inputs_validate['input_ids']) == len(validate_labels), "Mismatch in validation data and labels count."
assert len(encoded_inputs_test['input_ids']) == len(test_labels), "Mismatch in test data and labels count."


In [8]:
class UtteranceDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() if torch.is_tensor(val[idx]) else val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels) 


dataset_train = UtteranceDataset(encoded_inputs_train, train_labels)
dataset_validate = UtteranceDataset(encoded_inputs_validate, validate_labels)
dataset_test = UtteranceDataset(encoded_inputs_test, test_labels)



In [9]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available, using CPU instead.")


CUDA is available. Using GPU: NVIDIA GeForce GTX 1080 Ti


In [10]:
from transformers import TrainerCallback, TrainerState, TrainerControl
from tqdm.auto import tqdm

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, patience=3):
        self.patience = patience
        self.best_loss = float('inf')
        self.early_stop_counter = 0
        self.progress_bar = None

    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training...")
        self.progress_bar = tqdm(total=state.num_train_epochs)

    def on_epoch_begin(self, args, state, control, **kwargs):
        # No action needed 
        pass

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            current_loss = logs.get('eval_loss')
            if current_loss:
                if current_loss < self.best_loss:
                    self.best_loss = current_loss
                    self.early_stop_counter = 0
                    
                else:
                    self.early_stop_counter += 1
                    if self.early_stop_counter >= self.patience:
                        control.should_training_stop = True

    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        self.progress_bar.update(1)
        # Consolidate print statements to show epoch summaries and best loss
        if logs:
            print(f"\nEpoch {state.epoch + 1} Summary:")
            if 'loss' in logs:
                print(f"Training Loss: {logs['loss']:.4f}")
            if 'eval_loss' in logs:
                print(f"Validation Loss: {logs['eval_loss']:.4f}")
            if 'eval_accuracy' in logs:
                print(f"Validation Accuracy: {logs['eval_accuracy']:.4f}")
            print(f"Best Validation Loss So Far: {self.best_loss:.4f}")
            if self.early_stop_counter > 0:
                print(f"No improvement in validation loss for {self.early_stop_counter} consecutive epoch(s).")
            if control.should_training_stop:
                print("Early stopping triggered.")

    def on_train_end(self, args, state, control, **kwargs):
        self.progress_bar.close()
        print("Training completed.")


In [11]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_labels))

model.to(device) 


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
import torch
from torch.utils.data import DataLoader
from transformers import get_scheduler
from torch.optim import AdamW
from tqdm import tqdm

# Setup DataLoader and other components
train_loader = DataLoader(dataset_train, batch_size=32, shuffle=True, drop_last=True)
validate_loader = DataLoader(dataset_validate, batch_size=32, shuffle=False, drop_last=True)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 10
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

best_validation_loss = float('inf')

progress_bar = tqdm(total=num_training_steps, desc="Training progress")
model.train()

for epoch in range(10):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in validate_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validate_loader)
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    print(f"Validation Loss: {avg_val_loss}")

    # Update the best loss and save model if it's the best
    if avg_val_loss < best_validation_loss:
        best_validation_loss = avg_val_loss
        print(f"New Best Validation Loss: {best_validation_loss}")
        model.save_pretrained("./model_save_intent")
        tokenizer.save_pretrained("./model_save_intent")

progress_bar.close()


Training progress:  10%|█████████▋                                                                                       | 102/1020 [00:37<05:35,  2.74it/s]

Epoch 1, Loss: 5.25622034072876
Validation Loss: 5.404687325159709
New Best Validation Loss: 5.404687325159709


Training progress:  20%|███████████████████▍                                                                             | 204/1020 [01:21<04:55,  2.76it/s]

Epoch 2, Loss: 1.5412678718566895
Validation Loss: 1.5010003646214802
New Best Validation Loss: 1.5010003646214802


Training progress:  30%|█████████████████████████████                                                                    | 306/1020 [02:05<04:22,  2.72it/s]

Epoch 3, Loss: 0.755513608455658
Validation Loss: 0.5077585478623708
New Best Validation Loss: 0.5077585478623708


Training progress:  40%|██████████████████████████████████████▊                                                          | 408/1020 [02:49<03:50,  2.66it/s]

Epoch 4, Loss: 0.48762205243110657
Validation Loss: 0.47188234329223633
New Best Validation Loss: 0.47188234329223633


Training progress:  50%|████████████████████████████████████████████████▌                                                | 510/1020 [03:36<03:16,  2.60it/s]

Epoch 5, Loss: 0.47808510065078735
Validation Loss: 0.43526894350846607
New Best Validation Loss: 0.43526894350846607


Training progress:  60%|██████████████████████████████████████████████████████████▏                                      | 612/1020 [04:20<02:36,  2.61it/s]

Epoch 6, Loss: 0.35020872950553894
Validation Loss: 0.3960024764140447
New Best Validation Loss: 0.3960024764140447


Training progress:  70%|███████████████████████████████████████████████████████████████████▉                             | 715/1020 [05:06<03:26,  1.48it/s]

Epoch 7, Loss: 0.550471305847168
Validation Loss: 0.4458762047191461


Training progress:  80%|█████████████████████████████████████████████████████████████████████████████▋                   | 817/1020 [05:46<02:17,  1.48it/s]

Epoch 8, Loss: 0.6176126599311829
Validation Loss: 0.47262436896562576


Training progress:  90%|███████████████████████████████████████████████████████████████████████████████████████▍         | 919/1020 [06:27<01:11,  1.41it/s]

Epoch 9, Loss: 0.5305882096290588
Validation Loss: 0.4812395398815473


Training progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1020/1020 [07:08<00:00,  2.38it/s]

Epoch 10, Loss: 0.5477834939956665
Validation Loss: 0.4933062916000684





In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
test_loader = DataLoader(dataset_test, batch_size=32, shuffle=False, drop_last=True)  # Set drop_last to True to avoid partial batches


model.eval() 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

predictions, true_labels = [], []

# Evaluate the model
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.detach().cpu().numpy())
        true_labels.extend(labels.detach().cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')  # adjust 'average' as needed

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.8947368421052632
Precision: 0.57334625302404
Recall: 0.597272813729782
F1 Score: 0.5840215632898292


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from collections import Counter

# Find the most common class
most_common_class = Counter(train_labels).most_common(1)[0][0]

# Predict this class for all validation instances
predictions = [most_common_class] * len(dataset_test)  

# True labels from the test set
true_labels_test = [batch['labels'].item() for batch in DataLoader(dataset_test, batch_size=1)]

# Calculate the accuracy of the baseline
baseline_accuracy = sum(1 for true, pred in zip(true_labels_test, predictions) if true == pred) / len(predictions)
print(f"Majority Class Baseline Accuracy: {baseline_accuracy:.4f}")

target_names = [None] * len(intent_labels)
for intent, idx in intent_labels.items():
    target_names[idx] = intent
    

from sklearn.metrics import classification_report

labels = list(intent_labels.values())
print(classification_report(true_labels_test, predictions, labels=labels, target_names=target_names))

Majority Class Baseline Accuracy: 0.3511
                 precision    recall  f1-score   support

     find_hotel       0.35      1.00      0.52       217
     book_hotel       0.00      0.00      0.00       125
find_restaurant       0.00      0.00      0.00         5
     find_train       0.00      0.00      0.00       222
    find_police       0.00      0.00      0.00         0
find_attraction       0.00      0.00      0.00         0
  find_hospital       0.00      0.00      0.00         1
     book_train       0.00      0.00      0.00        48
      find_taxi       0.00      0.00      0.00         0

      micro avg       0.35      0.35      0.35       618
      macro avg       0.04      0.11      0.06       618
   weighted avg       0.12      0.35      0.18       618



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Load the model and tokenizer
model_path = "model_save_intent" 
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
model.eval()


test_loader = DataLoader(dataset_test, batch_size=32, shuffle=False, drop_last=True)

predictions, true_labels = [], []


with torch.no_grad():
    for batch in test_loader:
        
        batch = {k: v.to(device) for k, v in batch.items()}

        labels = batch.pop('labels')
        outputs = model(**batch)
        
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        
        predictions.extend(preds.detach().cpu().numpy())
        true_labels.extend(labels.detach().cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.06414473684210527
Precision: 0.0760515873015873
Recall: 0.016005106763512515
F1 Score: 0.02342806809949626


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
from sklearn.metrics import classification_report

accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print("\nDetailed Classification Report:")
print(classification_report(true_labels, predictions))

Accuracy: 0.06414473684210527
Precision: 0.0760515873015873
Recall: 0.016005106763512515
F1 Score: 0.02342806809949626

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.14      0.25       217
           1       0.06      0.02      0.03       125
           2       0.00      0.00      0.00         5
           3       0.07      0.01      0.02       213
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.04      0.06      0.05        47
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
