In [1]:
import json
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# JSON file
file_path = 'MultiWoZ_data/combined_validate.json'

with open(file_path) as f:
    data = json.load(f)

utterances_validate = []
slot_types_validate = []

for dialogue in data:
    for turn in dialogue["turns"]:
        if turn["speaker"] == "USER":
            current_slots = set()
            for frame in turn.get("frames", []):
                # Exclude "hotel-stars" slot value
                current_slots.update(slot for slot in frame.get("state", {}).get("slot_values", {}).keys() if slot != "hotel-stars")
            if current_slots:  # only slot_values present
                utterances_validate.append(turn["utterance"])
                slot_types_validate.append(list(current_slots))

df_validated = pd.DataFrame({'utterance': utterances_validate, 'slot': slot_types_validate})


In [2]:
with open('MultiWoZ_data/combined_train.json', 'r') as file: 
    data = json.load(file)

utterances_train = []
slot_types_train = []

for dialogue in data:
    for turn in dialogue["turns"]:
        if turn["speaker"] == "USER":
            current_slots = set()
            for frame in turn.get("frames", []):
                # Exclude "hotel-stars" slot value
                current_slots.update(slot for slot in frame.get("state", {}).get("slot_values", {}).keys() if slot != "hotel-stars")
            if current_slots:  # Only turns where slot_values present
                utterances_train.append(turn["utterance"])
                slot_types_train.append(list(current_slots))

df_train = pd.DataFrame({'utterance': utterances_train, 'slot': slot_types_train})

In [3]:
with open('MultiWoZ_data/combined_test.json', 'r') as file:
    data = json.load(file)

utterances_test = []
slot_types_test = []

for dialogue in data:
    for turn in dialogue["turns"]:
        if turn["speaker"] == "USER":
            current_slots = set()
            for frame in turn.get("frames", []):
                # Exclude "hotel-stars" slot value
                current_slots.update(slot for slot in frame.get("state", {}).get("slot_values", {}).keys() if slot != "hotel-stars")
            if current_slots:  # Only turns where slot_values present
                utterances_test.append(turn["utterance"])
                slot_types_test.append(list(current_slots))

df_test = pd.DataFrame({'utterance': utterances_test, 'slot': slot_types_test})

In [4]:

all_slots = pd.concat([
    df_train['slot'].explode(),
    df_validated['slot'].explode(),
    df_test['slot'].explode()
]).unique()

# Dictionary mapping each unique slot to a unique index
slot_labels = {slot: idx for idx, slot in enumerate(all_slots)}

df_train['labels'] = df_train['slot'].apply(lambda slots: [slot_labels[slot] for slot in slots])
df_validated['labels'] = df_validated['slot'].apply(lambda slots: [slot_labels[slot] for slot in slots])
df_test['labels'] = df_test['slot'].apply(lambda slots: [slot_labels[slot] for slot in slots])

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=sorted(slot_labels.values()))
train_labels = mlb.fit_transform(df_train['labels'])
validate_labels = mlb.transform(df_validated['labels'])
test_labels = mlb.transform(df_test['labels'])


In [5]:
assert df_train['labels'].isna().sum() == 0, "Missing labels in training data"
assert df_validated['labels'].isna().sum() == 0, "Missing labels in validation data"
assert df_test['labels'].isna().sum() == 0, "Missing labels in test data"


In [6]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='BERT_cache_folder')

def encode_data(tokenizer, texts, max_length=128):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

# Encode 
encoded_inputs_train = encode_data(tokenizer, df_train['utterance'].tolist())
encoded_inputs_validate = encode_data(tokenizer, df_validated['utterance'].tolist())
encoded_inputs_test = encode_data(tokenizer, df_test['utterance'].tolist())


In [7]:
# Example check to ensure alignment
assert len(encoded_inputs_train['input_ids']) == len(train_labels), "Mismatch in training data and labels count."
assert len(encoded_inputs_validate['input_ids']) == len(validate_labels), "Mismatch in validation data and labels count."
assert len(encoded_inputs_test['input_ids']) == len(test_labels), "Mismatch in test data and labels count."


In [8]:
import torch
from torch.utils.data import Dataset

class UtteranceDataset(Dataset):
    def __init__(self, encodings, labels):
        
        self.encodings = {k: v if isinstance(v, torch.Tensor) else torch.tensor(v) for k, v in encodings.items()}
        
        self.labels = torch.tensor(labels, dtype=torch.float) if not isinstance(labels, torch.Tensor) else labels

    def __getitem__(self, idx):

        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

dataset_train = UtteranceDataset(encoded_inputs_train, train_labels)
dataset_validate = UtteranceDataset(encoded_inputs_validate, validate_labels)
dataset_test = UtteranceDataset(encoded_inputs_test, test_labels)


In [9]:

if torch.cuda.is_available():
    device = torch.device("cuda:2")
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available, using CPU instead.")


CUDA is available. Using GPU: NVIDIA GeForce GTX 1080 Ti


In [10]:
from transformers import TrainerCallback, TrainerState, TrainerControl
from tqdm.auto import tqdm

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, patience=3):
        self.patience = patience
        self.best_loss = float('inf')
        self.early_stop_counter = 0
        self.progress_bar = None

    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training...")
        self.progress_bar = tqdm(total=state.num_train_epochs)

    def on_epoch_begin(self, args, state, control, **kwargs):
        # No action needed at the beginning of each epoch
        pass

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            current_loss = logs.get('eval_loss')
            if current_loss:
                if current_loss < self.best_loss:
                    self.best_loss = current_loss
                    self.early_stop_counter = 0
                else:
                    self.early_stop_counter += 1
                    if self.early_stop_counter >= self.patience:
                        control.should_training_stop = True

    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        self.progress_bar.update(1)

        if logs:
            print(f"\nEpoch {state.epoch + 1} Summary:")
            if 'loss' in logs:
                print(f"Training Loss: {logs['loss']:.4f}")
            if 'eval_loss' in logs:
                print(f"Validation Loss: {logs['eval_loss']:.4f}")
            if 'eval_accuracy' in logs:
                print(f"Validation Accuracy: {logs['eval_accuracy']:.4f}")
            print(f"Best Validation Loss So Far: {self.best_loss:.4f}")
            if self.early_stop_counter > 0:
                print(f"No improvement in validation loss for {self.early_stop_counter} consecutive epoch(s).")
            if control.should_training_stop:
                print("Early stopping triggered.")

    def on_train_end(self, args, state, control, **kwargs):
        self.progress_bar.close()
        print("Training completed.")


In [15]:
from transformers import BertForSequenceClassification


num_labels = train_labels.shape[1]  

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [16]:
import torch
from torch.utils.data import DataLoader
from transformers import get_scheduler
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm

# Setup DataLoader and other components
train_loader = DataLoader(dataset_train, batch_size=8, shuffle=True, drop_last=True)
validate_loader = DataLoader(dataset_validate, batch_size=8, shuffle=False, drop_last=True)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 10  # Assuming 10 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)
loss_function = BCEWithLogitsLoss()

# Best loss tracking
best_validation_loss = float('inf')

# Training loop
progress_bar = tqdm(total=num_training_steps, desc="Training progress")
model.train()

for epoch in range(10):  
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch.pop('labels')  
        outputs = model(**batch)
        logits = outputs.logits  
        loss = loss_function(logits, labels.float())  
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    # Validation at the end of each epoch
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in validate_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop('labels')
            outputs = model(**batch)
            logits = outputs.logits
            loss = loss_function(logits, labels.float())
            total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validate_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")

    # Update the best loss and save model if it's the best
    if avg_val_loss < best_validation_loss:
        best_validation_loss = avg_val_loss
        print(f"New Best Validation Loss: {best_validation_loss}")
        model.save_pretrained("./model_save_slots")
        tokenizer.save_pretrained("./model_save_slots")

progress_bar.close()


Training progress:  10%|█████████▋                                                                                       | 485/4850 [00:54<08:13,  8.85it/s]

Epoch 1, Validation Loss: 0.3889070252577464
New Best Validation Loss: 0.3889070252577464


Training progress:  20%|███████████████████▍                                                                             | 970/4850 [01:54<07:16,  8.89it/s]

Epoch 2, Validation Loss: 0.3489765190716946
New Best Validation Loss: 0.3489765190716946


Training progress:  30%|████████████████████████████▊                                                                   | 1455/4850 [02:56<06:49,  8.30it/s]

Epoch 3, Validation Loss: 0.3426017783807986
New Best Validation Loss: 0.3426017783807986


Training progress:  40%|██████████████████████████████████████▍                                                         | 1940/4850 [03:58<05:35,  8.68it/s]

Epoch 4, Validation Loss: 0.32464923957983655
New Best Validation Loss: 0.32464923957983655


Training progress:  50%|████████████████████████████████████████████████                                                | 2427/4850 [05:03<16:15,  2.48it/s]

Epoch 5, Validation Loss: 0.3369812152602456


Training progress:  60%|█████████████████████████████████████████████████████████▋                                      | 2912/4850 [06:01<13:01,  2.48it/s]

Epoch 6, Validation Loss: 0.35189073419932165


Training progress:  70%|███████████████████████████████████████████████████████████████████▏                            | 3397/4850 [07:00<09:45,  2.48it/s]

Epoch 7, Validation Loss: 0.3817657209707029


Training progress:  80%|████████████████████████████████████████████████████████████████████████████▊                   | 3882/4850 [07:59<06:29,  2.48it/s]

Epoch 8, Validation Loss: 0.40987531028010626


Training progress:  90%|██████████████████████████████████████████████████████████████████████████████████████▍         | 4367/4850 [08:58<03:14,  2.48it/s]

Epoch 9, Validation Loss: 0.4448308405099493


Training progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 4850/4850 [09:57<00:00,  8.12it/s]

Epoch 10, Validation Loss: 0.46458253518424253





In [18]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from torch.utils.data import DataLoader

test_loader = DataLoader(dataset_test, batch_size=8, shuffle=False, drop_last=True)

model.eval()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

predictions, true_labels = [], []

# Evaluate the model
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)  
        preds = (probs > 0.5).int()  
        predictions.append(preds.detach().cpu().numpy())
        true_labels.append(labels.detach().cpu().numpy())

# Convert list of arrays to single numpy arrays
predictions = np.vstack(predictions)
true_labels = np.vstack(true_labels)

# Calculate metrics, consider each label independently
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')  
accuracy = accuracy_score(true_labels, predictions)  

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.3956043956043956
Precision: 0.7264565494894015
Recall: 0.691124241129953
F1 Score: 0.7051808157194924


In [20]:

# Convert binary predictions and true labels back to label format
predicted_labels = mlb.inverse_transform(predictions)
true_labels_readable = mlb.inverse_transform(true_labels)

# Ensure target names are strings for classification report
target_names = [str(cls) for cls in mlb.classes_]

# Evaluate performance using the original label format
print("Final Test Set Results:")
print(classification_report(true_labels, predictions, target_names=target_names))

# Calculate additional metrics
hamming_loss_value = hamming_loss(true_labels, predictions)
print("Hamming Loss:", hamming_loss_value)

jaccard = jaccard_score(true_labels, predictions, average='samples')
print("Jaccard Score:", jaccard)

Final Test Set Results:
              precision    recall  f1-score   support

           0       0.65      0.62      0.64       183
           1       0.69      0.49      0.57       186
           2       0.67      0.70      0.68       222
           3       0.59      0.66      0.62       178
           4       0.81      0.68      0.74       167
           5       0.80      0.68      0.74       165
           6       0.77      0.68      0.72       162
           7       0.74      0.69      0.72       175
           8       0.63      0.67      0.65       200
           9       0.82      0.89      0.86       283
          10       0.62      0.65      0.64       130
          11       0.81      0.87      0.84       276
          12       0.82      0.87      0.85       277
          13       0.71      0.63      0.67        90
          14       0.75      0.56      0.64       146

   micro avg       0.73      0.71      0.72      2840
   macro avg       0.73      0.69      0.71      2840
we

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
import numpy as np
from sklearn.metrics import classification_report, hamming_loss, jaccard_score
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader

# Load the model and tokenizer
model_path = "model_save_synth_slots"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

test_loader = DataLoader(dataset_test, batch_size=8, shuffle=False, drop_last=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

predictions, true_labels = [], []

# Evaluate model
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)  # Use sigmoid to output probabilities
        preds = (probs > 0.5).int()  # Convert probabilities to binary output
        predictions.append(preds.detach().cpu().numpy())
        true_labels.append(labels.detach().cpu().numpy())

# Convert predictions and true labels to a suitable format using MLB
predictions = np.vstack(predictions)
true_labels = np.vstack(true_labels)

# Convert binary predictions and true labels back to label format
predicted_labels = mlb.inverse_transform(predictions)
true_labels_readable = mlb.inverse_transform(true_labels)

# Ensure target names are strings for classification report
target_names = [str(cls) for cls in mlb.classes_]

# Evaluate performance using the original label format
print("Final Test Set Results:")
print(classification_report(true_labels, predictions, target_names=target_names))

# Calculate additional metrics
hamming_loss_value = hamming_loss(true_labels, predictions)
print("Hamming Loss:", hamming_loss_value)

jaccard = jaccard_score(true_labels, predictions, average='samples')
print("Jaccard Score:", jaccard)


Final Test Set Results:
              precision    recall  f1-score   support

           0       0.05      0.02      0.02       183
           1       0.05      0.02      0.03       222
           2       0.06      0.06      0.06       178
           3       0.00      0.00      0.00       186
           4       0.00      0.00      0.00       167
           5       0.00      0.00      0.00       165
           6       0.46      0.48      0.47       162
           7       0.00      0.00      0.00       175
           8       1.00      0.01      0.02       200
           9       0.18      0.04      0.06       131
          10       0.07      0.04      0.05       284
          11       0.03      0.00      0.01       277
          12       0.50      0.00      0.01       278
          13       0.00      0.00      0.00        91
          14       0.00      0.00      0.00       147

   micro avg       0.16      0.04      0.06      2846
   macro avg       0.16      0.04      0.05      2846
we

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
