In [3]:
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments


In [None]:
# Notebook-wide settings; later cells read these names as-is.
EPOCHS = 3  # passed to TrainingArguments as num_train_epochs
Model_Name = "models/SemanticVAD_3.pt"  # path the trained state_dict is saved to and reloaded from
NUM_Labels = 2  # NOTE(review): not referenced by any visible cell; the model emits a single logit

#### Load Data

In [None]:
from datasets import Value

# Tokenizer used both for dataset encoding and, later, by the Trainer.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Read the training CSV and clean the text column in one chained expression:
# drop every bracketed span (non-greedy `[...]`) and trim surrounding whitespace.
df = pd.read_csv("dataset/train_1500.csv").assign(
    text=lambda frame: frame['text'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
)
dataset = Dataset.from_pandas(df)

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch, padding/truncating to 128 tokens.

    Args:
        examples: a batch (mapping) holding a ``"text"`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    tokenizer_kwargs = {"truncation": True, "padding": "max_length", "max_length": 128}
    batch_text = examples["text"]
    return tokenizer(batch_text, **tokenizer_kwargs)


# Tokenize every row in batches using preprocess_function.
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% for evaluation. The seed is fixed so that the split -- and therefore the
# reported accuracy/F1 -- is the same on every Restart & Run All.
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)
encoded_dataset = encoded_dataset.rename_column("labels", "label")
# Targets are cast to float32 because the model's forward() feeds them to a
# binary cross-entropy-with-logits loss, which expects floating-point targets.
encoded_dataset = encoded_dataset.cast_column("label", Value("float32"))

# Expose only the model inputs and the target as torch tensors.
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/1499 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1199 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

#### Construct Model

In [None]:
from transformers import DistilBertModel
# Imported here so the class is usable as soon as this cell has run, instead of
# relying on a later cell to bring the name into scope.
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn


class DistilBERTBackchannelScorer(nn.Module):
    """DistilBERT encoder with a small MLP head that emits one logit per input.

    The logit is trained with binary cross-entropy (with logits); apply a sigmoid
    to obtain a probability.

    Args:
        hidden_dim: input width of the classifier head. It must equal the width of
            the encoder's ``last_hidden_state`` (default 768).
    """

    def __init__(self, hidden_dim=768):
        super().__init__()
        self.encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Encode the inputs and score them.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: optional mask forwarded to the encoder.
            labels: optional targets; when given, they are cast to float and a
                binary cross-entropy-with-logits loss is computed.

        Returns:
            SequenceClassifierOutput with ``logits`` of shape (batch_size,) and
            ``loss`` (``None`` when ``labels`` is not supplied).
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # CLS token
        logits = self.classifier(pooled).squeeze(-1)  # shape: (batch_size,)
        loss = None
        if labels is not None:
            # Functional form: same mean-reduced loss without building a module per call.
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels.float())
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

#### Training Setting

In [7]:
from transformers.modeling_outputs import SequenceClassifierOutput
# Seed torch BEFORE building the model: the classifier head is randomly initialised at
# construction time, so without this its starting weights differ on every run.
# 42 is the same value passed to TrainingArguments below.
torch.manual_seed(42)
model = DistilBERTBackchannelScorer()

training_args = TrainingArguments(
    num_train_epochs=EPOCHS,
    seed=42,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    eval_strategy="epoch",
    output_dir="./checkpoints",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=20,
    # After training, restore the checkpoint with the highest eval F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="tensorboard"
)


def compute_metrics(eval_pred):
    """Turn raw evaluation logits into accuracy and F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    # Sigmoid maps logits to probabilities; anything above 0.5 counts as the positive class.
    positive_prob = torch.sigmoid(torch.tensor(raw_scores)).numpy()
    predicted = (positive_prob > 0.5).astype(int)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }


# Wire the model, the two dataset splits and the metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated in recent transformers releases and emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


#### Train

In [8]:
# Fine-tune using the settings in training_args (evaluation runs every epoch).
trainer.train()
# Final metrics on the eval_dataset given to the Trainer; as the cell's last
# expression, the returned dict is displayed as the cell output.
trainer.evaluate()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5357,0.179368,0.95,0.970297
2,0.0764,0.115211,0.973333,0.984436
3,0.0117,0.099011,0.98,0.988281




{'eval_loss': 0.09901116788387299,
 'eval_accuracy': 0.98,
 'eval_f1': 0.98828125,
 'eval_runtime': 0.5588,
 'eval_samples_per_second': 536.89,
 'eval_steps_per_second': 134.222,
 'epoch': 3.0}

In [None]:
# save model
# torch.save does not create missing folders, so make sure the target directory exists.
Path(Model_Name).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), Model_Name)


#### Quick Check

In [10]:
from transformers import DistilBertTokenizerFast
import torch

# load model
model = DistilBERTBackchannelScorer()
# map_location="cpu" deserialises the weights onto the CPU regardless of the device they
# were saved from, so a checkpoint written during GPU training also loads on a CPU-only
# machine. The model built above has not been moved to any other device.
model.load_state_dict(torch.load(Model_Name, map_location="cpu"))
model.eval()  # inference mode: disables dropout

# load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def predict(text):
    """Score a single utterance with the module-level ``model`` and ``tokenizer``.

    Args:
        text: the utterance to classify.

    Returns:
        ``(probability, name)`` where ``probability`` is the sigmoid of the model's
        logit and ``name`` is ``"Interruption"`` when it exceeds 0.5, otherwise
        ``"Backchannel"``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():  # inference only, no gradients needed
        raw_logit = model(**encoded).logits.squeeze()
        score = torch.sigmoid(raw_logit).item()
    verdict = "Interruption" if score > 0.5 else "Backchannel"
    return score, verdict


In [11]:
# Spot-check a handful of utterances, printing one (probability, label) tuple per line.
sample_utterances = [
    "yeah, right, I see",
    "Are you sure?",
    "I don't think so. You should say something different.",
    "There is a long...",
    "wait, can I jump in here?",
]
for utterance in sample_utterances:
    print(predict(utterance))

(0.003060677321627736, 'Backchannel')
(0.032934341579675674, 'Backchannel')
(0.9972732663154602, 'Interruption')
(0.9980196952819824, 'Interruption')
(0.9977831244468689, 'Interruption')


In [12]:
import time
# Measure single-utterance latency. perf_counter() is a monotonic, high-resolution clock,
# unlike time.time(), which follows the wall clock and can jump. Only the predict() call
# is timed so that console output does not inflate the figure.
start = time.perf_counter()
result = predict("wait, can I jump in here?")
elapsed = time.perf_counter() - start
print(result)
print(f"Ref time: {elapsed}")

(0.9977831244468689, 'Interruption')
Ref time: 0.020711898803710938


In [13]:
import time
# Measure single-utterance latency. perf_counter() is a monotonic, high-resolution clock,
# unlike time.time(), which follows the wall clock and can jump. Only the predict() call
# is timed so that console output does not inflate the figure.
start = time.perf_counter()
result = predict("Gotcha")
elapsed = time.perf_counter() - start
print(result)
print(f"Ref time: {elapsed}")

(0.004400672856718302, 'Backchannel')
Ref time: 0.01857900619506836
