In [3]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
n_epochs = 5
l_rate = 5e-5
w_decay = 0.001

In [6]:
path = r"C:\Users\alepa\Desktop\DB_Alessandro.xlsx"

In [7]:
data = pd.read_excel(path, header=None)
data = data.iloc[:, :156]
labels = data.iloc[0, 0::2].tolist()
speeches = data.iloc[0, 1::2].tolist()

labels = [int(label) for label in labels]

data = pd.DataFrame({'label': labels, 'speech': speeches})

In [8]:
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'])

In [9]:
class SpeechDataset(Dataset):
    def __init__(self, speeches, labels, tokenizer, max_len=512):
        self.speeches = speeches
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.speeches)

    def __getitem__(self, item):
        speech = str(self.speeches[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            speech,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = SpeechDataset(train_data['speech'].tolist(), train_data['label'].tolist(), tokenizer)
test_dataset = SpeechDataset(test_data['speech'].tolist(), test_data['label'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.train()
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=l_rate,
                              weight_decay=w_decay)

In [12]:
def live_plot(train_data, test_data, figsize=(10, 5), title=''):
    clear_output(wait=True)

    # Create a DataFrame for easier plotting with seaborn
    data_train = pd.DataFrame(train_data)
    data_train['Type'] = 'Train'
    data_test = pd.DataFrame(test_data)
    data_test['Type'] = 'Test'
    data = pd.concat([data_train, data_test], axis=0)
    data.reset_index(inplace=True)
    data.rename(columns={'index': 'Epoch'}, inplace=True)

    sns.set(style='dark')

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

    sns.lineplot(x='Epoch', y='Loss', hue='Type',
                 data=data.melt(id_vars=['Epoch', 'Type'], value_vars=['Train Loss', 'Test Loss'], var_name='Metric',
                              value_name='Loss'), ax=ax1)
    ax1.set_title('Loss')
    ax1.set_xlabel('Epoch')
    ax1.legend(loc='upper right')

    sns.lineplot(x='Epoch', y='Accuracy', hue='Type',
                 data=data.melt(id_vars=['Epoch', 'Type'], value_vars=['Train Accuracy', 'Test Accuracy'],
                              var_name='Metric', value_name='Accuracy'), ax=ax2)
    ax2.set_title('Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.legend(loc='lower right')

    plt.suptitle(title)
    plt.tight_layout()
    plt.show()


train_metrics = {'Train Loss': [], 'Train Accuracy': []}
test_metrics = {'Test Loss': [], 'Test Accuracy': []}

train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

In [13]:

for epoch in range(n_epochs):
    model.train()
    total_train_loss, total_train_accuracy = 0, 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        total_train_accuracy += (outputs.logits.argmax(dim=-1) == batch['labels']).float().mean().item()

    train_metrics['Train Loss'].append(total_train_loss / len(train_loader))
    train_metrics['Train Accuracy'].append(total_train_accuracy / len(train_loader))

    model.eval()
    total_test_loss, total_test_accuracy = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            total_test_loss += loss.item()
            total_test_accuracy += (outputs.logits.argmax(dim=-1) == batch['labels']).float().mean().item()

    test_metrics['Test Loss'].append(total_test_loss / len(test_loader))
    test_metrics['Test Accuracy'].append(total_test_accuracy / len(test_loader))

    live_plot(train_metrics, test_metrics, title='Real-time Training and Testing Metrics')

In [30]:
def predict_speech(speech, model, tokenizer):
    inputs = tokenizer.encode_plus(
        speech,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)


    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(0)
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_label = probabilities.argmax(dim=-1).item()
        confidence = probabilities.max().item()


    label_map = {0: "Not Populist", 1: "Populist"}
    print(f"Predicted Label: {label_map[predicted_label]} (Confidence: {confidence:.2f})")


user_speech = input("Enter a speech to classify: ")
predict_speech(user_speech, model, tokenizer)


Predicted Label: Populist (Confidence: 0.97)
