# huBERTusHeil

Deep Learning on German Federal Parliament Speeches.

In [None]:
# Prerequisites
! pip install torch 
! pip install transformers
! pip install shap

In [None]:
# DL Libs
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

In [None]:
# Data Science Libs
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [None]:
# Constants
# Name of the pretrained checkpoint; used below for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-german-cased'

## Preprocessing

In [None]:
# Pull the contents of preprocess_speeches.py into this cell via the IPython `%load` magic.
# NOTE(review): presumably that script produces 'rede_fraktion_preprocessed.csv.gz' read below — confirm.
%load preprocess_speeches.py

## Tokenization / Encoding

In [None]:
# Load the pretrained tokenizer that belongs to PRE_TRAINED_MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Read the preprocessed data; later cells use its 'text' and 'fraktion' columns.
df = pd.read_csv('rede_fraktion_preprocessed.csv.gz')

In [None]:
# Number of rows per `fraktion` value (class balance check).
df.fraktion.value_counts()

In [None]:
import numpy as np

# Class weights for the loss: ratio of the largest class count to each class
# count, clipped to the range [0, 10]. The label order below is the order in
# which the weights are laid out.
fraktion_counts = df.fraktion.value_counts()
label_order = ["AfD", "B90", "FDP", "Linke", "SPD", "Union", "fraktionslos"]
largest_class = fraktion_counts.max()
weights = np.clip(largest_class / fraktion_counts[label_order].values, 0, 10)
weights

In [None]:
# Tokenize a single example row to inspect the word pieces the tokenizer produces.
encoding = tokenizer.encode_plus(
  df.loc[3632, 'text'],
  max_length=3000,
  truncation=True,          # the keyword is `truncation`; `truncate` is not a tokenizer argument
  add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',     # replaces the deprecated `pad_to_max_length=True`
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
MAX_LEN = 512


def preprocessing_for_bert(data):
    """Tokenize texts into fixed-length id/mask tensors for a pretrained BERT.

    Every text is encoded with the module-level `tokenizer`, truncated to and
    padded up to `MAX_LEN` tokens (including `[CLS]` and `[SEP]`).

    @param    data: Sized iterable of texts (e.g. list, np.array or pd.Series).
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # `len(data)` works for lists as well as arrays/Series (unlike `data.shape[0]`).
    for text in tqdm(data, total=len(data)):
        # `encode_plus` will:
        #    (1) Tokenize the text
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad the text to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded = tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad
            truncation=True,                # Truncate explicitly instead of relying on the fallback
            padding='max_length',           # Replaces the deprecated `pad_to_max_length=True`
            return_attention_mask=True      # Return attention mask
            )

        # Add the outputs to the lists
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    # Convert lists to tensors (all rows have length MAX_LEN, so this is rectangular)
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
from sklearn.model_selection import train_test_split

# Features and one-hot labels are taken from the SAME frame so rows stay aligned
# (`df_filtered` is not defined anywhere in this notebook).
X = df['text']
y = pd.get_dummies(df['fraktion']).values

# Split texts, labels and original row positions in one call so the three
# splits are guaranteed to correspond to each other.
X_train, X_val, y_train, y_val, train_idxs, val_idxs =\
    train_test_split(X, y, np.arange(df.shape[0]), test_size=0.1, random_state=2346)

# Sanity check: are there missing texts in the validation split?
X_val.isnull().values.any()

In [None]:
# Cast the one-hot labels to a fixed-width integer type. `np.long` was removed
# in NumPy 1.24 and is platform-dependent where it exists, so use int64.
y_train = y_train.astype(np.int64)
y_val = y_val.astype(np.int64)
# Cache the labels so training can be resumed from the "Data Loading" section.
np.savez("y_preprocessed.npz", y_train=y_train, y_val=y_val)

In [None]:
# Tokenize the training and validation texts with `preprocessing_for_bert`,
# then cache the resulting id/mask arrays on disk.
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)
np.savez(
    "X_preprocessed.npz",
    train_inputs=train_inputs,
    train_masks=train_masks,
    val_inputs=val_inputs,
    val_masks=val_masks,
)

In [None]:
# Persist the tokenizer files into the local "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")

## Data Loading

In [None]:
import numpy as np

# Restore the cached labels ...
with np.load("y_preprocessed.npz") as npzf:
    y_train = npzf["y_train"]
    y_val = npzf["y_val"]

# ... and the cached token ids / attention masks.
with np.load("X_preprocessed.npz") as npzf:
    train_inputs = npzf["train_inputs"]
    train_masks = npzf["train_masks"]
    val_inputs = npzf["val_inputs"]
    val_masks = npzf["val_masks"]

In [None]:
# Turn the loaded id/mask arrays into torch tensors (each one is copied).
train_inputs, train_masks, val_inputs, val_masks = (
    torch.tensor(array)
    for array in (train_inputs, train_masks, val_inputs, val_masks)
)

In [None]:
# Quick look at the label arrays: shapes of both splits and the raw validation labels.
print(y_val.shape, y_train.shape)
print(y_val)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Labels as tensors
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# Datasets bundle (input ids, attention mask, label) per sample
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order, validation batches in dataset order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

## Model

In [None]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
import torch.nn as nn
# Create the BertClassifier class
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.
    """
    def __init__(self, freeze_bert=False, hidden_size=25, num_labels=7):
        """
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model,
                      `True` to train only the classification head.
        @param    hidden_size (int): Width of the hidden layer of the head.
        @param    num_labels (int): Number of output classes (logits).
        """
        super(BertClassifier, self).__init__()

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = self.bert.config.hidden_size, hidden_size, num_labels

        # Classification head: Linear -> ReLU -> BatchNorm -> Linear.
        # The batch-norm width is tied to H so it cannot drift from the linear layers.
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.BatchNorm1d(H),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

## Training

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Build a fresh classifier together with its optimizer and LR schedule.

    The model is moved to the module-level `device`. The schedule decays
    linearly without warmup over `epochs` passes of the module-level
    `train_dataloader`.

    Returns the tuple (model, optimizer, scheduler).
    """
    model = BertClassifier(freeze_bert=False)
    model.to(device)

    optim = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

    # One scheduler step is taken per training batch
    n_steps = len(train_dataloader) * epochs
    lr_schedule = get_linear_schedule_with_warmup(
        optim,
        num_warmup_steps=0,
        num_training_steps=n_steps,
    )
    return model, optim, lr_schedule

import random
import time

# Specify loss function
# Cross-entropy with per-class weights taken from the `weights` array computed
# earlier in the notebook, moved to the selected `device`.
loss_fn = nn.CrossEntropyLoss(weight=torch.FloatTensor(weights).to(device))

def set_seed(seed_value=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.

    NOTE: besides its arguments, this function uses the module-level
    `optimizer`, `scheduler`, `loss_fn` and `device` objects; they must have
    been created (see `initialize_model`) before calling it.

    @param    model: the classifier to train (updated in place).
    @param    train_dataloader: yields (input_ids, attention_mask, one-hot labels).
    @param    val_dataloader: validation batches; required when `evaluation` is set.
    @param    epochs (int): number of passes over `train_dataloader`.
    @param    evaluation (bool): run `evaluate` on `val_dataloader` after each epoch.
    @raise    ValueError: if `evaluation` is set but no `val_dataloader` is given.
    """
    # Fail fast instead of crashing after a full (expensive) training epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("`val_dataloader` must be provided when `evaluation` is True.")

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode (evaluate() leaves it in eval mode)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch_counts += 1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Labels are one-hot encoded; the loss expects class indices
            loss = loss_fn(logits, b_labels.argmax(axis=1))
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed since the last report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Time elapsed for the whole epoch, including evaluation
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Measure loss and accuracy of `model` on a validation set.

    Uses the module-level `loss_fn` and `device`.

    @param    model: the classifier to evaluate (left in eval mode afterwards).
    @param    val_dataloader: yields (input_ids, attention_mask, one-hot labels).
    @return   val_loss: mean of the per-batch losses.
    @return   val_accuracy: percentage of correctly classified samples, computed
                  over all samples (not as a mean of per-batch accuracies, which
                  would over-weight a smaller final batch).
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    batch_losses = []
    n_correct, n_samples = 0, 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Labels are one-hot encoded; convert to class indices once
        targets = b_labels.argmax(dim=1)

        with torch.no_grad():
            # Compute logits and loss without building an autograd graph
            logits = model(b_input_ids, b_attn_mask)
            batch_losses.append(loss_fn(logits, targets).item())

        # Get the predictions and count the hits
        preds = torch.argmax(logits, dim=1)
        n_correct += (preds == targets).sum().item()
        n_samples += targets.size(0)

    # An empty dataloader yields NaN for both metrics instead of raising.
    val_loss = np.mean(batch_losses) if batch_losses else float("nan")
    val_accuracy = 100.0 * n_correct / n_samples if n_samples else float("nan")

    return val_loss, val_accuracy

In [None]:
# NOTE: the epoch count is passed twice (scheduler length and training loop); keep both in sync.
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)
train(bert_classifier, train_dataloader, val_dataloader, epochs=3, evaluation=True)

In [None]:
def get_curr_timestamp():
    """Return the current local time formatted as 'YYYYMMDD-HHMM'."""
    from datetime import datetime
    now = datetime.now()
    return f"{now:%Y%m%d-%H%M}"
# Save only the model weights (state dict) under a timestamped file name.
torch.save(bert_classifier.state_dict(), f"{get_curr_timestamp()}-3ep.pt")

## Load saved model

In [None]:
# Re-create the architecture and restore the trained weights.
bert_classifier = BertClassifier(freeze_bert=False)
# `map_location=device` lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
bert_classifier.load_state_dict(torch.load("20210809-1337-4epochs.pt", map_location=device))
bert_classifier.to(device)

## Model Evaluation: Confusion Matrix

In [None]:
def evaluate_confusion_matrix(model, dataloader):
    """Run `model` over `dataloader` and collect predicted and true class indices.

    Uses the module-level `device`. Labels in the batches are one-hot encoded.
    Returns the tuple (y_pred, y_true) of concatenated numpy arrays.
    """
    # Evaluation mode: dropout layers are disabled
    model.eval()

    predicted_batches, target_batches = [], []

    for batch in tqdm(dataloader):
        # Move the whole batch to the compute device
        ids, mask, onehot = (t.to(device) for t in batch)

        # Forward pass without gradient tracking
        with torch.no_grad():
            scores = model(ids, mask)

        # Most likely class per sample
        guessed = torch.argmax(scores, dim=1).flatten()

        target_batches.append(onehot.argmax(axis=1).cpu().numpy())
        predicted_batches.append(guessed.cpu().numpy())

    return np.concatenate(predicted_batches), np.concatenate(target_batches)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Predictions vs. ground truth on the validation loader
pred, true = evaluate_confusion_matrix(bert_classifier, val_dataloader)
# Display labels are given in the same order used for the class weights earlier in the notebook.
disp = ConfusionMatrixDisplay(confusion_matrix(true, pred), display_labels=["AfD", "B90", "FDP", "Linke", "SPD", "Union", "fraktionslos"])
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Text report for the `true` / `pred` arrays produced by the confusion-matrix cell.
print(classification_report(true, pred))

## Model Evaluation: Latent Space

In [None]:
def get_latent_vector(model, input_ids, attention_mask):
    """Return the output of the classifier's first layer for the `[CLS]` token.

    The inputs are passed through `model.bert`; the hidden state at sequence
    position 0 is then fed to `model.classifier[0]` only.
    """
    encoder_out = model.bert(input_ids=input_ids, attention_mask=attention_mask)
    cls_state = encoder_out[0][:, 0, :]
    first_layer = model.classifier[0]
    return first_layer(cls_state)
def evaluate_latent_space(model, dataloader):
    """Compute latent vectors (see `get_latent_vector`) for every sample in `dataloader`.

    Uses the module-level `device`. Labels in the batches are one-hot encoded.
    Returns the tuple (latent_vectors, y_true) of concatenated numpy arrays.
    """
    # Evaluation mode: dropout layers are disabled
    model.eval()

    latent_batches, target_batches = [], []

    for batch in tqdm(dataloader):
        # Move the whole batch to the compute device
        ids, mask, onehot = (t.to(device) for t in batch)

        # Latent representation, no gradient tracking needed
        with torch.no_grad():
            latent = get_latent_vector(model, ids, mask)

        latent_batches.append(latent.cpu().numpy())
        target_batches.append(onehot.argmax(axis=1).cpu().numpy())

    return np.concatenate(latent_batches), np.concatenate(target_batches)

### Latent space of validation set

In [None]:
# Latent vectors of the validation loader, exported with their class names.
lss, true = evaluate_latent_space(bert_classifier, val_dataloader)
labels = np.array(["AfD", "B90", "FDP", "Linke", "SPD", "Union", "fraktionslos"])[true]
# Use a dedicated name: assigning to `df` here would overwrite the speeches
# frame loaded at the top of the notebook and break re-running earlier cells.
df_latent_val = pd.DataFrame(lss)
df_latent_val["fraktion"] = labels
df_latent_val.to_csv(f"latent_space_val_{get_curr_timestamp()}.csv", index=False)

### Latent space for all speeches

In [None]:
# training set
# Iterate in dataset order (no shuffling) so rows line up with `train_idxs`.
train_sampler_seq = SequentialSampler(train_data)
train_dataloader_seq = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler_seq)

lss, true = evaluate_latent_space(bert_classifier, train_dataloader_seq)
df_train = pd.DataFrame(lss).assign(
    fraktion=np.array(["AfD", "B90", "FDP", "Linke", "SPD", "Union", "fraktionslos"])[true],
    original_index=train_idxs,
)


In [None]:
# validation set
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

lss, true = evaluate_latent_space(bert_classifier, val_dataloader)
df_val = pd.DataFrame(lss).assign(
    fraktion=np.array(["AfD", "B90", "FDP", "Linke", "SPD", "Union", "fraktionslos"])[true],
    original_index=val_idxs,
)


In [None]:
# Write to csv
# Stack the train and validation frames, sort by "original_index" and write them to a single file.
pd.concat([df_train, df_val], axis=0).sort_values("original_index").to_csv(f"latent_space_all_{get_curr_timestamp()}.csv", index=False)

## Test out the model

In [None]:
import matplotlib.pyplot as plt
def predict(x):
    """Return class probabilities for one or more texts.

    Uses the module-level `tokenizer`, `bert_classifier` and `device`.

    @param    x: a single string, a list of strings, or an np.ndarray whose
                  elements are converted to strings.
    @return   np.ndarray of shape (number of texts, number of classes) holding
                  the softmax of the model's logits.
    """
    if isinstance(x, str):
        x = [x]
    if isinstance(x, np.ndarray):
        x = list(map(str, x))
    inp = tokenizer(
                text=x,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
                max_length=512,                 # Max length to truncate/pad
                truncation=True,                # Truncate explicitly instead of relying on the fallback
                padding='max_length',           # Replaces the deprecated `pad_to_max_length=True`
                return_attention_mask=True      # Return attention mask
                )
    input_ids = torch.tensor(inp["input_ids"]).to(device)
    mask = torch.tensor(inp["attention_mask"]).to(device)
    bert_classifier.eval()
    # No autograd graph is built here, so no `.detach()` is needed before `.numpy()`.
    with torch.no_grad():
        return torch.softmax(bert_classifier(input_ids, mask), dim=1).cpu().numpy()

def classify_sentence(sentence):
    """Draw a bar chart of the predicted class probabilities for `sentence`."""
    fraktion_names = ["AfD", "B90", "FDP", "Linke", "SPD", "Union", "fraktionslos"]
    probabilities = np.squeeze(predict(sentence))
    plt.bar(fraktion_names, probabilities)


In [None]:
# Example: plot the predicted class distribution for a single sentence.
classify_sentence("Wir brauchen ein generelles Tempolimit auf den Autobahnen")

## Shapley value

In [None]:
import shap
def predict_one(x):
    """Return only the first class column (AfD) of `predict(x)`."""
    afd_column = 0
    return predict(x)[:, afd_column]
# Build a SHAP explainer around `predict_one`, passing the tokenizer as the second argument.
explainer = shap.Explainer(predict_one, tokenizer)


In [None]:
# Explain the AfD probability (see `predict_one`) for one example sentence.
shap_values = explainer(["Wir brauchen ein generelles Tempolimit auf den deutschen Autobahnen."], fixed_context=1)

# Render the explanation of the first (only) sentence as text.
shap.plots.text(shap_values[0])

In [None]:
# `shap.plots.bar` shows the figure by default, after which `plt.savefig` may
# write an empty image. Keep the figure open with `show=False`, save it, then show it.
shap.plots.bar(shap_values[0], show=False)
plt.savefig("autobahn_afd.png")
plt.show()