In [1]:
import os
os.environ["KERAS_BACKEND"] = "torch"
# import sys
# sys.path.append(r".\src\utils")
import keras
from keras import layers
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import utils.model as model
from utils.transcript import *
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Data

In [2]:
folder_path = r'C:\Users\frbre\OneDrive\01 Dokumenter\01 Uni\SDS Thesis\data\Word_test'

In [3]:
# Load data
df = load_data_from_folder(folder_path)
df

Unnamed: 0,sentence,scale_1,scale_2,scale_3,scale_4,scale_5
0,"Lorem ipsum dolor sit amet, consectetur adipis...",0,1,0,0,0
1,Nunc a lectus eget justo pretium ultricies. Do...,1,0,0,0,0
2,Cras eget ullamcorper lacus. Donec a vestibulu...,0,0,1,0,0
3,In a euismod nunc. Phasellus commodo iaculis e...,0,0,0,1,0
4,Donec viverra metus ac orci aliquet pellentesq...,0,0,1,0,0
...,...,...,...,...,...,...
479,Trial morning some lead moment. Analysis sourc...,1,0,0,0,0
480,When ok focus write. Can science unit imagine....,1,0,0,0,0
481,Believe training box daughter son. Safe defens...,1,0,0,0,0
482,Hear keep husband alone under professional ass...,1,0,0,0,0


In [5]:
df.iloc[:,1:].sum().sum()

473

In [6]:
code_array = df.iloc[:, 1:].to_numpy()
sentences = df["sentence"].to_list()

In [7]:
code_array

array([[0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [10]:
code_array.shape

(484, 5)

In [9]:
code_array.sum()

473

## Tokenizer

In [42]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', max_length=512, padding=True, truncation=True)

In [43]:
# Tokenize texts and map the tokens to their word IDs.
input_ids = []

for sent in sentences:
    encoded_text = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    input_ids.append(encoded_text)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam vehicula tellus ut sem rhoncus sodales. Mauris porta ultricies ligula, sit amet placerat diam tristique nec. Nullam id orci efficitur justo fringilla malesuada in eu tortor. Maecenas lectus sem, porta in sapien vel, finibus dictum tellus. Pellentesque aliquam elit in tellus efficitur rhoncus. Integer id lacinia nisi, non elementum quam. Proin eros nunc, aliquet eget blandit in, efficitur et lacus. Mauris egestas ultrices lacus sit amet consectetur. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc at diam quis nulla rhoncus aliquet. Suspendisse in elit non nibh porttitor gravida nec eget velit. Etiam rutrum bibendum nulla, vel pellentesque sem. Vivamus aliquet vitae ipsum ut auctor. Vestibulum bibendum condimentum turpis sit amet laoreet. Nunc fermentum blandit sapien, sed pulvinar lorem laoreet id. Suspendisse vitae sagittis dolor, a viverra turpis. 
Token IDs: [101, 91473, 10451, 177, 13221, 10465,

In [44]:
# Check max sentence length
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  326


In [45]:
# # Pad our input tokens
input_ids = keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=512, dtype="long", truncating="post", padding="post", value=0)

# Create attention masks
attention_masks = []

for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [46]:
# Make train/val split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, label_array, 
                                                            random_state=2018, test_size=0.1)
# Performing same steps on the attention masks
train_masks, validation_masks, _, _ = train_test_split(attention_masks, label_array,
                                             random_state=2018, test_size=0.1)



In [47]:
# Convert to tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create dataloaders
batch_size = 2

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

## Keras model

In [48]:
# Keras BERT model
keras_model = model.BERTKeras(num_classes=5, hidden_size=768, dropout_prob=0.25)

In [49]:
keras_model.summary()

In [50]:
# Test run
result = keras_model(train_inputs)

In [51]:
result.shape

torch.Size([13, 512, 5])

In [52]:
# Layers are 'unbuilt' until they are called
# After calling the model, it looks like this
keras_model.summary()

## Torch model

In [53]:
torch_model = model.BERTTorch(num_classes=5, hidden_size=768, dropout_prob=0.25)

In [54]:
print(torch_model)

BERTTorch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [55]:
torch_result = torch_model(train_inputs)

In [56]:
torch_result.shape

torch.Size([13, 512, 5])