In [1]:
import os
os.environ["KERAS_BACKEND"] = "torch"
# import sys
# sys.path.append(r".\src\utils")
import keras
from keras import layers
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import utils.model as model
from utils.transcript import *
import pandas as pd
from sklearn.model_selection import train_test_split
from torchsummary import summary

  from .autonotebook import tqdm as notebook_tqdm


## Data

In [2]:
folder_path = r'C:\Users\frbre\OneDrive\01 Dokumenter\01 Uni\SDS Thesis\data\test'

In [3]:
file_path = r"C:\Users\frbre\OneDrive\01 Dokumenter\01 Uni\SDS Thesis\data\test\1102 JBOV Session 3.doc"

In [4]:
test = load_patient_turns(file_path)
test

['I’m ok. I’m feeling a little, ah, stressed because of everything coming up, um, but/\r',
 'um, yeah, I, uh, my personal statement is like second draft form, and then I’m working on it, and it’s, it’s almost done. my friend Zach read it, and I think he made it more difficult for me to write it.\r',
 'just by giving me constructive criticism and being like, be more specific, y’know, as if the specificity is important, like they want to know exactly what’s going to set you apart from the other people, or highlight this, or make this, and I’m like, ok, alright/\r',
 '(kind of laughing) yeah, but not that one, cause we already did that\r',
 'um, but then there’s preparing all those monologues for my audition.  I know which ones I’m doing and I’ve started working on them, um, and then I have an audition tomorrow, which I’m looking at, and then my class starts on Tuesday, for improv, and I sent an e-mail to those Shakespeare people that I thought I might audition with, and they’ve already c

In [5]:
len(test)

99

In [6]:
test_turns = load_patient_turns_from_folder(folder_path)
test_turns

[['I’m ok. I’m feeling a little, ah, stressed because of everything coming up, um, but/\r',
  'um, yeah, I, uh, my personal statement is like second draft form, and then I’m working on it, and it’s, it’s almost done. my friend Zach read it, and I think he made it more difficult for me to write it.\r',
  'just by giving me constructive criticism and being like, be more specific, y’know, as if the specificity is important, like they want to know exactly what’s going to set you apart from the other people, or highlight this, or make this, and I’m like, ok, alright/\r',
  '(kind of laughing) yeah, but not that one, cause we already did that\r',
  'um, but then there’s preparing all those monologues for my audition.  I know which ones I’m doing and I’ve started working on them, um, and then I have an audition tomorrow, which I’m looking at, and then my class starts on Tuesday, for improv, and I sent an e-mail to those Shakespeare people that I thought I might audition with, and they’ve alre

In [7]:
len(test_turns[0])

99

In [8]:
test_turns[3]

['I uh - forget where we uh - left off - - \r',
 'yeah I think uh - - I left  - - it seems like - when I come in I talk a lot about uh - my relationship with [name of girlfriend]] and uh - - I guess that’s been kind of - y’know uh - bothering me lately or - - concerning - and what we were - I believe talking about - was - y’know my - wanting to - maybe - uh - y’know - tie the knot with her - and uh - get married and have a family - which I think is uh stressful to uh - think about that at this point - and uh I wanted to - there’s a part of me - that wants to do it - and there’s a part of me that doesn’t - and - - the part of me is uh - - the part saying you know uh I guess I think - - part of my struggling with this is that I think your life changes when you get married - - um - you live with someone you uh you you know you make a commitment for life you have kids i think you have to think different you’re whole - going from single to um married I think is eomthign that uh is a challen

In [9]:
split_turns = split_into_chunks(test_turns)
split_turns

[['I’m ok. I’m feeling a little, ah, stressed because of everything coming up, um, but/ um, yeah, I, uh, my personal statement is like second draft form, and then I’m working on it, and it’s, it’s almost done. my friend Zach read it, and I think he made it more difficult for me to write it. just by giving me constructive criticism and being like, be more specific, y’know, as if the specificity is important, like they want to know exactly what’s going to set you apart from the other people, or highlight this, or make this, and I’m like,',
  'ok, alright/ (kind of laughing) yeah, but not that one, cause we already did that um, but then there’s preparing all those monologues for my audition. I know which ones I’m doing and I’ve started working on them, um, and then I have an audition tomorrow, which I’m looking at, and then my class starts on Tuesday, for improv, and I sent an e-mail to those Shakespeare people that I thought I might audition with, and they’ve already cast everything but 

In [10]:
len(split_turns)

4

In [11]:
len(split_turns[0])

41

In [12]:
all_turns = [item for sublist in split_turns for item in sublist]
all_turns

['I’m ok. I’m feeling a little, ah, stressed because of everything coming up, um, but/ um, yeah, I, uh, my personal statement is like second draft form, and then I’m working on it, and it’s, it’s almost done. my friend Zach read it, and I think he made it more difficult for me to write it. just by giving me constructive criticism and being like, be more specific, y’know, as if the specificity is important, like they want to know exactly what’s going to set you apart from the other people, or highlight this, or make this, and I’m like,',
 'ok, alright/ (kind of laughing) yeah, but not that one, cause we already did that um, but then there’s preparing all those monologues for my audition. I know which ones I’m doing and I’ve started working on them, um, and then I have an audition tomorrow, which I’m looking at, and then my class starts on Tuesday, for improv, and I sent an e-mail to those Shakespeare people that I thought I might audition with, and they’ve already cast everything but th

In [13]:
len(all_turns)

153

In [14]:
# Generate fake labels array
length = len(all_turns)

# Generate a fake labels array
fake_labels = np.eye(6)[np.random.choice(6, length)]

In [15]:
fake_labels.shape

(153, 6)

## Tokenizer

In [16]:
max_length = 512

In [17]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', max_length=max_length, padding=True, truncation=True)

In [18]:
# Tokenize texts and map the tokens to their word IDs.
input_ids = []

for sent in all_turns:
    encoded_text = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    input_ids.append(encoded_text)

# Print sentence 0, now as a list of IDs.
print('Original: ', all_turns[0])
print('Token IDs:', input_ids[0])

Original:  I’m ok. I’m feeling a little, ah, stressed because of everything coming up, um, but/ um, yeah, I, uh, my personal statement is like second draft form, and then I’m working on it, and it’s, it’s almost done. my friend Zach read it, and I think he made it more difficult for me to write it. just by giving me constructive criticism and being like, be more specific, y’know, as if the specificity is important, like they want to know exactly what’s going to set you apart from the other people, or highlight this, or make this, and I’m like,
Token IDs: [101, 146, 100, 181, 14302, 119, 146, 100, 181, 61362, 169, 16745, 117, 69863, 117, 39608, 10336, 12373, 10108, 42536, 23959, 10741, 117, 10293, 117, 10473, 120, 10293, 117, 11023, 12257, 117, 146, 117, 189, 10237, 117, 15127, 14927, 33311, 10124, 11850, 11132, 28447, 12188, 117, 10111, 11059, 146, 100, 181, 14616, 10135, 10271, 117, 10111, 10271, 100, 187, 117, 10271, 100, 187, 17122, 20378, 119, 15127, 20104, 81345, 24944, 10271, 117

In [19]:
# Check max sentence length
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  180


In [20]:
# # Pad our input tokens
input_ids = keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=max_length, dtype="long", truncating="post", padding="post", value=0)

# Create attention masks
attention_masks = []

for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [21]:
# Make train/val split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, fake_labels, 
                                                            random_state=2018, test_size=0.1)
# Performing same steps on the attention masks
train_masks, validation_masks, _, _ = train_test_split(attention_masks, fake_labels,
                                             random_state=2018, test_size=0.1)



In [22]:
# Convert to tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create dataloaders
batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

## Keras model

In [23]:
# Keras BERT model
keras_model = model.BERTKeras(num_classes=6, hidden_size=768, dropout_prob=0.25)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'BertForSequenceClassification' object has no attribute 'layers'

In [78]:
keras_model.summary()

In [79]:
print(keras_model)

<BERTKeras name=bert_keras, built=False>


In [83]:
# Test run
result = keras_model(train_inputs)

TypeError: Exception encountered when calling TorchModuleWrapper.call().

[1mBertModel.forward() got an unexpected keyword argument 'attention_masks'[0m

Arguments received by TorchModuleWrapper.call():
  • args=('torch.Tensor(shape=torch.Size([137, 512]), dtype=int32)',)
  • kwargs={'attention_masks': 'None'}

In [27]:
result.shape

torch.Size([36, 256, 6])

In [28]:
# Layers are 'unbuilt' until they are called
# After calling the model, it looks like this
keras_model.summary()

## BERT direct model (torch)

In [23]:
# BERT model
bert_direct = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 6,   
    output_attentions = False, 
    output_hidden_states = False, )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
print(bert_direct)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [69]:
train_inputs[0].shape

torch.Size([512])

In [70]:
train_inputs[0]

tensor([   101,  10392,  12820,  18084,  10271,    118,    169,  11940,  10160,
           169,  10635,    146,  75980,  22094,    117,    146,  18957,  10978,
         10841,    146,  18957,  10978,  18084,  10105,  10172,  49183,  10271,
           100,    187,  11850,  11387,  11044,    146,  12820,  17367,  10114,
         38008,  18322,    117,  10111,    146,    100,    181,  19090,  10114,
         49619,    117,  10111,  13028,  21852,  10873,  10135,    146,  10944,
         27874,  10978,    120,    136,    136,  17367,  10230,  10105,  10172,
         49183,  13123,    117,    118,    118,    146,  12172,  20517,  15127,
         10172,  49183, 108361,  11357,  46791,    117,  10111,  15127,  26937,
         65390,    100,    188,  16683,  54214,  10978,  10189,    117,  10111,
         13028,  21852,    117,  10271,  19513,    169,  35723,    117,    146,
         10134,  36897,  12547,    117,  10271,  19513,  11152,  13028,  21852,
         11152,  22899,  35723,    117, 

In [71]:
bert_result = bert_direct(train_inputs)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


IndexError: index out of range in self

## Torch model

In [27]:
torch_model = model.BERTTorch(num_classes=6, hidden_size=768, dropout_prob=0.25)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
print(torch_model)

BERTTorch(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias

In [27]:
summary(torch_model, input_size=(512,))

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [29]:
torch_result = torch_model(train_inputs, attention_mask=train_masks)

In [30]:
torch_result.shape

torch.Size([137, 6])

In [31]:
torch_result

tensor([[-2.7646e-02, -2.6870e-02, -7.1476e-02, -1.0659e-02, -3.8957e-02,
         -1.0629e-02],
        [-2.9158e-02, -3.4776e-02, -6.7497e-02, -2.9946e-03, -5.5752e-02,
          4.5183e-03],
        [-1.9398e-02,  7.4220e-02, -2.2707e-02, -1.0409e-01,  8.0605e-03,
          4.1189e-03],
        [-1.8792e-02, -1.2668e-02, -3.6955e-02, -2.5059e-02, -3.1904e-02,
          6.8698e-03],
        [-3.1812e-02, -2.6017e-02, -7.0911e-02, -1.8237e-02, -5.4177e-02,
          4.1337e-03],
        [-3.4315e-02, -3.4276e-02, -7.7005e-02, -4.2269e-03, -5.5922e-02,
          1.1908e-03],
        [-2.4618e-02, -2.3792e-02, -6.9618e-02, -2.2924e-02, -6.4531e-02,
          7.3805e-03],
        [-5.2771e-03, -1.2209e-02, -2.2869e-02, -1.8103e-02, -2.9415e-02,
          6.8337e-03],
        [-4.3654e-02, -1.3179e-02, -7.4799e-02, -2.5338e-02, -4.6718e-02,
          1.4995e-03],
        [-2.9003e-02, -1.5181e-02, -8.3861e-02, -2.3866e-02, -5.4250e-02,
         -3.5599e-04],
        [-3.3104e-02, -2.7938e