# Tryout bert embedding

In [8]:
import torch.nn as nn
import torch
# import os
from pathlib import Path

In [9]:
# combine both negative and pos in train and test to train
dataSet = {}
dataSet['train'] = []
dataSet['test'] = []
for p in Path('data/text_classification/train/pos/').glob('*.txt'): # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    dataSet['train'].append((entry, 1))
for p in Path('data/text_classification/train/neg/').glob('*.txt'): # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    dataSet['train'].append((entry, 0))

for p in Path('data/text_classification/test/pos/').glob('*.txt'): # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    dataSet['test'].append((entry, 1))
for p in Path('data/text_classification/test/neg/').glob('*.txt'): # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    dataSet['test'].append((entry, 0))


print(dataSet['train'][0])

('Greatly enjoyed this 1945 mystery thriller film about a young woman, Nina Foch,(Julia Ross) who is out of work and has fallen behind in her rent and is desperate to find work. Julia reads an ad in the local London newspaper looking for a secretary and rushes out to try and obtain this position. Julia obtains the position and is hired by a Mrs. Hughes, (Dame May Witty) who requires that she lives with her employer in her home and wants her to have no involvement with men friends and Julia tells them she has no family and is free to devote her entire time to this job. George Macready, (Ralph Hughes) is the son of Mrs. Hughes and has some very strange desires for playing around with knives. This was a low budget film and most of the scenes were close ups in order to avoid the expense of a background and costs for scenery. This strange family all live in a huge mansion off the Cornwall Coast of England and there is secret doors and plenty of suspense.', 1)


In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

In [11]:
# with bert we don't need to split the sentences. But do remove the html
cleanedTrainData = []
CLEANHTML = re.compile('<.*?>')

for entry in dataSet['train']:
    # ! Hope the data doesn't contain heavy html tags or else it wouldn't work
    text = re.sub(CLEANHTML, '', entry[0])
    cleanedTrainData.append((text, entry[1]))

In [12]:
# try print out a cleaned text
for i in range(0, 3):
    print(cleanedTrainData[i][0])

Greatly enjoyed this 1945 mystery thriller film about a young woman, Nina Foch,(Julia Ross) who is out of work and has fallen behind in her rent and is desperate to find work. Julia reads an ad in the local London newspaper looking for a secretary and rushes out to try and obtain this position. Julia obtains the position and is hired by a Mrs. Hughes, (Dame May Witty) who requires that she lives with her employer in her home and wants her to have no involvement with men friends and Julia tells them she has no family and is free to devote her entire time to this job. George Macready, (Ralph Hughes) is the son of Mrs. Hughes and has some very strange desires for playing around with knives. This was a low budget film and most of the scenes were close ups in order to avoid the expense of a background and costs for scenery. This strange family all live in a huge mansion off the Cornwall Coast of England and there is secret doors and plenty of suspense.
When this movie first came out back in

In [13]:
from transformers import BertTokenizer, BertModel

In [14]:
# from transformers import AutoTokenizer

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']
[2478, 1037, 10938, 2121, 2897, 2003, 3722]


In [16]:
# Add special required formatting for BERT
# maxSentenceLen = 0
# for text, label in cleanedTrainData:
#     input_ids = tokenizer.encode(text, add_special_tokens=True)
# 
#     maxSentenceLen = max(maxSentenceLen, len(input_ids))
# 
# print(f'The max sentences length is: {maxSentenceLen}')

# ! BERT only take 512 max

In [17]:
# test truncation methods [cut at the end -> shown to give better result]
# ? https://stackoverflow.com/questions/58636587/how-to-use-bert-for-long-text-classification?noredirect=1&lq=1
# ? https://github.com/huggingface/transformers/issues/4476#issuecomment-951445067
def encode_right_truncated(tokenizer, text, padding='max_length', max_length=512, add_special_tokens=True):
    out = tokenizer(text, padding=padding, max_length=max_length, add_special_tokens=add_special_tokens)
    tokenized = out['input_ids']
    print(f"Length of the inputID: {len(tokenized)}, length of token type: {len(out['token_type_ids'])}, length of mask: {len(out['attention_mask'])}")
    print(f"type: {out['token_type_ids']}")
    print(f"type: {out['attention_mask']}")
    if not add_special_tokens:
        truncated = tokenized[-max_length:]
    else:
        truncated = tokenized[0:1] + tokenized[-(max_length-1):]    # keep special start and end symbol
    
    return truncated

In [18]:
# Keeping only 512 word at the end
count = 0

for text, label in cleanedTrainData:
    input_ids = encode_right_truncated(tokenizer, text)
    print(input_ids)
    break
# print(f'The max sentences length is: {maxSentenceLen}')

Length of the inputID: 512, length of token type: 512, length of mask: 512
type: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Create input tensor

In [19]:
# unpacking
wow = {
    'a': 1,
    'b': 20
}

def func (a, b):
    print(a, b)

# func(**{'a': 1, 'b': 2}, **{'c':4, 'd':5})

In [20]:
# LOOKING GOOD, now we can reuse the previous loop to create the input tensor

def encode_right_truncated(tokenizer, text, padding='max_length', max_length=512, add_special_tokens=True):
    out = tokenizer(text, padding=padding, max_length=max_length, add_special_tokens=add_special_tokens)
    tokenized = out['input_ids']
    tokenType = out['token_type_ids']
    attention = out['attention_mask']

    if not add_special_tokens:
        truncated = {
            'input_ids': tokenized[-max_length:],
            'token_type_ids': tokenType[-max_length:],
            'attention_mask': attention[-max_length:],
        }
    else:
        truncated = {
            'input_ids': tokenized[0:1] + tokenized[-(max_length-1):],    # keep special start and end symbol
            'token_type_ids': tokenType[0:1] + tokenType[-(max_length-1):],
            'attention_mask': attention[0:1] + attention[-(max_length-1):],
        }
    
    return truncated

In [21]:
from torch.utils.data import TensorDataset

testText = cleanedTrainData[0][0]
test = encode_right_truncated(tokenizer, testText)

inputIDTensor = torch.tensor(test['input_ids'])
attentionTensor = torch.tensor(test['attention_mask'])
labelTensor = torch.tensor(cleanedTrainData[0][1])

print(f'shape of inputID: {inputIDTensor.shape} and shape of attention: {attentionTensor.shape} and label shape: {labelTensor.shape}')

shape of inputID: torch.Size([512]) and shape of attention: torch.Size([512]) and label shape: torch.Size([])


Input of RNN should be tensor of (batch size, max length for the whole text, dimesion of the vocab vector [the size of the vector use to represent the vocab]) 
### in Bert, the shape is (batch, 512, 768) -> 768 is the size of each vocab vector in BERT

# Testing out putting test_data into Tensor

In [22]:
processedData = {
    'train': [],
    'test': []
}

# * 1 is for positive, 0 is negative
print('loading data....')
dataPath = 'data/test_data'

for p in Path(dataPath, 'train', 'pos').glob('*.txt'):  # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    processedData['train'].append((entry, 1))
for p in Path(dataPath, 'train', 'neg').glob('*.txt'):  # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    processedData['train'].append((entry, 0))

for p in Path(dataPath, 'test', 'pos').glob('*.txt'):  # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    processedData['test'].append((entry, 1))
for p in Path(dataPath, 'test', 'neg').glob('*.txt'):  # 1 is pos and 0 is neg
    entry = p.read_text(encoding='utf8')
    processedData['test'].append((entry, 0))

# Clean the training data
cleanedData = []
CLEANHTML = re.compile('<.*?>')

for entry in processedData['train']:
    # ! Hope the data doesn't contain heavy html tags or else it wouldn't work
    text = re.sub(CLEANHTML, '', entry[0])
    cleanedData.append((text, entry[1]))


processedData['train'] = cleanedData
print('Some output: ')
print(processedData['train'][0:2])

loading data....
Some output: 
[('"All the world\'s a stage and its people actors in it"--or something like that. Who the hell said that theatre stopped at the orchestra pit--or even at the theatre door? Why is not the audience participants in the theatrical experience, including the story itself?This film was a grand experiment that said: "Hey! the story is you and it needs more than your attention, it needs your active participation". "Sometimes we bring the story to you, sometimes you have to go to the story."Alas no one listened, but that does not mean it should not have been said.', 1), ("FUTZ is the only show preserved from the experimental theatre movement in New York in the 1960s (the origins of Off Off Broadway). Though it's not for everyone, it is a genuinely brilliant, darkly funny, even more often deeply disturbing tale about love, sex, personal liberty, and revenge, a serious morality tale even more relevant now in a time when Congress wants to outlaw gay marriage by trash

# BERT Embedding is just one step to get the embedded vector before putting into the model
Bert vector already been trained to have word related to each other to be close in their space

In [23]:
# Testing out converting data into tensor shape
inputData = {
    'train': {
        'X': [],
        'y': []
    },
    'test': {
        'X': [],
        'y': []
    }
}
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BERT_MAX_LENGTH = 512

for i, (text, label) in enumerate(processedData['train']):
    out = tokenizer(text, padding='max_length',
                    max_length=BERT_MAX_LENGTH, add_special_tokens=True)
    tokenized = out['input_ids']
    tokenType = out['token_type_ids']
    attention = out['attention_mask']
    truncated = {
        # keep special start and end symbol
        'input_ids': tokenized[0:1] + tokenized[-(BERT_MAX_LENGTH-1):],
        'token_type_ids': tokenType[0:1] + tokenType[-(BERT_MAX_LENGTH-1):],
        'attention_mask': attention[0:1] + attention[-(BERT_MAX_LENGTH-1):],
    }
    inputData['train']['X'].append(truncated)
    inputData['train']['y'].append(label)

for i, (text, label) in enumerate(processedData['test']):
    out = tokenizer(text, padding='max_length',
                    max_length=BERT_MAX_LENGTH, add_special_tokens=True)
    tokenized = out['input_ids']
    tokenType = out['token_type_ids']
    attention = out['attention_mask']
    truncated = {
        # keep special start and end symbol
        'input_ids': tokenized[0:1] + tokenized[-(BERT_MAX_LENGTH-1):],
        'token_type_ids': tokenType[0:1] + tokenType[-(BERT_MAX_LENGTH-1):],
        'attention_mask': attention[0:1] + attention[-(BERT_MAX_LENGTH-1):],
    }
    inputData['test']['X'].append(truncated)
    inputData['test']['y'].append(label)


In [24]:
print(f"Test data len: {len(inputData['test']['X'])}, test label len: {len(inputData['test']['y'])}")

Test data len: 10, test label len: 10


In [25]:
# BERT is trained and expect sentence pairs, so we need to number each tensor to belong to a text
segments_ids = [[x] * 512 for x in range(0, len(inputData['train']['X']))]
segments_tensor = torch.tensor(segments_ids)
print(segments_tensor.shape)
segments_tensor

torch.Size([10, 512])


tensor([[0, 0, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [2, 2, 2,  ..., 2, 2, 2],
        ...,
        [7, 7, 7,  ..., 7, 7, 7],
        [8, 8, 8,  ..., 8, 8, 8],
        [9, 9, 9,  ..., 9, 9, 9]])

In [26]:
######## Extracting embeddings

# Convert inputs to pytorch tensor
tokens_list = []
for each in inputData['train']['X']:
    tokens_list.append(each['input_ids'])
tokens_tensor = torch.tensor(tokens_list)

tokens_tensor.shape

torch.Size([10, 512])

In [27]:
# Load the BERT embedding model
bertModel = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
bertModel.eval()    # Only wants to use the bert model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [28]:
# Now get the Bert hidden state vector (high quality) from Bert output so we can use in our own model
with torch.no_grad():
    outputs = bertModel(tokens_tensor, segments_tensor)

    hidden_states = outputs[2]

In [29]:
hidden_states

(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [ 0.5467,  0.3301, -0.9227,  ...,  0.9150,  0.8351, -0.2478],
          [-0.8933,  0.4482, -0.2173,  ...,  0.8924,  0.5684, -0.6553],
          ...,
          [ 0.6458, -0.5409, -0.1780,  ..., -0.0482, -0.3466, -0.4825],
          [ 0.7417, -0.7271,  0.3278,  ..., -0.2011, -0.6038, -0.4935],
          [ 0.2992, -1.0338,  0.1294,  ...,  0.2149,  0.2113, -1.5097]],
 
         [[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [ 0.5402, -0.0159, -0.3557,  ..., -0.3442,  0.2522, -0.2073],
          [-1.7013, -0.3745, -0.0449,  ...,  0.8165,  1.0988,  0.6134],
          ...,
          [ 0.6458, -0.5409, -0.1780,  ..., -0.0482, -0.3466, -0.4825],
          [ 0.7417, -0.7271,  0.3278,  ..., -0.2011, -0.6038, -0.4935],
          [ 0.2992, -1.0338,  0.1294,  ...,  0.2149,  0.2113, -1.5097]],
 
         [[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [-0.3406,  0.7025,

In [30]:
len(hidden_states)

13

In [31]:
print(f'Number of layers: {len(hidden_states)}')
print(f'Number of batches: {len(hidden_states[0])}')
print(f'Number of tokens: {len(hidden_states[0][0])}')
print(f'Number of hidden units: {len(hidden_states[0][0][0])}')

Number of layers: 13
Number of batches: 10
Number of tokens: 512
Number of hidden units: 768


In [43]:
token_layers = torch.stack(hidden_states, dim=0)[-1]
token_layers.shape

torch.Size([10, 512, 768])

In [33]:
# # Since the dimension is in form [# layers, # batches, # tokens, # features]
# # we need it in form [#batches, # tokens, # layers, # features]
# token_embedding = token_layers.permute(1, 2, 0, 3)
# token_embedding.shape

torch.Size([10, 512, 13, 768])

In [46]:
# Cut the layer
layer = nn.RNN(input_size=768, hidden_size=512, batch_first=True)
out, _ = layer(token_layers)

print(out)

tensor([[[-0.1050,  0.1815,  0.2432,  ...,  0.5991,  0.0243, -0.7542],
         [-0.4050, -0.1798,  0.3960,  ...,  0.4193,  0.2160, -0.8387],
         [-0.3172, -0.1309,  0.5166,  ...,  0.2522,  0.2447, -0.8420],
         ...,
         [-0.3482, -0.1776,  0.4865,  ...,  0.4958,  0.3516, -0.8504],
         [-0.3394, -0.1610,  0.5153,  ...,  0.5064,  0.3704, -0.8461],
         [-0.2966, -0.1643,  0.5163,  ...,  0.4877,  0.3637, -0.8458]],

        [[-0.1877, -0.0602, -0.0828,  ...,  0.4321,  0.2986, -0.7475],
         [-0.4315,  0.3184,  0.5249,  ..., -0.6782, -0.5140, -0.4113],
         [-0.1051,  0.6852,  0.5233,  ..., -0.5658,  0.1412, -0.0932],
         ...,
         [-0.1745, -0.2217, -0.5206,  ...,  0.4689,  0.6779, -0.4577],
         [-0.1104, -0.1711, -0.3864,  ...,  0.4355,  0.6333, -0.4135],
         [-0.4502,  0.0389, -0.6640,  ...,  0.1800,  0.7179, -0.4928]],

        [[-0.1121,  0.2323,  0.1242,  ...,  0.4887, -0.0598, -0.6045],
         [-0.4099, -0.0529,  0.2986,  ...,  0