In [1]:
import collections
import math
import os
import pdb
import re
import string
from pathlib import Path

from tqdm import tqdm
%matplotlib widget

In [2]:
def read_and_concatenate_txt_files(main_folder):
    """Read every ``.txt`` file below ``main_folder`` into a single string.

    The folder is walked recursively with ``os.walk``. Each file is decoded
    as UTF-8 and followed by one newline so that the last line of a file
    never runs into the first line of the next one.

    Args:
        main_folder: Path of the directory to search.

    Returns:
        The contents of all matching files joined together, in the order
        the files were discovered. An empty string if no file matches.
    """
    # Gather the paths up front so the progress bar knows the total.
    txt_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(main_folder)
        for filename in filenames
        if filename.endswith('.txt')
    ]

    chunks = []
    for txt_path in tqdm(txt_paths, desc="Reading .txt files"):
        with open(txt_path, 'r', encoding='utf-8') as handle:
            chunks.append(handle.read() + "\n")

    return "".join(chunks)

In [3]:
def clean_sentences(sentences):
    """Turn raw sentences into lists of lowercase alphabetic tokens.

    ASCII punctuation (``string.punctuation``) is removed from each
    sentence, the remainder is split on whitespace, and only purely
    alphabetic words are kept, lowercased. Sentences that end up with no
    tokens are dropped.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one list of tokens per non-empty sentence.
    """
    # Build the translation table once instead of once per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for sentence in sentences:
        words = sentence.translate(strip_punctuation).split()
        tokens = [word.lower() for word in words if word.isalpha()]
        if tokens:
            cleaned.append(tokens)
    return cleaned

In [4]:
def split_into_sentences(text):
    """Split ``text`` into sentences at whitespace that follows an end mark.

    A split happens on a single whitespace character that comes right after
    ``.``, ``?``, ``!`` or a newline, unless the preceding characters look
    like an abbreviation (for example ``e.g.`` or ``Mr.``).

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings; it can contain empty strings when
        several boundaries are adjacent.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|\n)\s'
    return re.split(boundary, text)

In [5]:
# Vocabulary building function
def build_vocab(texts, vocab_size=30522):
    """Build token<->id lookup tables from an iterable of texts.

    Each text is lowercased and split into runs of word characters and
    single punctuation characters. The ``vocab_size - 4`` most frequent
    tokens receive ids starting at 4; ids 0-3 are reserved for the special
    tokens ``[PAD]``, ``[UNK]``, ``[CLS]`` and ``[SEP]``.

    Args:
        texts: Iterable of strings to count tokens in.
        vocab_size: Upper bound on the vocabulary size, special tokens
            included.

    Returns:
        A ``(vocab, inv_vocab)`` tuple: ``vocab`` maps token -> id and
        ``inv_vocab`` maps id -> token.
    """
    frequencies = collections.Counter()
    for text in texts:
        frequencies.update(re.findall(r'\w+|[^\w\s]', text.lower()))

    # Ids 0-3 belong to the special tokens, so corpus tokens start at 4.
    ranked = frequencies.most_common(vocab_size - 4)
    vocab = {token: index for index, (token, _) in enumerate(ranked, start=4)}
    vocab.update({'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3})

    inv_vocab = dict(zip(vocab.values(), vocab.keys()))
    return vocab, inv_vocab


In [11]:
def tokenize(text, vocab):
    """Split ``text`` into vocabulary pieces.

    The text is lowercased and cut into runs of word characters and single
    punctuation characters; each of those is then broken into pieces by
    ``wordpiece_tokenize``.

    Args:
        text: The string to tokenize.
        vocab: Mapping (or set) of known tokens.

    Returns:
        A flat list of token strings.
    """
    raw_tokens = re.findall(r'\w+|[^\w\s]', text.lower())
    return [
        piece
        for raw_token in raw_tokens
        for piece in wordpiece_tokenize(raw_token, vocab)
    ]

In [12]:
# NOTE(review): this redefines split_into_sentences with the same regular
# expression as the definition in an earlier cell; once this cell runs, this
# definition is the one in effect.
def split_into_sentences(text):
    """Split ``text`` on whitespace that follows ``.``, ``?``, ``!`` or a newline.

    Whitespace preceded by an abbreviation-like pattern (such as ``e.g.``
    or ``Mr.``) is not treated as a boundary.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings, possibly including empty strings.
    """
    splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|\n)\s')
    return splitter.split(text)

In [13]:
def encode(text, vocab, max_length=128):
    """Encode ``text`` as a fixed-length list of token ids.

    The text is tokenized with ``tokenize``, wrapped in ``[CLS]`` ... ``[SEP]``,
    mapped to ids (unknown tokens become the ``[UNK]`` id), and then padded
    with the ``[PAD]`` id up to ``max_length``.

    Over-long inputs are truncated *before* the special tokens are added,
    so the sequence still ends with ``[SEP]`` (when ``max_length >= 2``)
    instead of having it sliced off.

    Args:
        text: The string to encode.
        vocab: Mapping token -> id; must contain ``[UNK]``, and ``[PAD]``
            whenever padding is needed.
        max_length: Length of the returned list.

    Returns:
        A list of ``max_length`` integer token ids.
    """
    tokens = tokenize(text, vocab)

    # Reserve two positions for [CLS] and [SEP] so truncation never drops them.
    room_for_tokens = max(max_length - 2, 0)
    tokens = ['[CLS]'] + tokens[:room_for_tokens] + ['[SEP]']

    unk_id = vocab['[UNK]']
    token_ids = [vocab.get(token, unk_id) for token in tokens]

    missing = max_length - len(token_ids)
    if missing > 0:
        token_ids += [vocab['[PAD]']] * missing

    # Only has an effect when max_length < 2 (no room for both special tokens).
    return token_ids[:max_length]

In [17]:
# Decoding function
def decode(token_ids, inv_vocab):
    """Convert token ids back into a space-separated string.

    Args:
        token_ids: Iterable of integer token ids.
        inv_vocab: Mapping id -> token.

    Returns:
        The tokens joined by single spaces; ids missing from ``inv_vocab``
        are rendered as ``[UNK]``. Padding and other special tokens are
        kept in the output.
    """
    return ' '.join(inv_vocab.get(token_id, '[UNK]') for token_id in token_ids)

In [19]:
def wordpiece_tokenize(word, vocab):
    """Greedily split ``word`` into the longest pieces found in ``vocab``.

    Starting at the beginning of the word, the longest prefix present in
    ``vocab`` is emitted and the search continues on the remainder. When no
    prefix of the remaining text is in ``vocab``, a single ``[UNK]`` is
    emitted for that remainder and the split stops.

    The search is iterative, so a long word that breaks into many pieces
    cannot exhaust Python's recursion limit.

    Args:
        word: The token to split.
        vocab: Container of known tokens (anything supporting ``in``).

    Returns:
        A non-empty list of pieces; ``[word]`` when the word itself is in
        ``vocab``.
    """
    if word in vocab:
        return [word]

    pieces = []
    start = 0
    end_of_word = len(word)
    while True:
        # Try the longest candidate first, shrinking from the right.
        for end in range(end_of_word, start, -1):
            candidate = word[start:end]
            if candidate in vocab:
                pieces.append(candidate)
                start = end
                break
        else:
            # Nothing matched (this also covers an empty word): give up here.
            pieces.append('[UNK]')
            return pieces

        if start >= end_of_word:
            return pieces

## Reading and preparing data

In [22]:
# Root folder that is searched recursively for .txt files.
main_folder = 'Data'

In [24]:
# Load every .txt file under main_folder into one newline-separated string.
text = read_and_concatenate_txt_files(main_folder)

Reading .txt files: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]


In [25]:
# Peek at the first 100 characters of the corpus.
text[0:100]

'\nApril\n\nApril (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and come'

In [26]:
# Split the corpus into sentence strings (the result can contain empty strings).
sentences = split_into_sentences(text)

In [27]:
# Preview the first ten sentences.
sentences[:10]

['\nApril\n',
 'April (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May.',
 'It is one of four months to have 30 days.',
 '',
 'April always begins on the same day of the week as July, and additionally, January in leap years.',
 'April always ends on the same day of the week as December.',
 '',
 'April comes between March and May, making it the fourth month of the year.',
 'It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.',
 '']

## Creating Vocabulary

Vocab is a dictionary where each word is associated with a numerical index, and vice versa.

In [31]:
# Build the token -> id and id -> token tables using the default vocab_size (30522).
vocab, inv_vocab = build_vocab(sentences)

### Clarification of BERT-style encoding

The embedding layer in a neural network, including in the BERT model, does not interpret the numerical values of the token IDs as indicating any inherent ordering or value. Instead, it treats each unique token ID as an index into the embedding matrix, which is a learned parameter of the model.
#### How embedding layer works:

Token ID as indices: 
1. Each token ID is used as an index to look up the corresponding row in the embedding matrix.
2. The embedding matrix has a shape of **(vocab_size, hidden_size)**, where each row represents the learned embedding of a specific token. 
3. The numerical value of the token ID does not influence the embedding itself; it is merely a pointer to a specific row in the matrix.

Learning Embeddings:

1. The embedding matrix is initialized randomly (or with some pre-defined initialization strategy) at the beginning of training.
2. During training, the embeddings are updated based on the backpropagation of the loss.
3. The position of a token ID in the embedding matrix (whether it is higher or lower) does not imply any hierarchical value.

## Encoding the sentences

In [34]:
# Encode each sentence as a list of token ids with the default max_length of 128.
encoded_sentences = [encode(sentence, vocab) for sentence in sentences]

In [35]:
# Map each id list back to a space-joined token string (special tokens included).
decoded_sentences = [decode(encoded_sentence, inv_vocab) for encoded_sentence in encoded_sentences]

# BERT Model:
## Embedding layer

In [37]:
import torch
import torch.nn as nn

In [38]:
class BertEmbeddings(nn.Module):
    """Input embedding block: token + position + segment embeddings.

    The three embeddings are summed, layer-normalized and passed through
    dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        hidden_size: Size of every embedding vector.
        max_position_embeddings: Number of rows in the position embedding
            table; input sequences must not be longer than this.
        type_vocab_size: Number of rows in the token-type (segment)
            embedding table.
        dropout_prob: Dropout probability applied to the normalized sum.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size,
                 dropout_prob=0.1):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Compute the embeddings for a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids, expected shape
                (batch, seq_length).
            token_type_ids: Integer tensor with the same shape as
                ``input_ids``. When omitted, every token is assigned to
                segment 0.

        Returns:
            Tensor of shape (batch, seq_length, hidden_size).
        """
        if token_type_ids is None:
            # Single-segment input: every token belongs to segment 0.
            token_type_ids = torch.zeros_like(input_ids)

        seq_length = input_ids.size(1)
        # Positions 0..seq_length-1, repeated for every row of the batch.
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        word_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = word_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
    

#### Code Explanation
1. `__init__` is the constructor method that initializes the BertEmbeddings object.
2. `super(BertEmbeddings, self).__init__()` calls the constructor of the parent class (nn.Module).
3. self.word_embeddings: An embedding layer for token IDs. vocab_size is the number of unique tokens, and hidden_size is the size of each embedding vector.
4. self.position_embeddings: An embedding layer for position IDs. max_position_embeddings is the maximum sequence length, and hidden_size is the size of each embedding vector.
5. self.token_type_embeddings: An embedding layer for token type IDs. type_vocab_size is the number of token types (e.g., 2 for distinguishing between two sentences), and hidden_size is the size of each embedding vector.
6. self.layer_norm: A layer normalization layer that normalizes the embeddings.
7. self.dropout: A dropout layer that applies dropout regularization with a dropout rate of 0.1.

### Example of use for BERT type embedding

To run the BERT-style embedding we need the following parameters:
1. vocab_size: The size of the vocabulary, which is the number of unique tokens in the vocabulary. It is set to len(vocab), where vocab is the dictionary mapping tokens to their corresponding IDs.
2. hidden_size: The size of the hidden layers in the BERT model. For BERT base models, this is typically 768.
3. max_position_embeddings: The maximum sequence length that the model can handle. For BERT base models, this is usually 512.
4. type_vocab_size: The number of different token types. BERT uses token type embeddings to distinguish between different segments (e.g., sentences). The value 2 is used because BERT distinguishes between two segments (segment A and segment B).

In [42]:
# Hyperparameters for the BertEmbeddings example below.
vocab_size = len(vocab)  # one embedding row per vocabulary entry
hidden_size = 768  # size of each embedding vector
max_position_embeddings = 512  # number of rows in the position embedding table
type_vocab_size = 2  # number of token types (segments)

The next line creates an instance of the BertEmbeddings class with the specified parameters. This class includes word embeddings, position embeddings, and token type embeddings, as well as layer normalization and dropout layers.

In [44]:
# Instantiate the embedding module with the hyperparameters defined above.
model = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings, type_vocab_size)

The model uses **`encoded_sentences`**, a list in which each inner list contains the token IDs for one sentence. **`torch.tensor(encoded_sentences)`** converts the list of encoded sentences into a PyTorch tensor, which can be used as input to the model.

Then, the next line uses **encoded_sentences**, a list of lists in which each inner list contains the token IDs for one sentence, and the call **torch.tensor(encoded_sentences)** converts the list of encoded sentences into a PyTorch tensor, which can be used as input to the model.

The line `token_type_ids = torch.zeros_like(input_ids)` uses **torch.zeros_like(input_ids)** to create a tensor of zeros with the same shape as **input_ids**.

In [47]:
#input_ids = torch.tensor(encoded_sentences)

In [48]:
#token_type_ids = torch.zeros_like(input_ids)

In [49]:
#BERT_embeddings = model(input_ids, token_type_ids)

In [68]:
batch_size = 400  # number of encoded sentences sent through the model at once
tensor_embeddings_list = []  # collects one embeddings tensor per batch

In [64]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [66]:
# Move the model's parameters to the selected device.
model.to(device)

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# Inference only: switch dropout off so the embeddings are deterministic.
model.eval()
# Start from an empty list so re-running this cell does not append duplicates.
tensor_embeddings_list = []

# NOTE(review): every batch is kept in memory as a
# (batch, max_length, hidden_size) float tensor -- confirm that the whole
# corpus fits in RAM before running this on the full data set.
with torch.no_grad():  # no autograd graph is needed just to read embeddings
    for i in tqdm(range(0, len(encoded_sentences), batch_size), desc="Processing batches", unit="batch"):
        batch_encoded_sentences = encoded_sentences[i:i + batch_size]
        # Build the inputs on the model's device; CPU inputs fail against a CUDA model.
        input_ids = torch.tensor(batch_encoded_sentences, dtype=torch.long, device=device)
        token_type_ids = torch.zeros_like(input_ids)  # Assume all tokens belong to the same segment

        # Get embeddings
        embeddings = model(input_ids, token_type_ids)
        # Keep results on the CPU so accelerator memory is released after each batch.
        tensor_embeddings_list.append(embeddings.cpu())

Processing batches:   0%|          | 5/7455 [00:05<2:34:41,  1.25s/batch]

In [None]:
# Concatenate the per-batch tensors along the batch dimension (dim 0).
final_embeddings = torch.cat(tensor_embeddings_list, dim=0)