In [17]:
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset

The dataset was generated using ChatGPT.

In [18]:
# Load the word-level labelled data from the CSV file
train_df = pd.read_csv(filepath_or_buffer="mountains_labeled.csv")

In [19]:
# Preview the first 20 rows of the table
train_df.head(n=20)

Unnamed: 0,word,Tag,sentence_num
0,I,O,1
1,have,O,1
2,visited,O,1
3,Mount,B-mount,1
4,Fuji,I-mount,1
5,recently.,O,1
6,Mount,B-mount,2
7,Kosciuszko,I-mount,2
8,and,O,2
9,Mount,B-mount,2


In [20]:
# Count the distinct sentence numbers: there are 500 sentences in the dataset
train_df["sentence_num"].unique().shape

(500,)

In [21]:
# Group the rows by sentence number and broadcast two per-sentence strings back onto
# every word row: the words joined with spaces, and the tags joined with commas.
train_df['sentence'] = train_df.groupby('sentence_num')['word'].transform(' '.join)
train_df['word_tags'] = train_df.groupby('sentence_num')['Tag'].transform(','.join)
train_df.head()

Unnamed: 0,word,Tag,sentence_num,sentence,word_tags
0,I,O,1,I have visited Mount Fuji recently.,"O,O,O,B-mount,I-mount,O"
1,have,O,1,I have visited Mount Fuji recently.,"O,O,O,B-mount,I-mount,O"
2,visited,O,1,I have visited Mount Fuji recently.,"O,O,O,B-mount,I-mount,O"
3,Mount,B-mount,1,I have visited Mount Fuji recently.,"O,O,O,B-mount,I-mount,O"
4,Fuji,I-mount,1,I have visited Mount Fuji recently.,"O,O,O,B-mount,I-mount,O"


In [22]:
# Tag -> index, in order of first appearance (the model works with indices, not tag strings)
label2id = {tag: idx for idx, tag in enumerate(train_df["Tag"].unique())}
# Index -> tag: the inverse of the mapping above
id2label = {idx: tag for tag, idx in label2id.items()}
label2id

{'O': 0, 'B-mount': 1, 'I-mount': 2}

In [23]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
# Fixed sequence length (in tokens) that every encoded example is padded or truncated to
max_length = 128

BERT relies on WordPiece tokenization, which means that labels must be defined at the wordpiece level rather than at the word level.

In [25]:
def tokenization(sentence, text_labels, tokenizer):
    """Split a sentence into wordpieces and repeat each word's label per piece.

    Args:
        sentence: whitespace-separated words.
        text_labels: comma-separated labels, paired with the words in order.
        tokenizer: object exposing ``tokenize(word)`` that returns a list of subwords.

    Returns:
        A ``(pieces, piece_labels)`` tuple of two lists of equal length.
    """
    pieces, piece_labels = [], []

    words = sentence.strip().split()
    tags = text_labels.split(",")

    for current_word, current_tag in zip(words, tags):
        # A single word may be broken into several wordpieces
        subwords = tokenizer.tokenize(current_word)
        pieces += subwords
        # Every wordpiece inherits the label of the word it came from
        piece_labels += [current_tag] * len(subwords)

    return pieces, piece_labels

Next, we define a regular PyTorch dataset class that transforms the rows of a dataframe into PyTorch tensors. For each sentence, it tokenizes the text, adds the special tokens that BERT expects, pads or truncates the tokens to the maximum length, creates the attention mask, and converts the labels to ids using the dictionary defined above.

In [26]:
class dataset(Dataset):
    """PyTorch dataset that turns labelled sentences into fixed-length BERT inputs.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    each a ``torch.long`` tensor of length ``max_len``.

    Relies on two names defined elsewhere in the notebook: the ``tokenization``
    helper and the ``label2id`` mapping.

    Args:
        dataframe: table with a ``sentence`` column (whitespace-separated words)
            and a ``word_tags`` column (comma-separated tags); one row per example.
        tokenizer: tokenizer passed to ``tokenization`` and used for
            ``convert_tokens_to_ids``.
        max_len: number of tokens every encoded example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Encode the example at position ``index`` into model-ready tensors."""
        # step 1: tokenize (and adapt corresponding labels)
        # .iloc keeps the lookup positional, so it also works when the frame's
        # index has not been reset to 0..n-1
        sentence = self.data.sentence.iloc[index]
        word_tags = self.data.word_tags.iloc[index]
        tokenized_sentence, labels = tokenization(sentence, word_tags, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        # Both special tokens get the outside label. Note that list.insert(-1, "O")
        # would put the label *before* the last element and shift the final word's
        # tag onto [SEP], so the list is rebuilt with "O" at both ends instead.
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # truncate, but keep the closing [SEP] (labelled "O") as the last token
            tokenized_sentence = tokenized_sentence[:maxlen - 1] + ["[SEP]"]
            labels = labels[:maxlen - 1] + ["O"]
        else:
            # pad; tokens and labels have the same length at this point
            n_pad = maxlen - len(tokenized_sentence)
            tokenized_sentence = tokenized_sentence + ["[PAD]"] * n_pad
            # NOTE: padded positions receive the "O" label id, not an ignore index
            labels = labels + ["O"] * n_pad

        # step 4: obtain the attention mask (1 for real tokens, 0 for padding)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids and labels to label ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(attn_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.len

In [27]:
# train_df holds one row per *word*, and every row of a sentence repeats the same
# `sentence` / `word_tags` values. Keep a single row per sentence so that each
# sentence appears in the training set exactly once instead of once per word.
training_set = dataset(
    train_df.drop_duplicates(subset="sentence_num").reset_index(drop=True),
    tokenizer,
    max_length,
)

In [28]:
# Inspect the first encoded example to check that its structure is appropriate
training_set[0]

{'input_ids': tensor([  101,  1045,  2031,  4716,  4057, 20933,  3728,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [29]:
# Save the encoded dataset *instance* (`training_set`), not the `dataset` class itself.
# NOTE(review): the object is pickled, so loading it presumably requires the `dataset`
# class to be defined in the loading session; newer PyTorch releases may also need
# `torch.load(..., weights_only=False)` — confirm against the installed version.
torch.save(training_set, 'training_set.pth')