# Trials with BERT

Our objective is to **tokenize** the tweets with the BERT tokenizer so that they can be fed to our neural network, **DNN_BERT**.

In [1]:
import csv
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from tqdm import tqdm
import torch.nn as nn
from collections import OrderedDict
import matplotlib.pyplot as plt

In [2]:
# Imports from local scripts:
import sys
# insert at 1 the script path:
sys.path.insert(1, 'Scripts')
from Training_Accuracy_functions import accuracy, train_epoch

#### Using GPU

In [3]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice so the cell output records which device was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('Using CPU.')

There are 1 GPU(s) available.


### Importing tweets

In [4]:
# Read the training set and the evaluation (test) set from their CSV files.
train_data, eval_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/train.csv", "../Data/evaluation.csv")
)

### Initializing Bert tokenizer

In [5]:
# Load the tokenizer of the pretrained 'bert-base-uncased' checkpoint.
# do_lower_case=True asks it to lowercase the text before splitting it into tokens
# (the example output below shows all-lowercase tokens).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


#### Example of Tokenization

In [6]:
text = "I love Ronan Sangouard"

# Split the sentence into tokens once; both printouts below reuse the result.
tokens = tokenizer.tokenize(text)

# Show the raw sentence, its tokens, and the vocabulary id of each token.
print(' Original: ', text)
print('Tokenized: ', tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokens))

 Original:  I love Ronan Sangouard
Tokenized:  ['i', 'love', 'ronan', 'sang', '##ou', '##ard']
Token IDs:  [1045, 2293, 18633, 6369, 7140, 4232]


### Tokenize the data set:

In [7]:
def ComputeMaxlen(data, tokenizer):
    """Return the length, in tokens, of the longest text in ``data``.

    Every entry of ``data['text']`` is encoded with
    ``tokenizer.encode(..., add_special_tokens=True)`` and the encoded length
    is measured, so the count includes the special tokens the tokenizer adds.

    Args:
        data: DataFrame with a ``'text'`` column.
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            that returns a sequence of token ids.

    Returns:
        The largest encoded length, or 0 when ``data`` has no rows.
    """
    # Walk the column directly instead of `data.iterrows()`: iterrows builds a
    # full Series for every row, which is wasted work when one column is read.
    # A Series also has a length, so tqdm can report progress against a total.
    texts = data['text']

    return max(
        (len(tokenizer.encode(text, add_special_tokens=True)) for text in tqdm(texts)),
        default=0,
    )


In [8]:
# Full tokenization pass over the training set, used only to report the longest tweet.
# The value is printed, not stored: TokenizeData below pads to its own max_length (512).
print('Max sentence length: ', ComputeMaxlen(train_data, tokenizer))

665777it [04:20, 2551.39it/s]

Max sentence length:  425





In [9]:
# Tokenize all of the sentences and map the tokens to their word IDs.
def TokenizeData(data, tokenizer, max_length=512):
    """Encode every text in ``data`` as a fixed-length row of token ids.

    Args:
        data: DataFrame with a ``'text'`` column.
        tokenizer: Tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_length: Length every encoded text is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two lists with one entry
        per row of ``data``: the values ``encode_plus`` returned under the
        ``'input_ids'`` and ``'attention_mask'`` keys respectively.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the column itself; `iterrows()` would build a Series per row.
    for text in tqdm(data['text']):
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`.
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
            text,                          # Sentence to encode.
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
            padding='max_length',          # Pad short sentences up to `max_length`.
            # Truncation has to be requested explicitly: `padding` alone only pads,
            # so an over-long text would otherwise produce a longer row and the
            # rows could no longer be concatenated into one tensor.
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,    # Construct attn. masks.
            return_tensors='pt',           # Return pytorch tensors.
        )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    return input_ids, attention_masks



In [10]:
# Tokenize the training tweets. Only the token IDs (first return value) are kept;
# the attention masks returned alongside them are discarded.
input_ids = TokenizeData(train_data, tokenizer)[0]

665777it [05:33, 1995.32it/s]


In [11]:
# Convert the list of per-tweet tensors into one float tensor and move it to the
# `device` selected at the top of the notebook, so the cell also runs on a CPU-only
# machine instead of failing on a hard-coded `.cuda()`.
# NOTE: this rebinds `input_ids` from a list to a tensor; re-run the tokenization
# cell before re-running this one.
input_ids = torch.cat(input_ids).float().to(device)
labels = torch.tensor(train_data['retweet_count'])

# Print sentence 32, now as a list of IDs.
print('Original: ', train_data['text'][32])
print('Token IDs:', input_ids[32])

Original:  Our new reality. Social distancing. #COVID19 #SocialDistance https://t.co/upR0Z7X0a2
Token IDs: tensor([  101.,  2256.,  2047.,  4507.,  1012.,  2591.,  4487., 12693.,  6129.,
         1012.,  1001.,  2522., 17258., 16147.,  1001.,  2591., 10521., 26897.,
        16770.,  1024.,  1013.,  1013.,  1056.,  1012.,  2522.,  1013.,  2039.,
         2099.,  2692.,  2480.,  2581.,  2595.,  2692.,  2050.,  2475.,   102.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0

#### On test data

In [12]:
# Same length check on the evaluation set: a full tokenization pass whose result is
# only printed (TokenizeData pads to its own max_length of 512).
print('Max sentence length: ', ComputeMaxlen(eval_data, tokenizer))

285334it [01:52, 2543.14it/s]

Max sentence length:  416





In [13]:
# Tokenize the evaluation tweets and keep only the token IDs (first return value).
# Index the result rather than unpacking into `_`: in IPython/Jupyter `_` holds the
# last cell output, and assigning to it stops that variable from being updated.
# This also matches how the training set is tokenized above.
input_ids_test = TokenizeData(eval_data, tokenizer)[0]

285334it [02:24, 1981.14it/s]


In [14]:
# Convert the lists into tensors.
# Use the `device` chosen at the top of the notebook rather than a hard-coded
# `.cuda()`, so the CPU fallback selected there is honoured.
input_ids_test = torch.cat(input_ids_test).float().to(device)


In [15]:
# Sanity check: show evaluation tweet 32 next to its encoded row of IDs.
print('Original: ', eval_data.loc[32, 'text'])
print('Token IDs:', input_ids_test[32])

Original:  Also in this poll, only 52% said they wear masks inside public spaces 😷 not great
Token IDs: tensor([  101.,  2036.,  1999.,  2023.,  8554.,  1010.,  2069.,  4720.,  1003.,
         2056.,  2027.,  4929., 15806.,  2503.,  2270.,  7258.,   100.,  2025.,
         2307.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0., 

### Loading features

##### Training features

In [16]:
# Load the processed training features (presumably written by the preprocessing step).
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
features_tr = pd.read_pickle("../Preprocessing/Data/train_processed.pkl")

In [17]:
### Adding them to tensor
# Select the ten metadata columns, in the order they are written into each row.
features_arr = np.array(features_tr[[
    'user_statuses_count', 'hashtag_count', 'user_mentions_count',
    'user_followers_count', 'user_friends_count', 'user_verified',
    'text_length', 'hour', 'week_day', 'day',
]])

# Overwrite columns -11..-2 of every row with the features (the last column is left
# as it was). This relies on those columns holding only padding: the longest training
# tweet measured above is 425 tokens, well short of the 512-wide rows.
# The feature tensor is created on whatever device `input_ids` lives on instead of a
# hard-coded 'cuda', so the assignment also works when the notebook runs on CPU.
input_ids[:, -11:-1] = torch.tensor(features_arr, device=input_ids.device).float()

##### Testing features

In [18]:
# Load the processed evaluation features (presumably written by the preprocessing step).
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
features_test = pd.read_pickle("../Preprocessing/Data/eval_processed.pkl")

In [19]:
### Adding them to tensor
# Select the ten metadata columns, in the order they are written into each row.
features_arr_test = np.array(features_test[[
    'user_statuses_count', 'hashtag_count', 'user_mentions_count',
    'user_followers_count', 'user_friends_count', 'user_verified',
    'text_length', 'hour', 'week_day', 'day',
]])

# Overwrite columns -11..-2 of every row with the features (the last column is left
# as it was). This relies on those columns holding only padding: the longest evaluation
# tweet measured above is 416 tokens, well short of the 512-wide rows.
# The feature tensor is created on whatever device `input_ids_test` lives on instead of
# a hard-coded 'cuda', so the assignment also works when the notebook runs on CPU.
input_ids_test[:, -11:-1] = torch.tensor(features_arr_test, device=input_ids_test.device).float()

### Saving the training and testing tensors:

##### Training Data

In [20]:
# Persist the training tensor (token IDs with the 10 feature columns written in).
# The 'Tensors/Training' directory must already exist.
# NOTE(review): `input_ids` was moved to the GPU earlier, so it is presumably saved as a
# CUDA tensor; loading it on a CPU-only machine may need `map_location` - confirm.
torch.save(input_ids, 'Tensors/Training/Tokens_10features.pt')

In [21]:
# Persist the `labels` tensor built from the 'retweet_count' column of the training data.
# The 'Tensors/Training' directory must already exist.
torch.save(labels, 'Tensors/Training/Retweets.pt')

##### Testing Data

In [22]:
# Persist the evaluation tensor (token IDs with the 10 feature columns written in).
# The 'Tensors/Testing' directory must already exist.
# NOTE(review): `input_ids_test` was moved to the GPU earlier, so it is presumably saved
# as a CUDA tensor; loading it on a CPU-only machine may need `map_location` - confirm.
torch.save(input_ids_test, 'Tensors/Testing/Tokens_10features.pt')

## Links:

1. For a conceptual understanding of BERT, click [here](https://towardsdatascience.com/bert-to-the-rescue-17671379687f).
2. You can also find a worked fine-tuning example [here](https://mccormickml.com/2019/07/22/BERT-fine-tuning/).
3. A simple explanation of BERT is available [here](https://yashuseth.blog/2019/06/12/bert-explained-faqs-understand-bert-working/#:~:text=What%20is%20BERT%3F,task%2Dspecific%20fine%2Dtuning.).