# Produce tweet sentence embeddings

In [1]:
import pandas as pd
import numpy as np
import torch

### If there's a GPU available use it, otherwise use the CPU

In [2]:
# Select the device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    first_gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', first_gpu_name)

No GPU available, using the CPU instead.


### Constants

In [3]:
# Upper bound on the number of tweet rows processed by the batch loop.
N_ROWS = 100
# Number of tweets pushed through the model in one forward pass.
BATCH_SIZE = 100
# PAD = int(0)
# Name used to load the tokenizer, and the path handed to BertModel.from_pretrained.
MODEL = 'bert-base-multilingual-cased'
MODEL_PATH = f'models/{MODEL}'

### Paths

In [4]:
# Output CSV: receives the header plus one sentence-embedding row per tweet.
RESULT_PATH = "embeddings/sentence_embeddings_50k_batch.csv"
# Input CSV: its second comma-separated column holds the tab-separated token ids.
PATH = "tweet_tokens/text_tokens_padded_50k.csv"

### Read the padded tweet length

In [5]:
# The first line of max_tweet_length.txt stores the tweet length as an integer.
with open("max_tweet_length.txt", "r") as length_file:
    first_line = length_file.readline()

MAX_LENGTH = int(first_line)

In [6]:
# Display the length read from max_tweet_length.txt; it is the row width of the batch buffers.
MAX_LENGTH

511

### BERT tokenizer using the 'bert-base-multilingual-cased' vocabulary

In [7]:
from transformers import BertTokenizer

# Load the tokenizer by model name (MODEL); only its vocabulary is inspected in the cells below.
tokenizer = BertTokenizer.from_pretrained(MODEL)

In [8]:
# Number of entries in the tokenizer vocabulary.
vocab_size = len(tokenizer.vocab)

In [9]:
# Display the vocabulary size computed above.
vocab_size

119547

In [10]:
# Look up the id of the [PAD] token. The batch loop builds attention masks
# with `input_tokens > 0`, i.e. it treats id 0 as padding.
tokenizer.vocab['[PAD]']

0

## Create the model from a pre-trained one

In [11]:
from transformers import BertModel

# Build the BERT model from the pre-trained weights referenced by MODEL_PATH.
model = BertModel.from_pretrained(MODEL_PATH)

In [12]:
# Materialise the model's (name, tensor) parameter pairs so they can be sliced.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs a section title with the slice of parameters listed under it.
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-2:]),
]

for title, section_params in sections:
    print(title)

    for name, tensor in section_params:
        # Left-align the parameter name, right-align its shape.
        shape_text = str(tuple(tensor.size()))
        print("{:<55} {:>12}".format(name, shape_text))

The BERT model has 199 different named parameters.

==== Embedding Layer ====

embeddings.word_embeddings.weight                       (119547, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)

==== First Transformer ====

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   

In [13]:
# Tell pytorch to run this model on the GPU.
#model.cuda()

### Function used to save the embeddings to a CSV file

In [14]:
def save_embeddings(index, embs):
    """Write one row per embedding to the module-level ``embeddings_file``.

    A row is the row index, a comma, then every value followed by a TAB
    character (so the row ends with a trailing TAB), then a newline.

    Parameters
    ----------
    index
        Index written for the first embedding; it is incremented by one
        for each following embedding.
    embs
        Iterable of embeddings, each an iterable of values formatted
        with ``str``.
    """
    row_index = index

    for values in embs:
        # Every value, including the last one, is followed by a tab.
        payload = ''.join(str(value) + '\t' for value in values)
        embeddings_file.write(str(row_index) + ',' + payload + '\n')

        row_index += 1

### Helpers used to convert a string of '\t'-separated integers into a numpy array of integers

In [15]:
def f_to_int(x):
    """Convert a single value to ``int``."""
    return int(x)


def f_int(x):
    """Parse a string of tab-separated integers into a numpy array.

    Newline characters are removed before splitting on tabs; every
    resulting field is converted with ``f_to_int``.
    """
    fields = x.replace('\n', '').split('\t')
    return np.array([f_to_int(field) for field in fields])

### Open the needed files

In [16]:
# "w+" creates the result file or truncates an existing one, so each run starts from an empty file.
# The handle is used as a global by save_embeddings and is closed in the last cell.
embeddings_file = open(RESULT_PATH, "w+")

In [17]:
# Write the header of the embeddings file; write() returns the number of characters written.
embeddings_file.write("tweet_features_tweet_id,sentence_embeddings\n")  # two comma-separated columns

44

In [18]:
# Read-only handle over the token CSV; the batch loop consumes it line by line with readline().
tokens_file = open(PATH, "r")

In [19]:
# model.eval()  # optional: from_pretrained() already returns the model in evaluation mode

In [20]:
# torch.cuda.empty_cache()

### Load tweets in chunks, produce and save their sentence embeddings

In [21]:
%%time

# ~20 MINUTES EXECUTION ON THE 50k TWEETS

# Produce and save sentence embeddings, BATCH_SIZE tweets at a time.
#
# Reads up to N_ROWS data lines from `tokens_file`, parses the second
# comma-separated column of each line into token ids, runs the batch through
# the model and writes the second model output (`outputs[1]`) for every tweet
# with `save_embeddings`.
#
# NOTE(review): the value written in the first output column is the running
# row index `i`, not the first column of the input file, although the header
# names it `tweet_features_tweet_id` -- confirm this is intended.

# ignore header
tokens_file.readline()

# The input tensors are moved to `device` below, so the model must live on the
# same device; otherwise the forward pass fails when a GPU is selected.
model.to(device)
# from_pretrained() already returns the model in evaluation mode; set it
# explicitly so this cell does not depend on that.
model.eval()

# Buffers reused for every batch (sized with BATCH_SIZE: the previously used
# CHUNKSIZE is not defined anywhere in the notebook).
input_tokens = np.zeros(shape=(BATCH_SIZE, MAX_LENGTH), dtype=np.int64)
masks = np.zeros(shape=(BATCH_SIZE, MAX_LENGTH), dtype=np.int64)

finished = False
i = 0

while not finished and i < N_ROWS:

    print('chunk : ' + str(int(i/BATCH_SIZE)))

    # BUILD A BATCH
    # `j` counts the rows actually filled in this batch.
    j = 0
    while j < BATCH_SIZE and i + j < N_ROWS:

        line = tokens_file.readline()

        # readline() returns '' only at end of file.
        if line == '':
            finished = True
            break

        input_tokens[j] = f_int(line.split(',')[1])

        # attention mask: 1 for real tokens, 0 for padding (token id 0)
        masks[j] = input_tokens[j] > 0

        j += 1

    # Nothing was read (end of file reached exactly on a batch boundary).
    if j == 0:
        break

    # BUILD PYTORCH TENSORS FOR THE BATCH AND MOVE THEM TO THE DEVICE
    # Only the first `j` rows are used, so stale rows left in the buffers by a
    # previous batch are never embedded or written out.
    input_tensor = torch.tensor(input_tokens[:j]).to(device)
    masks_tensor = torch.tensor(masks[:j]).to(device)

    # PROCESS THE BATCH
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=input_tensor, attention_mask=masks_tensor)

    embeddings = outputs[1].tolist()

    save_embeddings(i, embeddings)

    i += BATCH_SIZE

chunk : 0
CPU times: user 5min 4s, sys: 1min 9s, total: 6min 14s
Wall time: 35.5 s


### Close files

In [22]:
# Release the input handle.
tokens_file.close()

# Closing the output handle flushes any buffered rows to RESULT_PATH.
embeddings_file.close()