## Imports

In [1]:
import torch

from pytorch_transformers import BertTokenizer, BertModel

import pandas as pd

import numpy as np

from tqdm import tqdm

## Loading Tweets

In [2]:
# Read the training CSV, then narrow it to the two columns used below.
columns_of_interest = ['text', 'target']
data = pd.read_csv('data/train.csv')
dataset = data[columns_of_interest]

# Quick look at the first rows of the narrowed frame.
print(dataset.head())

                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1


## Loading BERT and Tokenizer

In [3]:
# The tokenizer and the encoder are both created from the same checkpoint name.
pretrained_name = 'bert-base-uncased'
tokenizer: BertTokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertModel.from_pretrained(pretrained_name)

# Printing the module lists its layer structure.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

## Tokenizing Each Sentence

In [32]:
def _encode_with_special_tokens(text):
    """Encode ``text`` with the notebook's tokenizer, passing ``add_special_tokens=True``."""
    return tokenizer.encode(text, add_special_tokens=True)


# A fixed question string is encoded with the same settings as the tweets.
question = "Is a disaster happening?"
tokenized_question = _encode_with_special_tokens(question)

# Encode every entry of the 'text' column; each result is a list of token ids.
tokenized = dataset['text'].apply(_encode_with_special_tokens)

print(tokenized.head())

0    [101, 2256, 15616, 2024, 1996, 3114, 1997, 202...
1    [101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...
2    [101, 2035, 3901, 2356, 2000, 1005, 7713, 1999...
3    [101, 2410, 1010, 2199, 2111, 4374, 1001, 3748...
4    [101, 2074, 2288, 2741, 2023, 6302, 2013, 1009...
Name: text, dtype: object


In [33]:
# Sanity check: map the ids of the entry at index 0 back to token strings.
test = tokenized[0]
test_tokens = tokenizer.convert_ids_to_tokens(test)
print(test_tokens)

['[CLS]', 'our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all', '[SEP]']


## Running Tokens Through BERT

In [34]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer crashes on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # inference mode (turns off the Dropout layers)

# One row per tokenized sequence. The row width is read from the model config
# instead of being hard-coded (it is 768 for bert-base-uncased).
pooled = np.zeros((len(tokenized), model.config.hidden_size))

with torch.no_grad():  # feature extraction only; no gradients are needed
    for i, seq in enumerate(tqdm(tokenized, total=len(tokenized))):
        # Each sequence is fed on its own as a batch of one, so no padding or
        # attention mask is involved.
        prepped_input = torch.tensor(seq, dtype=torch.long, device=device).unsqueeze(0)
        out = model(prepped_input)
        # out[1] is the pooled output in pytorch_transformers' BertModel return
        # tuple; drop the batch axis and store it as row i.
        pooled[i] = out[1].squeeze(0).cpu().numpy()

  7%|▋         | 501/7613 [00:05<01:22, 85.85it/s]


KeyboardInterrupt: 

## Saving the Embeddings with Labels and Text

In [31]:
# Guard against saving a half-computed feature matrix. `pooled` is created as
# all zeros, so if the embedding loop was interrupted the unprocessed rows are
# still all-zero and would otherwise be written to disk without any warning.
if not np.any(pooled, axis=1).all():
    raise ValueError(
        'pooled contains all-zero rows; re-run the embedding cell to completion before saving'
    )

new_data = pd.DataFrame(pooled)

# Attach labels and text by position: row i of `pooled` was computed from the
# i-th tweet. Assigning the Series directly would align on index labels and
# produce NaNs if `dataset` ever carries a non-default index.
new_data['target'] = dataset['target'].values
new_data['text'] = dataset['text'].values

print(new_data.head())

new_data.to_csv('data/processed_bert.csv', index=False)


          0         1         2         3         4         5         6  \
0 -0.988797 -0.892320 -0.999791  0.988228  0.962171 -0.736922  0.991268   
1 -0.885961 -0.262572 -0.776960  0.680914  0.348085 -0.260586  0.694603   
2 -0.962655 -0.742958 -0.997527  0.950694  0.933463 -0.574012  0.972405   
3 -0.988993 -0.807187 -0.998936  0.981410  0.918217 -0.686523  0.989779   
4 -0.952788 -0.628391 -0.906801  0.873680  0.662133 -0.338079  0.936946   

          7         8         9  ...       760       761       762       763  \
0  0.809517 -0.998878 -1.000000  ...  0.999906  0.938417 -0.740743 -0.813820   
1  0.424739 -0.245139 -0.999953  ...  0.750372  0.690344 -0.544250  0.670330   
2  0.651304 -0.991244 -1.000000  ...  0.998723  0.850846 -0.517664 -0.670394   
3  0.723451 -0.992681 -1.000000  ...  0.999650  0.949199 -0.817492 -0.625805   
4  0.438091 -0.823240 -0.999997  ...  0.972636  0.897335  0.041144  0.181147   

        764       765       766       767  target  \
0  0.895802 -0.