## Imports

In [1]:
import torch

from pytorch_transformers import BertTokenizer, BertModel

import pandas as pd

import numpy as np

from tqdm import tqdm

## Loading Tweets

In [2]:
# Read the raw training tweets, then keep only the two columns the rest of
# the notebook uses: the tweet body ('text') and its label ('target').
data = pd.read_csv('data/train.csv')
dataset = data.loc[:, ['text', 'target']]

# Preview the first rows to confirm the file parsed as expected.
print(dataset.head())

                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1


## Loading BERT and Tokenizer

In [3]:
# The tokenizer and the encoder are both loaded from the same pretrained
# checkpoint name, so it is spelled out once here.
PRETRAINED_WEIGHTS = 'bert-base-uncased'

tokenizer: BertTokenizer = BertTokenizer.from_pretrained(PRETRAINED_WEIGHTS)
model = BertModel.from_pretrained(PRETRAINED_WEIGHTS)

# Dump the module tree to inspect the architecture.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

## Tokenizing Each Sentence

In [32]:
# Prompt text and its token ids. Neither is consumed by the per-tweet
# tokenization below.
question = "Is a disaster happening?"
tokenized_question = tokenizer.encode(question, add_special_tokens=True)


def _encode_tweet(text):
    """Return the BERT token ids for `text`, wrapped in [CLS] ... [SEP]."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per tweet, indexed like dataset['text'].
tokenized = dataset['text'].apply(_encode_tweet)

print(tokenized.head())

0    [101, 2256, 15616, 2024, 1996, 3114, 1997, 202...
1    [101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...
2    [101, 2035, 3901, 2356, 2000, 1005, 7713, 1999...
3    [101, 2410, 1010, 2199, 2111, 4374, 1001, 3748...
4    [101, 2074, 2288, 2741, 2023, 6302, 2013, 1009...
Name: text, dtype: object


In [33]:
# Sanity check: map the first tweet's ids back to their word-piece tokens.
test = tokenized.loc[0]
print(tokenizer.convert_ids_to_tokens(test))

['[CLS]', 'our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all', '[SEP]']


## Running Tokens Through BERT

In [35]:
# Run on the GPU when one is present, otherwise fall back to the CPU so the
# cell still executes on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # inference mode: turns off the dropout layers

# One row per tweet. The width is read from the model config instead of a
# hard-coded 768 so a differently sized BERT checkpoint works unchanged.
pooled = np.zeros((len(tokenized), model.config.hidden_size))

with torch.no_grad():  # feature extraction only; no autograd graph needed
    for i, seq in enumerate(tqdm(tokenized, total=len(tokenized))):
        # Shape (1, seq_len): a batch holding a single un-padded sequence.
        prepped_input = torch.tensor(seq, device=device).unsqueeze(0)
        out = model(prepped_input)
        # out[1] is the pooled sentence vector with a leading batch axis of
        # size 1; take row 0 so the shape matches pooled[i] explicitly.
        pooled[i] = out[1][0].cpu().numpy()

  0%|          | 0/7613 [00:00<?, ?it/s]

100%|██████████| 7613/7613 [01:22<00:00, 92.02it/s]


## Saving the Embeddings with Labels and Text

In [36]:
# One column per embedding dimension, followed by the label and the raw tweet
# text (both aligned to the embedding rows by index).
new_data = pd.DataFrame(pooled).assign(
    target=dataset['target'],
    text=dataset['text'],
)

print(new_data.head())

# index=False keeps the row index out of the written file.
new_data.to_csv('data/processed_bert.csv', index=False)


          0         1         2         3         4         5         6  \
0 -0.822546 -0.472057 -0.579868  0.601334  0.001131 -0.033307  0.621883   
1 -0.932106 -0.444832 -0.946184  0.783482  0.649199 -0.417868  0.810984   
2 -0.786213 -0.454594 -0.989117  0.766382  0.825029 -0.260707  0.601579   
3 -0.929848 -0.641187 -0.984187  0.838084  0.698074 -0.439465  0.848216   
4 -0.782167 -0.475203 -0.785371  0.579609  0.718675 -0.169704  0.333027   

          7         8         9  ...       760       761       762       763  \
0  0.219169 -0.297440 -0.999933  ...  0.746353  0.789280  0.454223  0.652186   
1  0.553593 -0.841144 -0.999991  ...  0.972591  0.666550 -0.809021  0.077952   
2  0.324533 -0.961853 -0.999995  ...  0.992954  0.505823 -0.130431 -0.429334   
3  0.544355 -0.940431 -0.999996  ...  0.995336  0.807030 -0.736760 -0.092651   
4  0.279532 -0.594330 -0.999871  ...  0.968012  0.792530  0.080506  0.567232   

        764       765       766       767  target  \
0  0.384898 -0.