# Imports

In [1]:
import torch
from torch.utils.data import DataLoader
from torch import nn

from transformers import BertTokenizer, BertModel

import pandas as pd

import numpy as np

from tqdm import tqdm

from NNs import BERTFinetune, TextDataSet

from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

# Loading Tweets

In [2]:
# Read the raw training CSV, then keep only the tweet text and its `target` column.
data = pd.read_csv('data/train.csv')
kept_columns = ['text', 'target']
dataset = data.loc[:, kept_columns]

# Quick sanity preview of the first rows.
print(dataset.head())

                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1


# Creating Datasets

In [3]:
# Fix the seed so the hold-out split below is the same on every
# Restart & Run All. torch's global RNG is seeded as well so that
# torch-driven randomness later in the notebook starts from a known state.
SEED = 42
torch.manual_seed(SEED)

# Passed to TextDataSet together with the tokenizer.
# NOTE(review): presumably the maximum token length per example — confirm in NNs.TextDataSet.
max_len = 512
encoder = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Materialise the columns once; the same lists feed the split and the full dataset.
texts = dataset['text'].tolist()
targets = dataset['target'].tolist()

# 80/20 hold-out split; random_state pins which rows land on each side.
train_x, test_x, train_y, test_y = train_test_split(
    texts, targets, test_size=0.2, random_state=SEED
)

full_dataset = TextDataSet(texts, targets, encoder, max_len)
train_dataset = TextDataSet(train_x, train_y, encoder, max_len)
test_dataset = TextDataSet(test_x, test_y, encoder, max_len)

# full_loader is not shuffled: the embedding-export cell attaches `target` and
# `text` back onto the extracted features by row position.
full_loader = DataLoader(full_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

## Setting which layers are fine-tuned

In [4]:
# NOTE(review): arguments are presumably (pretrained model name, number of
# classes, dropout probability) — confirm against NNs.BERTFinetune.
model = BERTFinetune('bert-base-uncased', 2, 0.1)

# Freeze every parameter under the `bert` prefix except those of the last
# encoder block (layer index 11). Parameters outside the `bert` prefix are
# left untouched.
#
# The full 'encoder.layer.11.' segment is matched instead of the bare
# substring '11', so an unrelated parameter name that happens to contain
# "11" cannot be left trainable by accident.
# To fine-tune only the pooler instead, set TRAINABLE_BERT_SEGMENT = 'pooler'.
TRAINABLE_BERT_SEGMENT = 'encoder.layer.11.'

for name, param in model.named_parameters():
    if name.startswith('bert') and TRAINABLE_BERT_SEGMENT not in name:
        param.requires_grad = False
    # Print the final flag for every parameter so the freeze can be checked by eye.
    print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

## Training

In [5]:
# Loss function and optimiser used for fine-tuning.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
model.finetune()

torch.cuda.empty_cache()

epochs = 5


def _forward_batch(batch):
    """Move one batch to `device`, run the model, and return (output, labels, predictions).

    Predictions are the argmax of the model output along dim 1.
    """
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    scores = model(ids, mask)
    return scores, labels, torch.argmax(scores, dim=1)


for epoch in range(epochs):
    # ---- training pass over train_loader ----
    model.train()
    total_loss, correct, total = 0, 0, 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        output, y, preds = _forward_batch(batch)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        # Running totals: summed batch loss and number of correct predictions.
        total_loss += loss.item()
        correct += (preds == y).sum().item()
        total += y.size(0)
    print(f'Epoch: {epoch}, Loss: {total_loss}, Accuracy: {correct/total}')

    # ---- evaluation pass over test_loader (no gradients) ----
    model.eval()
    with torch.no_grad():
        correct, total = 0, 0
        for batch in test_loader:
            _, y, preds = _forward_batch(batch)
            correct += (preds == y).sum().item()
            total += y.size(0)
        print(f'Accuracy: {correct/total}')

100%|██████████| 381/381 [02:37<00:00,  2.42it/s]

Epoch: 0, Loss: 204.70671164989471, Accuracy: 0.7435139573070607





Accuracy: 0.7971109652002626


100%|██████████| 381/381 [02:22<00:00,  2.67it/s]

Epoch: 1, Loss: 160.2280105650425, Accuracy: 0.8203612479474548





Accuracy: 0.7984241628365069


100%|██████████| 381/381 [02:23<00:00,  2.65it/s]

Epoch: 2, Loss: 152.21619949489832, Accuracy: 0.8307060755336617





Accuracy: 0.8063033486539725


100%|██████████| 381/381 [02:24<00:00,  2.64it/s]

Epoch: 3, Loss: 145.90655648708344, Accuracy: 0.8397372742200329





Accuracy: 0.8102429415627052


100%|██████████| 381/381 [02:24<00:00,  2.64it/s]

Epoch: 4, Loss: 140.5271005704999, Accuracy: 0.8461412151067323





Accuracy: 0.8030203545633617


## Creating embeddings from the fine-tuned model

In [6]:
# NOTE(review): feature_extractor() presumably switches the model's forward
# pass to return embeddings instead of class scores — confirm in NNs.BERTFinetune.
model.feature_extractor()
model.eval()

# One tensor per batch; kept in a separate name from the concatenated result.
batch_features = []

with torch.no_grad():
    for batch in tqdm(full_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # Move each batch to host memory immediately so the extracted features
        # do not accumulate on the accelerator for the whole pass.
        batch_features.append(output.cpu())

data_features = torch.cat(batch_features, dim=0)

# full_loader iterates full_dataset without shuffling, so row i of the features
# must correspond to row i of `dataset`; fail loudly if the counts disagree.
assert data_features.shape[0] == len(dataset), (
    f'feature rows ({data_features.shape[0]}) != dataset rows ({len(dataset)})'
)

new_data = pd.DataFrame(data_features.numpy())

# Attach labels and text by position rather than by pandas index alignment,
# which would silently insert NaN if `dataset` ever had a non-default index.
new_data['target'] = dataset['target'].to_numpy()
new_data['text'] = dataset['text'].to_numpy()

print(new_data.head())

# NOTE(review): features are exported for every row, including rows the model
# was fine-tuned on; downstream evaluation on this file should respect the
# same train/test split.
new_data.to_csv('data/processed_fine_encode1_bert.csv', index=False)



  0%|          | 0/119 [00:00<?, ?it/s]

100%|██████████| 119/119 [02:27<00:00,  1.24s/it]


          0         1         2         3         4         5         6  \
0  0.165505 -0.464544 -0.878968  0.104093  0.144859 -0.010218 -0.793602   
1  0.793801  0.142984 -0.994426 -0.622726  0.564186 -0.093283 -0.977231   
2  0.166598 -0.129740 -0.992731  0.356827  0.712300 -0.244056 -0.610771   
3  0.759550 -0.076369 -0.993795 -0.533363  0.214771  0.063852 -0.971475   
4  0.628048 -0.105182 -0.773249 -0.359005  0.464331  0.129426 -0.944774   

          7         8         9  ...       760       761       762       763  \
0 -0.160206 -0.674431 -0.295863  ...  0.949632 -0.511951  0.844009  0.084652   
1  0.011551 -0.950209  0.977353  ...  0.972372 -0.979019 -0.040529 -0.939062   
2 -0.015555 -0.972561 -0.943080  ...  0.992952 -0.657642  0.653601 -0.829776   
3 -0.081547 -0.952727  0.963216  ...  0.982699 -0.940269  0.349705 -0.877414   
4 -0.189503 -0.542479  0.894093  ...  0.951401 -0.650880  0.848035 -0.007230   

        764       765       766       767  target  \
0 -0.691045 -0.