In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics.functional import accuracy, f1_score, auroc


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
task_df = pd.read_csv('../res/preprocessed/task1_2/task1_2.csv')
task_df = task_df.dropna()
task_df.isnull().sum()


text    0
HOF     0
NOT     0
NONE    0
OFFN    0
PRFN    0
dtype: int64

In [3]:

train_df, val_df = train_test_split(task_df, test_size=0.3)
# Divide validation df to validation and test dataframes
val_df, test_df = train_test_split(val_df, test_size=0.5)
print(len(train_df))
print(len(val_df))
print(len(test_df))


2594
556
556


In [4]:
LABEL_COLUMNS = list(train_df.columns)
LABEL_COLUMNS.remove('text')

TASK1_LABELS = LABEL_COLUMNS[:2]
TASK2_LABELS = LABEL_COLUMNS[2:]


task1_id2label = {idx: label for idx, label in enumerate(TASK1_LABELS)}
task1_label2id = {label: idx for idx, label in enumerate(TASK1_LABELS)}

task2_label2id = {label: idx for idx, label in enumerate(TASK2_LABELS)}
task2_id2label = {idx: label for idx, label in enumerate(TASK2_LABELS)}

print(task1_id2label)
print(task1_label2id)
print(task2_id2label)
print(task2_label2id)

{0: 'HOF', 1: 'NOT'}
{'HOF': 0, 'NOT': 1}
{0: 'NONE', 1: 'OFFN', 2: 'PRFN'}
{'NONE': 0, 'OFFN': 1, 'PRFN': 2}


In [5]:
class TwitterDataset(Dataset):
    def __init__(self, data_df, tokenizer, max_token_len: 256, batch_size=16):
        self.data = data_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __getitem__(self, idx):
        data_row = self.data.iloc[idx]
        text = data_row.text
        labels1 = data_row[TASK1_LABELS]
        labels2 = data_row[TASK2_LABELS]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return dict(
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding['attention_mask'].flatten(),
            labels1=torch.FloatTensor(labels1),
            labels2=torch.FloatTensor(labels2)
        )

    def __len__(self):
        return len(self.data)


In [6]:
BERT_MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

train_dataset = TwitterDataset(train_df, tokenizer, max_token_len=256)
test_dataset = TwitterDataset(test_df, tokenizer, max_token_len=256)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [7]:
sample_batch = iter(test_loader).next()
sample_batch


{'input_ids': tensor([[  101,  1045, 24637,  ...,     0,     0,     0],
         [  101,  2017,  2024,  ...,     0,     0,     0],
         [  101,  1045,  2215,  ...,     0,     0,     0],
         ...,
         [  101, 16245,  4679,  ...,     0,     0,     0],
         [  101,  2031,  2017,  ...,     0,     0,     0],
         [  101,  2065,  2017,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels1': tensor([[0., 1.],
         [1., 0.],
         [1., 0.],
         [0., 1.],
         [0., 1.],
         [1., 0.],
         [1., 0.],
         [1., 0.]]),
 'labels2': tensor([[1., 0., 0.],
         [0., 1., 0.],
         [0., 0., 1.],
         [1., 0., 0.],
         [1., 0., 0.],
         [0., 0., 1.],
         [0., 0., 0.],
         [0., 0., 1.]])}

In [8]:
sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape


(torch.Size([8, 256]), torch.Size([8, 256]))

In [9]:
num_workers = 0

In [10]:

class TwitterDataModule(pl.LightningDataModule):
    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size=12, max_token_len=256):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = TwitterDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )
        self.val_dataset = TwitterDataset(
            self.val_df,
            self.tokenizer,
            self.max_token_len
        )
        self.test_dataset = TwitterDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=num_workers,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=num_workers,
        )


In [11]:
N_EPOCHS = 2
BATCH_SIZE = 12
MAX_TOKEN_COUNT = 256

data_module = TwitterDataModule(
    train_df,
    val_df,
    test_df,
    tokenizer,
    batch_size=config["batch_size"],
    max_token_len=config["max_token_count"]
)
data_module.setup()


In [12]:
class TwitterNeuralNet(pl.LightningModule):
    def __init__(self, task1_n_classes: int, task2_n_classes: int,n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = AutoModel.from_pretrained(
            BERT_MODEL_NAME, return_dict=True)
        self.hidden = nn.Linear(self.bert.config.hidden_size,
                                self.bert.config.hidden_size)
                    
        self.task1_classifier = nn.Linear(self.bert.config.hidden_size, task1_n_classes)

        self.task2_classifier = nn.Linear(self.bert.config.hidden_size, task2_n_classes)

        torch.nn.init.xavier_uniform_(self.hidden.weight)
        torch.nn.init.xavier_uniform_(self.task1_classifier.weight)
        torch.nn.init.xavier_uniform_(self.task2_classifier.weight)

        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCEWithLogitsLoss(reduction='mean')
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        # print(input_ids.shape)
        # print(attention_mask.shape)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        pooled_output = self.hidden(pooled_output)
        # pooled_output = self.dropout(pooled_output)
        pooled_output = F.relu(pooled_output)
        
        output1 = self.task1_classifier(pooled_output)
        output2 = self.task2_classifier(pooled_output)
         
        loss = 0
        if labels is not None:
            loss1 = self.criterion(output1, labels['labels1'])
            loss2 = self.criterion(output2, labels['labels2'])
            loss = loss1+loss2
            
        return loss, [output1, output2]

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = {}
        labels['labels1'] = batch["labels1"]
        labels['labels2'] = batch["labels2"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = {}
        labels['labels1'] = batch["labels1"]
        labels['labels2'] = batch["labels2"]
        loss,outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": labels}

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return outputs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )


In [13]:
steps_per_epoch = len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
steps_per_epoch, total_training_steps


(216, 432)

In [14]:
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(86, 432)

In [15]:
model = TwitterNeuralNet(
    task1_n_classes=len(TASK1_LABELS),
    task2_n_classes=len(TASK2_LABELS),
    n_warmup_steps=warmup_steps,
    n_training_steps=total_training_steps
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
_, predictions = model(
    sample_batch["input_ids"], sample_batch["attention_mask"])
predictions


[tensor([[-0.2434,  0.1023],
         [-0.0088,  0.0569],
         [ 0.1594, -0.1648],
         [ 0.0766, -0.0815],
         [-0.1504, -0.0257],
         [ 0.0506, -0.0468],
         [-0.1782,  0.1036],
         [-0.1891, -0.0928]], grad_fn=<AddmmBackward0>),
 tensor([[-0.3534, -0.2036,  0.4339],
         [-0.2159, -0.1994,  0.4214],
         [-0.4694, -0.2018,  0.1452],
         [-0.2388, -0.2643,  0.2971],
         [-0.1931, -0.1378,  0.6452],
         [-0.2965, -0.3154,  0.4177],
         [-0.2580, -0.2072,  0.2522],
         [-0.3008, -0.3383,  0.2573]], grad_fn=<AddmmBackward0>)]

In [17]:
logger = TensorBoardLogger("./lightning_logs",
                           'twitter-task-all')

In [18]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)


In [19]:
trainer = pl.Trainer(
   max_epochs=N_EPOCHS,
   gpus=1,
   progress_bar_refresh_rate=3,
   logger=logger,
   checkpoint_callback=checkpoint_callback, num_sanity_val_steps=10,
   fast_dev_run=False,
   )



trainer.fit(model, data_module)


  rank_zero_deprecation(
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type              | Params
-------------------------------------------------------
0 | bert             | BertModel         | 109 M 
1 | hidden           | Linear            | 590 K 
2 | task1_classifier | Linear            | 1.5 K 
3 | task2_classifier | Linear            | 2.3 K 
4 | criterion        | BCEWithLogitsLoss | 0     
5 | dropout          | Dropout           | 0     
-------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
440.307   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/10 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                             

  rank_zero_warn(


Epoch 1: 100%|██████████| 264/264 [05:52<00:00,  1.34s/it, loss=0.394, v_num=11, train_loss=0.279, val_loss=0.487] 


In [20]:
import os
import torch

PATH = './saved_model'
torch.save(model.state_dict(), os.path.join(PATH, "model.pth"))


In [21]:
import copy
trained_model = copy.deepcopy(model)
trained_model.eval()
trained_model.freeze()


In [22]:
test_text = 'Hey asshole, go kill yourself'
test_text = tokenizer.encode_plus(
    test_text,
    add_special_tokens=True,
    max_length=256,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
with torch.no_grad():
    test_input_ids, test_att_mask = test_text['input_ids'], test_text['attention_mask']
    _, output = trained_model(test_input_ids, test_att_mask)

# _, output = model(test_input_ids.unsqueeze(0), test_att_mask.unsqueeze(0),None)
print('TASK 1')
print(output[0])
print(task1_id2label)
print(task1_id2label[int(torch.argmax(output[0]))])

print('TASK 2')
print(output[1])
print(task2_id2label)
print(task2_id2label[int(torch.argmax(output[1]))])



TASK 1
tensor([[ 4.2146, -3.7787]])
{0: 'HOF', 1: 'NOT'}
HOF
TASK 2
tensor([[-4.3500, -1.2555,  1.3852]])
{0: 'NONE', 1: 'OFFN', 2: 'PRFN'}
PRFN


In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

task1_predictions = []
task1_labels = []

task2_predictions = []
task2_labels = []

counter = 0
task1_true_counter = 0
task2_true_counter = 0
with torch.no_grad():
    for idx, item in enumerate(test_dataset):
        _, prediction = trained_model(
            item['input_ids'].unsqueeze(dim=0).to(device),
            item['attention_mask'].unsqueeze(dim=0).to(device)
        )
    
        task1_predictions.append(prediction[0].flatten())
        task2_predictions.append(prediction[1].flatten())

        task1_true_label = task1_id2label[int(torch.argmax(item['labels1']))]
        task2_true_label = task2_id2label[int(torch.argmax(item['labels2']))]



        task1_guess_label = task1_id2label[int(torch.argmax(task1_predictions[counter]))]
        task2_guess_label = task2_id2label[int(
            torch.argmax(task2_predictions[counter]))]
        

        if counter % 20 == 0:
          text_val = test_df.iloc[idx]['text']
          print(text_val)
          print(f'TASK 1 \
              True: {task1_true_label} |  Prediction: {task1_guess_label}')
          print(item['labels1'])
          print(prediction[0])
          print(f'TASK 2 \
              True: {task2_true_label} |  Prediction: {task2_guess_label}')
          print(item['labels2'])
          print(prediction[1])
          print('--------------------')
        
        if task1_true_label == task1_guess_label:
            task1_true_counter += 1
        if task2_true_label == task2_guess_label:
            task2_true_counter += 1
        
        task1_labels.append(item['labels1'].int())
        task2_labels.append(item['labels2'].int())
        counter += 1

print(f'Accuracy Task1: {task1_true_counter/len(test_dataset)}')
print(f'Accuracy Task2: {task2_true_counter/len(test_dataset)}')


i texted my boss that i am sick and he never replied lol
TASK 1               True: NOT |  Prediction: NOT
tensor([0., 1.])
tensor([[-2.6155,  2.3481]], device='cuda:0')
TASK 2               True: NONE |  Prediction: NONE
tensor([1., 0., 0.])
tensor([[ 1.5972, -3.2779, -2.4990]], device='cuda:0')
--------------------
there is not much in the barrel to scrape personally i thought maybot would have chosen chris grayling
TASK 1               True: NOT |  Prediction: NOT
tensor([0., 1.])
tensor([[-3.2557,  3.2370]], device='cuda:0')
TASK 2               True: NONE |  Prediction: NONE
tensor([1., 0., 0.])
tensor([[ 3.2259, -3.6569, -4.0340]], device='cuda:0')
--------------------
young kids abuse pm modi in priyanka vadras presence
TASK 1               True: NOT |  Prediction: NOT
tensor([0., 1.])
tensor([[-1.8651,  1.8223]], device='cuda:0')
TASK 2               True: NONE |  Prediction: NONE
tensor([1., 0., 0.])
tensor([[ 1.4282, -2.7726, -4.2780]], device='cuda:0')
--------------------
y

In [24]:
task1_predictions_ = torch.stack(task1_predictions).detach().cpu()
task1_labels_ = torch.stack(task1_labels).detach().cpu()

task2_predictions_ = torch.stack(task2_predictions).detach().cpu()
task2_labels_ = torch.stack(task2_labels).detach().cpu()

In [25]:
THRESHOLD = 0.5

task1_accuracy_score = accuracy(
    task1_predictions_, task1_labels_, threshold=THRESHOLD)

task2_accuracy_score = accuracy(
    task2_predictions_, task2_labels_, threshold=THRESHOLD)

print(f'task1_accuracy_score: {task1_accuracy_score}')
print(f'task2_accuracy_score: {task2_accuracy_score}')


task1_accuracy_score: 0.8965827226638794
task2_accuracy_score: 0.8986810445785522


In [26]:
ksdk'askl;df

SyntaxError: EOL while scanning string literal (729958493.py, line 1)

In [None]:
import os
import torch

PATH = './saved_model'
sample_input_ids = train_dataset[0]['input_ids']
# print(sample_input_ids)
sample_att_mask = train_dataset[0]['attention_mask']
# print(sample_att_mask)

torch.jit.save(torch.jit.trace(model,(sample_input_ids, sample_att_mask) ), os.path.join(PATH, "model.pth"))


  if hasattr(mod, name):
  item = getattr(mod, name)
  if hasattr(mod, name):
  item = getattr(mod, name)


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
import torch
from transformers import AutoTokenizer

PATH = './saved_model'
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

loaded_model = torch.load(os.path.join(PATH, "model.pth"))

In [None]:
test_text = 'Hey asshole, Go kill yourself'
test_text = tokenizer.encode_plus(
    test_text,
    add_special_tokens=True,
    max_length=256,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

with torch.no_grad():
    test_input_ids, test_att_mask = test_text['input_ids'], test_text['attention_mask']
    _, output = loaded_model(test_input_ids, test_att_mask)

# _, output = model(test_input_ids.unsqueeze(0), test_att_mask.unsqueeze(0),None)
print('TASK 1')
print(output[0])
print(task1_id2label)
print(task1_id2label[int(torch.argmax(output[0]))])

print('TASK 2')
print(output[1])
print(task2_id2label)
print(task2_id2label[int(torch.argmax(output[1]))])


TypeError: 'collections.OrderedDict' object is not callable