In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics.functional import accuracy, f1_score, auroc


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
task_df = pd.read_csv('../../res/preprocessed/task2/task2.csv')
task_df = task_df.dropna()
task_df.isnull().sum()

text    0
HATE    0
NONE    0
OFFN    0
PRFN    0
dtype: int64

In [3]:

train_df, val_df = train_test_split(task_df, test_size=0.3)
# Divide validation df to validation and test dataframes
val_df, test_df = train_test_split(val_df, test_size=0.5)
print(len(train_df))
print(len(val_df))
print(len(test_df))


2594
556
556


In [4]:
LABEL_COLUMNS = list(train_df.columns)
LABEL_COLUMNS.remove('text')

id2label = {idx: label for idx, label in enumerate(LABEL_COLUMNS)}
label2id = {label: idx for idx, label in enumerate(LABEL_COLUMNS)}

print(id2label)
print(label2id)


{0: 'HATE', 1: 'NONE', 2: 'OFFN', 3: 'PRFN'}
{'HATE': 0, 'NONE': 1, 'OFFN': 2, 'PRFN': 3}


In [5]:
class TwitterDataset(Dataset):
    def __init__(self, data_df, tokenizer, max_token_len: 256, batch_size=16):
        self.data = data_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __getitem__(self, idx):
        data_row = self.data.iloc[idx]
        text = data_row.text
        labels = data_row[LABEL_COLUMNS]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return dict(
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding['attention_mask'].flatten(),
            labels=torch.FloatTensor(labels)
        )

    def __len__(self):
        return len(self.data)


In [6]:
BERT_MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

train_dataset = TwitterDataset(train_df, tokenizer, max_token_len=256)
test_dataset = TwitterDataset(test_df, tokenizer, max_token_len=256)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [9]:
sample_batch = iter(test_loader).next()
sample_batch

{'input_ids': tensor([[ 101, 6616, 2023,  ...,    0,    0,    0],
         [ 101, 2017, 2024,  ...,    0,    0,    0],
         [ 101, 2028, 2518,  ...,    0,    0,    0],
         ...,
         [ 101, 1996, 1996,  ...,    0,    0,    0],
         [ 101, 2005, 2469,  ...,    0,    0,    0],
         [ 101, 2043, 2017,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.]])}

In [10]:
sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape

(torch.Size([8, 256]), torch.Size([8, 256]))

In [11]:
model = AutoModel.from_pretrained(BERT_MODEL_NAME)
output = model(sample_batch["input_ids"], sample_batch["attention_mask"])
output.last_hidden_state.shape, output.pooler_output.shape


(torch.Size([8, 256, 768]), torch.Size([8, 768]))

In [12]:
class TwitterDataModule(pl.LightningDataModule):
    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size=8, max_token_len=128):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = TwitterDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )
        self.val_dataset = TwitterDataset(
            self.val_df,
            self.tokenizer,
            self.max_token_len
        )
        self.test_dataset = TwitterDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            # num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            # num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            # num_workers=4
        )


In [13]:
N_EPOCHS = 2
BATCH_SIZE = 12
MAX_TOKEN_COUNT = 256
data_module = TwitterDataModule(
    train_df,
    val_df,
    test_df,
    tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_COUNT
)
data_module.setup()

In [14]:
class TwitterNeuralNet(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = AutoModel.from_pretrained(
            BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        pooled = torch.mean(output.last_hidden_state,1)
        output = self.classifier(pooled)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss,prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": labels}

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return outputs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )


In [15]:
steps_per_epoch = len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
steps_per_epoch, total_training_steps

(216, 432)

In [16]:
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps


(86, 432)

In [17]:
model = TwitterNeuralNet(
    n_classes=len(LABEL_COLUMNS),
    n_warmup_steps=warmup_steps,
    n_training_steps=total_training_steps
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
_, predictions = model(
    sample_batch["input_ids"], sample_batch["attention_mask"])
# predictions


In [19]:
logger = TensorBoardLogger("lightning_logs",
 'twitter-task2')

In [20]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)


In [21]:
trainer = pl.Trainer(
    max_epochs=N_EPOCHS, gpus=1,
    progress_bar_refresh_rate=3,
    logger=logger,
  checkpoint_callback=checkpoint_callback,)


  rank_zero_deprecation(
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
trainer.fit(model, data_module)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 109 M 
1 | classifier | Linear    | 3.1 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.941   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:01<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 1: 100%|██████████| 264/264 [06:04<00:00,  1.38s/it, loss=0.169, v_num=4, train_loss=0.045, val_loss=0.259] 


In [23]:
trainer.test(model, data_module.test_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 47/47 [00:11<00:00,  3.98it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.2326652705669403
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.2326652705669403}]

### EXPORT MODEL

In [None]:
PATH = './saved_model'
torch.save(model.state_dict(), PATH)

### EVALUATE MODEL

In [32]:
import copy
trained_model = copy.deepcopy(model)
trained_model.eval()
trained_model.freeze()

In [None]:
test_text = 'Die you useless piece of trash'
test_text = tokenizer.encode_plus(
    test_text,
    add_special_tokens=True,
    max_length=256,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

test_input_ids, test_att_mask = test_text['input_ids'], test_text['attention_mask']
_, output = trained_model(test_input_ids, test_att_mask)

# _, output = model(test_input_ids.unsqueeze(0), test_att_mask.unsqueeze(0),None)
print(output)
print(id2label)
print(id2label[int(torch.argmax(output))])


In [54]:
len(test_dataset)

556

In [87]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

predictions = []
labels = []

counter = 0
true_counter = 0

for idx, item in enumerate(test_dataset):
    _, prediction = trained_model(
       item['input_ids'].unsqueeze(dim=0).to(device),
       item['attention_mask'].unsqueeze(dim=0).to(device)
    )
    predictions.append(prediction.flatten())
    true_label = id2label[int(torch.argmax(item['labels']))]
    guess_label = id2label[int(torch.argmax(predictions[counter]))]
    
    if counter % 50 == 0:
      text_val = test_df.iloc[idx]['text']
      print(f'{true_label} | {guess_label} {text_val}')
    if true_label==guess_label:
        true_counter+=1
   
    labels.append(item['labels'].int())
    counter += 1

print(f'Accuracy: {true_counter/len(test_dataset)}')

PRFN | PRFN fuck this
NONE | NONE the mother of dragons
OFFN | NONE as in  some people need deliverance
NONE | NONE  scroll downkeep scrollingnow you are obligated to stre
NONE | NONE days of work down more to go send help and chocolate x
NONE | PRFN i sholl was fixing to ssy damn sis big ole freak
NONE | PRFN  me tries to sneak a vape elicia are you fucking kidding me are you vaping during this loona session i have never bee
NONE | NONE  scroll downkeep scrollingnow you are obligated to re
PRFN | PRFN  its too late for all that lovey dovey shit
NONE | NONE i will make you trust me and when you have told me everything i need to know i will kill you light
PRFN | PRFN i forgot how much buying concert tickets stresses me the fuck out
PRFN | PRFN fuck yeah it is how stupid of me sorry was just rage tweeting n not paying attention
Accuracy: 0.8309352517985612


In [83]:
predictions_ = torch.stack(predictions).detach().cpu()
labels_ = torch.stack(labels).detach().cpu()


THRESHOLD = 0.5

accuracy_score = accuracy(predictions_, labels_, threshold=THRESHOLD)
accuracy_score

tensor(0.9173)

In [38]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs/twitter-task2
%matplotlib inline


Reusing TensorBoard on port 6006 (pid 7180), started 5:40:18 ago. (Use '!kill 7180' to kill it.)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)



#### TRAINING


In [None]:
# import pickle
# filename = './saved_models/task1_model.pkl'
# outfile = open(filename, 'wb')
# pickle.dump(model, outfile)
# outfile.close()


In [None]:
# import torch
# import pickle
# from transformers import DistilBertTokenizerFast
# import torch.nn.functional as F

# device = torch.device(
#     'cuda') if torch.cuda.is_available() else torch.device('cpu')
# model.to(device)

# pickle_in = open('./saved_models/task1_model.pkl', 'rb')
# model = pickle.load(pickle_in)
# model = model.to(device)

# model_name = "distilbert-base-uncased"
# tokenizer = AutoModel.from_pretrained(model_name)


In [None]:
# sample_test = ['All asian people are criminals'.lower()]
# sample_test = tokenizer(sample_test, padding=True, truncation=True,
#                         max_length=512, return_tensors='pt').to(device)

# with torch.no_grad():
#     outputs = model(**sample_test)
#     prediction_batch = F.softmax(outputs.logits, dim=1)
#     prediction_batch_label_id = torch.argmax(prediction_batch, dim=1)
#     sample_test_prediction_label = model.config.id2label[int(
#         prediction_batch_label_id[0])]
#     print(sample_test_prediction_label)


HOF


In [None]:
# import torch
# import torch.nn.functional as F

# with torch.no_grad():
#     n_correct = 0
#     n_samples = 0

#     for item in test_loader:

#         test_batch_text = item['input_ids']
#         test_batch_attention_mask = item['attention_mask']
#         test_batch_label_id = item['labels']
#         test_batch_text = test_batch_text.to(device)
#         test_batch_attention_mask = test_batch_attention_mask.to(device)
#         test_batch_label_id = test_batch_label_id.to(device)

#         outputs = model(test_batch_text)
#         prediction_batch = F.softmax(outputs.logits, dim=1)
#         prediction_batch_confidence = torch.max(prediction_batch, dim=1)
#         # print(prediction_batch)
#         # print('---------')
#         # print(prediction_batch_confidence)
#         # print('========')
#         prediction_batch_label_id = torch.argmax(prediction_batch, dim=1)
#         # prediction_labels = [model.config.id2label[label_id] for label_id in labels.tolist()]

#         n_samples += len(test_batch_text)
#         n_correct += (prediction_batch_label_id ==
#                       test_batch_label_id).sum().item()

#         for i in range(len(test_batch_text)):
#             if n_samples % 300 == 0:
#                 sample_test_text = tokenizer.decode(
#                     test_batch_text[i], skip_special_tokens=True)
#                 sample_test_label = model.config.id2label[int(
#                     test_batch_label_id[i])]
#                 sample_prediction_label = model.config.id2label[int(
#                     prediction_batch_label_id[i])]

#                 print(f'Prediction :{sample_prediction_label} | Actual: {sample_test_label} | \
#             Prediction Accuracy: ')

#     acc = 100.0 * n_correct/n_samples
#     print(f'Accuracy: {acc}')


Prediction :HOF | Actual: HOF |             Prediction Accuracy: 
Prediction :NOT | Actual: NOT |             Prediction Accuracy: 
Prediction :NOT | Actual: NOT |             Prediction Accuracy: 
Prediction :NOT | Actual: NOT |             Prediction Accuracy: 
Prediction :NOT | Actual: HOF |             Prediction Accuracy: 
Prediction :NOT | Actual: NOT |             Prediction Accuracy: 
Prediction :HOF | Actual: HOF |             Prediction Accuracy: 
Prediction :HOF | Actual: HOF |             Prediction Accuracy: 
Accuracy: 92.99191374663073
