In [None]:
# Reference tutorial:
# https://velog.io/@na2na8/ELECTRA%EB%A1%9C-Binary-Classification#electra-with-pytorch-lightning

In [1]:
!pip install transformers --quiet

[K     |████████████████████████████████| 4.2 MB 12.4 MB/s 
[K     |████████████████████████████████| 84 kB 1.7 MB/s 
[K     |████████████████████████████████| 596 kB 76.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 47.8 MB/s 
[?25h

In [2]:
!pip install git+https://github.com/PyTorchLightning/pytorch-lightning --quiet
import pytorch_lightning as pl
print(pl.__version__)

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 409 kB 14.4 MB/s 
[K     |████████████████████████████████| 140 kB 71.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 67.5 MB/s 
[K     |████████████████████████████████| 144 kB 73.3 MB/s 
[K     |████████████████████████████████| 271 kB 60.5 MB/s 
[K     |████████████████████████████████| 94 kB 3.7 MB/s 
[?25h  Building wheel for pytorch-lightning (PEP 517) ... [?25l[?25hdone
1.7.0dev


In [3]:
import os
import re

import numpy as np
import pandas as pd

import torch
import torchmetrics
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import loggers as pl_loggers
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import transformers
from transformers import ElectraForSequenceClassification, ElectraTokenizer, AdamW

device = torch.device("cuda")

In [4]:
!git clone https://github.com/AyushiM1102/Electra_classification_fake_vs_real_news.git

Cloning into 'Electra_classification_fake_vs_real_news'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 58 (delta 13), reused 19 (delta 11), pack-reused 35[K
Unpacking objects: 100% (58/58), done.


In [5]:
!unzip /content/Electra_classification_fake_vs_real_news/data/WELFake_Dataset.csv.zip -d /content/Electra_classification_fake_vs_real_news/dataset

Archive:  /content/Electra_classification_fake_vs_real_news/data/WELFake_Dataset.csv.zip
  inflating: /content/Electra_classification_fake_vs_real_news/dataset/WELFake_Dataset.csv  
  inflating: /content/Electra_classification_fake_vs_real_news/dataset/__MACOSX/._WELFake_Dataset.csv  


In [10]:
datapath = f'/content/Electra_classification_fake_vs_real_news/dataset/WELFake_Dataset.csv'
df = pd.read_csv(datapath, sep=',')
df = df.dropna(axis=0)


In [11]:
df['label'].value_counts()

1    36509
0    35028
Name: label, dtype: int64

In [12]:
df.notna().sum()

Unnamed: 0    71537
title         71537
text          71537
label         71537
dtype: int64

In [14]:
class ElectraClassificationDataset(Dataset) :
    def __init__(self, path, sep, doc_col, label_col, max_length, num_workers=1, labels_dict=None) :

        self.tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
        self.max_length = max_length
        self.doc_col = doc_col
        self.label_col = label_col

        # labels, ex : {True : 1, False : 0}
        self.labels_dict = labels_dict

        # dataset
        df = pd.read_csv(path, sep=sep)
        df = df.dropna(axis=0)
        df.drop_duplicates(subset=[self.doc_col], inplace=True)
        self.dataset = df

    def __len__(self) :
        return len(self.dataset)
    
    # Clean text
    def cleanse(self, text) :
        url_pattern = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        processed = url_pattern.sub(' ', text)
        processed = processed.replace('#', '')
        processed = processed.replace('@', '')
        processed = processed.strip()

        return processed

    def __getitem__(self, idx) :
        document = self.cleanse(self.dataset[self.doc_col].iloc[idx])
        #print(document)
        inputs = self.tokenizer(
            document,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            add_special_tokens=True
        )

        if self.labels_dict :
            label = self.labels_dict[self.dataset[self.label_col].iloc[idx]]
        else :
            label = self.dataset[self.label_col].iloc[idx]

        return {
            'input_ids' : inputs['input_ids'][0],
            'attention_mask' : inputs['attention_mask'][0],
            'label' : int(label)
        }

In [15]:
class ElectraClassificationDataModule(pl.LightningDataModule) :
    def __init__(self, train_path, valid_path, max_length, batch_size, sep,
                doc_col, label_col, num_workers=1, labels_dict=None) :
        super().__init__()
        self.batch_size = batch_size
        self.train_path = train_path
        self.valid_path = valid_path
        self.max_length = max_length
        self.doc_col = doc_col
        self.label_col = label_col
        self.sep = sep
        self.num_workers = num_workers
        self.labels_dict = labels_dict

    def setup(self, stage=None) :
        self.set_train = ElectraClassificationDataset(self.train_path, sep=self.sep,
                                            doc_col=self.doc_col, label_col=self.label_col,
                                            max_length = self.max_length, labels_dict=self.labels_dict)
        self.set_valid = ElectraClassificationDataset(self.valid_path, sep=self.sep,
                                            doc_col=self.doc_col, label_col=self.label_col,
                                            max_length = self.max_length, labels_dict=self.labels_dict)

    def train_dataloader(self) :
        train = DataLoader(self.set_train, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)
        return train
    
    def val_dataloader(self) :
        val = DataLoader(self.set_valid, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
        return val
    
    def test_dataloader(self) :
        test = DataLoader(self.set_valid, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
        return test

In [16]:
electra = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator")


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [25]:
train_size = int(0.8 * len(df))
val_size = int(0.5*(len(df) - train_size))
test_size = int(val_size)
train_size, val_size, test_size

(57229, 7154, 7154)

In [32]:
train_dataset, val_dataset, test_dataset = df[:train_size],df[train_size:train_size+val_size],df[train_size+val_size:train_size+val_size+test_size]

In [33]:
len(train_dataset),len(val_dataset),len(test_dataset)

(57229, 7154, 7154)

In [50]:
train_dataset

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...,...
57696,57696,WHY FINAL “Sunday Night Football” Game Was Can...,The NFL announced Tuesday that the final Sund...,1
57697,57697,Catalan police say Sagrada Familia bomb scare ...,MADRID (Reuters) - Catalan police declared a f...,0
57698,57698,WATCH KELLYANNE CONWAY vs Total Jerk Chris Cuo...,https://www.youtube.com/watch?v=6VN1maBEKIk,1
57699,57699,Britain summons North Korean ambassador over m...,LONDON (Reuters) - The British Foreign Office ...,0


In [43]:
train_dataset.to_csv('/content/Electra_classification_fake_vs_real_news/dataset/train.csv', index = False)
val_dataset.to_csv('/content/Electra_classification_fake_vs_real_news/dataset/val.csv', index = False)

In [56]:

# Check parameters
training = f'/content/Electra_classification_fake_vs_real_news/dataset/train.csv'
validating = f'/content/Electra_classification_fake_vs_real_news/dataset/val.csv'
dm = ElectraClassificationDataModule(batch_size=8, train_path=training, valid_path=validating,
                                    max_length=256, sep=',', doc_col='text', label_col='label', num_workers=1)

dm.setup()

t = dm.train_dataloader()
print(t)

# for idx, data in enumerate(t) :
#     print('labels:',idx, data['input_ids'].shape, data['attention_mask'].shape, data['label'].shape)
#     # print(idx, data['input_ids'], data['attention_mask'], data['label'])

# Concatanate the batches ********* PENDING ***********
# idx, data = enumerate(t)

v = dm.val_dataloader()

#for idx, data in enumerate(v) :
    #print(idx, data['input_ids'].shape, data['attention_mask'].shape, data['label'].shape)

# print("This are the ids")
# print(data['input_ids'])
# print(data['input_ids'].shape)

# print("This are the masks")
# print(data['attention_mask'])
# print(data['attention_mask'].shape)

# print("This are the labels")
# print(data['label'])
# print(data['label'].shape)

output = electra.forward(data['input_ids'], attention_mask=data['attention_mask'], labels=data['label'].view([-1,1]))


print("This is the loss")
print(output.loss)
# print(output.loss.shape)
print(output.logits)
print(output.logits.shape)

softmax = nn.functional.softmax(output.logits, dim=1)
print('softmax', softmax)
pred = softmax.argmax(dim=1)
print('pred', pred)

y_true = data['label'].tolist()
y_pred = pred.tolist()

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f'acc : {acc}, prec : {prec}, rec : {rec}, f1 : {f1}')


<torch.utils.data.dataloader.DataLoader object at 0x7fb8c7db2d50>
This is the loss
tensor(0.7055, grad_fn=<NllLossBackward0>)
tensor([[-0.0549,  0.0007],
        [-0.0691, -0.0081],
        [-0.0635, -0.0011],
        [-0.0691, -0.0041],
        [-0.0892,  0.0011],
        [-0.0668, -0.0079],
        [-0.0604, -0.0099],
        [-0.0625,  0.0134]], grad_fn=<AddmmBackward0>)
torch.Size([8, 2])
softmax tensor([[0.4861, 0.5139],
        [0.4847, 0.5153],
        [0.4844, 0.5156],
        [0.4838, 0.5162],
        [0.4774, 0.5226],
        [0.4853, 0.5147],
        [0.4874, 0.5126],
        [0.4810, 0.5190]], grad_fn=<SoftmaxBackward0>)
pred tensor([1, 1, 1, 1, 1, 1, 1, 1])
acc : 0.375, prec : 0.375, rec : 1.0, f1 : 0.5454545454545454


In [None]:
# https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d
# https://huggingface.co/docs/transformers/v4.15.0/en/model_doc/electra#transformers.ElectraForSequenceClassification

In [None]:
class ElectraClassification(pl.LightningModule) :
    def __init__(self, learning_rate) :
        super().__init__()
        self.learning_rate = learning_rate
        self.save_hyperparameters()
        self.electra = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator")

        self.metric_acc = torchmetrics.Accuracy()
        self.metric_f1 = torchmetrics.F1Score(num_classes=2)
        self.metric_rec = torchmetrics.Recall(num_classes=2)
        self.metric_pre = torchmetrics.Precision(num_classes=2)

        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None) :
        output = self.electra(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output

    def training_step(self, batch, batch_idx) :
        '''
        ##########################################################
        electra forward input shape information
        * input_ids.shape (batch_size, max_length)
        * attention_mask.shape (batch_size, max_length)
        * label.shape (batch_size,)
        ##########################################################
        '''

        # change label shape (list -> torch.Tensor((batch_size, 1)))
        label = batch['label'].view([-1,1])

        output = self(input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device),
                        labels=label.to(device))
        '''
        ##########################################################
        electra forward output shape information
        * loss.shape (1,)
        * logits.shape (batch_size, config.num_labels=2)
        '''
        logits = output.logits

        loss = output.loss
        # loss = self.loss_func(logits.to(device), batch['label'].to(device))

        softmax = nn.functional.softmax(logits, dim=1)
        preds = softmax.argmax(dim=1)

        self.log("train_loss", loss, prog_bar=True)
        
        return {
            'loss' : loss,
            'pred' : preds,
            'label' : batch['label']
        }

    def training_epoch_end(self, outputs, state='train') :
        y_true = []
        y_pred = []
        for i in outputs :
            y_true += i['label'].tolist()
            y_pred += i['pred'].tolist()

        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        # self.log(state+'_acc', acc, on_epoch=True, prog_bar=True)
        # self.log(state+'_precision', prec, on_epoch=True, prog_bar=True)
        # self.log(state+'_recall', rec, on_epoch=True, prog_bar=True)
        # self.log(state+'_f1', f1, on_epoch=True, prog_bar=True)
        print(f'[Epoch {self.trainer.current_epoch} {state.upper()}] Acc: {acc}, Prec: {prec}, Rec: {rec}, F1: {f1}')

    def validation_step(self, batch, batch_idx) :
        '''
        ##########################################################
        electra forward input shape information
        * input_ids.shape (batch_size, max_length)
        * attention_mask.shape (batch_size, max_length)
        ##########################################################
        '''
        output = self(input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device))
        logits = output.logits
        preds = nn.functional.softmax(logits, dim=1).argmax(dim=1)

        labels = batch['label']
        accuracy = self.metric_acc(preds, labels)
        f1 = self.metric_f1(preds, labels)
        recall = self.metric_rec(preds, labels)
        precision = self.metric_pre(preds, labels)
        self.log('val_accuracy', accuracy, on_epoch=True, prog_bar=True)
        self.log('val_f1', f1, on_epoch=True, prog_bar=True)
        self.log('val_recall', recall, on_epoch=True, prog_bar=True)
        self.log('val_precision', precision, on_epoch=True, prog_bar=True)

        return {
            'accuracy' : accuracy,
            'f1' : f1,
            'recall' : recall,
            'precision' : precision
        }

    def validation_epoch_end(self, outputs) :
        val_acc = torch.stack([i['accuracy'] for i in outputs]).mean()
        val_f1 = torch.stack([i['f1'] for i in outputs]).mean()
        val_rec = torch.stack([i['recall'] for i in outputs]).mean()
        val_pre = torch.stack([i['precision'] for i in outputs]).mean()
        # self.log('val_f1', val_f1, on_epoch=True, prog_bar=True)
        # self.log('val_acc', val_acc, on_epoch=True, prog_bar=True)
        print(f'val_accuracy : {val_acc}, val_f1 : {val_f1}, val_recall : {val_rec}, val_precision : {val_pre}')
        
    
    def configure_optimizers(self) :
        optimizer = torch.optim.AdamW(self.electra.parameters(), lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
        
        return {
            'optimizer' : optimizer,
            'lr_scheduler' : lr_scheduler
        }

In [None]:
# Main to train the model
model = ElectraClassification(learning_rate=0.0001)

dm = ElectraClassificationDataModule(batch_size=8, train_path='/content/train.csv', valid_path='/content/val.csv',
                                max_length=256, sep=',', doc_col='Tweet', label_col='is_retweet', num_workers=1)

checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_accuracy',
                                                dirpath='./sample_electra_binary_nsmc_chpt',
                                                filename='ELECTRA/{epoch:02d}-{val_accuracy:.3f}',
                                                verbose=True,
                                                save_last=True,
                                                mode='max',
                                                save_top_k=-1,
                                                )

tb_logger = pl_loggers.TensorBoardLogger(os.path.join('./sample_electra_binary_nsmc_chpt', 'tb_logs'))

lr_logger = pl.callbacks.LearningRateMonitor()

trainer = pl.Trainer(
    default_root_dir='./sample_electra_binary_nsmc_chpt/checkpoints',
    logger = tb_logger,
    callbacks = [checkpoint_callback, lr_logger],
    max_epochs=3,
    gpus=1
)

trainer.fit(model, dm)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

FileNotFoundError: ignored

In [None]:
def infer(x, path) :
    model = ElectraClassification.load_from_checkpoint(path)
    tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
    
    # Clean the input text
    url_pattern = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    processed = url_pattern.sub(' ', x)
    processed = processed.replace('#', '')
    processed = processed.replace('@', '')
    processed = processed.strip()

    tokenized = tokenizer(processed, return_tensors='pt')
    output = model(tokenized.input_ids, tokenized.attention_mask)
    
    return nn.functional.softmax(output.logits, dim=-1)

In [None]:
text = 'Write some text to test classification'
print(infer(text,'/content/sample_electra_binary_nsmc_chpt/ELECTRA/epoch=02-val_accuracy=0.739.ckpt'))

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

tensor([[0.7317, 0.2683]], grad_fn=<SoftmaxBackward0>)
