In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

2023-09-18 01:40:54.605699: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class NewsDataset(Dataset):
    def __init__(self, dataset):
        self.labels = dataset['label']
        self.input_ids = dataset['input_ids']
        self.attention_mask = dataset['attention_mask']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]
        return {"label": label, "input_ids":input_ids, "attention_mask":attention_mask}
    

def load_data(trainset_file_path, testset_file_path):
    train_data = pd.read_csv(trainset_file_path, encoding='utf-8')
    test_data = pd.read_csv(testset_file_path, encoding='utf-8')
    
    return train_data, test_data

def preprocessing_data(dataset, tokenizer, max_length):
    concat_entity = []
    for title, body in zip(dataset['Headline'], dataset['Content']):
        total = title + '[SEP]' + body
        concat_entity.append(total)
        
    tokenized_senteneces = tokenizer(
        concat_entity,
        return_tensors = "pt",
        padding = True,
        truncation = True,
        max_length = max_length,
        add_special_tokens = True,
        return_token_type_ids=False,
    )

    input_ids = tokenized_senteneces.input_ids
    attention_mask = tokenized_senteneces.attention_mask
    label = torch.tensor(dataset['Class'])
    
    return {
        "label": label,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }

def get_dataloader(train_path, test_path, tokenizer, MAX_LEN, BATCH_SIZE):
    # 데이터 파일 로드
    train_data, test_data = load_data(train_path, test_path)
    
    # 토크나이징 & 데이터셋 로드
    tokenized_train = NewsDataset(preprocessing_data(train_data, tokenizer, MAX_LEN))
    tokenized_test = NewsDataset(preprocessing_data(test_data, tokenizer, MAX_LEN))
    
    # train, valid split
    generator = torch.Generator().manual_seed(RANDOM_SEED)
    train, valid = random_split(tokenized_train, [0.8, 0.2], generator=generator)
    
    # DataLoader
    train_dataloader = DataLoader(train, sampler=RandomSampler(train), batch_size=BATCH_SIZE, num_workers=8)
    valid_dataloader = DataLoader(valid, sampler=SequentialSampler(valid), batch_size=BATCH_SIZE, num_workers=8)
    test_dataloader = DataLoader(tokenized_test, sampler=SequentialSampler(tokenized_test), batch_size=BATCH_SIZE, num_workers=8)
    
    return train_dataloader, valid_dataloader, test_dataloader

In [3]:
import lightning.pytorch as pl
from collections import OrderedDict
from sklearn.metrics import accuracy_score, precision_score

In [4]:
class Model(pl.LightningModule):

    def __init__(self, MODEL_NAME):
        super(Model, self).__init__()
        model_config = AutoConfig.from_pretrained(MODEL_NAME)
        model_config.num_labels = 2
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
        self.model = model
        
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
                self.model.parameters(),
                lr=2e-5,
                )
        return optimizer

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        output = self.model(
                input_ids,
                token_type_ids=None,
                attention_mask=attention_mask,
                labels=labels
                )
        
        loss = output['loss']
        
        tqdm_dict = {"train_loss": loss}
        output = OrderedDict({
            "loss": loss,
            "progress_bar": tqdm_dict,
            "log": tqdm_dict
            })
        
        self.log('train_loss', loss)

        return output

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        output = self.model(
                input_ids,
                token_type_ids=None,
                attention_mask=attention_mask,
                labels=labels
                )
        loss = output['loss']
        logits = output['logits']
        labels_hat = torch.argmax(logits, dim=1)

        logits = logits.detach().cpu().numpy()
        labels = labels.to('cpu').numpy()

        correct_count = torch.sum(labels == labels_hat)

        acc = accuracy_score(labels, labels_hat)
        precision = precision_score(labels, labels_hat)
        
        if self.on_gpu:
            correct_count = correct_count.cuda(loss.device.index)

        output = OrderedDict({
            "val_loss": loss,
            "correct_count": correct_count,
            "batch_size": len(labels)
            })
        self.log('val_loss', loss)
        self.log('val_accuracy', acc)
        self.log('val_precision', precision)
        return output

    def validation_end(self, outputs):
        val_acc = sum([out["correct_count"] for out in outputs]).float() / sum(out["batch_size"] for out in outputs)
        val_loss = sum([out["val_loss"] for out in outputs]) / len(outputs)
        tqdm_dict = {
                "val_loss": val_loss,
                "val_acc": val_acc,
                }
        result = {"progress_bar": tqdm_dict, "log": tqdm_dict, "val_loss": val_loss}
        return result

    def test_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        output = self.model(
                input_ids,
                token_type_ids=None,
                attention_mask=attention_mask,
                labels=labels
                )
        loss = output['loss']
        logits = output['logits']
        
        labels_hat = torch.argmax(logits, dim=1)

        correct_count = torch.sum(labels == labels_hat)
        
        logits = logits.detach().cpu().numpy()
        labels = labels.to('cpu').numpy()

        correct_count = torch.sum(labels == labels_hat)

        acc = accuracy_score(labels, labels_hat)
        precision = precision_score(labels, labels_hat)

        if self.on_gpu:
            correct_count = correct_count.cuda(loss.device.index)

        output = OrderedDict({
            "test_loss": loss,
            "correct_count": correct_count,
            "batch_size": len(labels)
            })

        return output

    def test_end(self, outputs):
        test_acc = sum([out["correct_count"] for out in outputs]).float() / sum(out["batch_size"] for out in outputs)
        test_loss = sum([out["test_loss"] for out in outputs]) / len(outputs)
        tqdm_dict = {
                "test_loss": test_loss,
                "test_acc": test_acc,
                }
        result = {"progress_bar": tqdm_dict, "log": tqdm_dict}
        return result

In [5]:
model = Model('bert-base-multilingual-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
MAX_LEN = 450
BATCH_SIZE = 8
RANDOM_SEED = 42

In [7]:
train_dataloader, valid_dataloader, test_dataloader = get_dataloader('data_EC_00.csv', 'VL_data_EC.csv', tokenizer, MAX_LEN, BATCH_SIZE)

In [8]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [9]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33m8chatea8[0m ([33mmahimahi[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger

In [11]:
wandb_logger = WandbLogger(project="mahimahi")
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=2,
    verbose=True,
    mode='min'
)
model_checkpoint = ModelCheckpoint(
    dirpath='model/checkpoint',
)

callbacks = [early_stop_callback, model_checkpoint]
trainer = pl.Trainer(logger=wandb_logger,
                     check_val_every_n_epoch=1,
                     max_epochs=10)
trainer.fit(model, train_dataloader, valid_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.420   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

TypeError: sum(): argument 'input' (position 1) must be Tensor, not bool

In [None]:
torch.cuda.empty_cache()