# Dataset and DataModule V01

## 0. Imports

In [1]:
%load_ext lab_black

In [2]:
import sys

sys.path.append("..")

In [3]:
import os
import numpy as np
import pandas as pd

from tqdm import tqdm

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import transformers

from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer

from sklearn.model_selection import train_test_split

## 1. Dataset

In [5]:
class SummaryDataset(Dataset):
    """Map-style dataset that turns document/summary rows into padded id arrays.

    Each row of ``data`` must provide a ``"total"`` column (the source
    document). For the ``"train"`` and ``"valid"`` phases it must also
    provide a ``"summary"`` column; for any other phase the summary is never
    read, so inference data without that column works.

    Args:
        data: Table holding one example per row.
        tokenizer: Tokenizer exposing ``tokenize``, ``encode``,
            ``convert_tokens_to_ids``, ``bos_token``, ``eos_token``,
            ``eos_token_id`` and ``pad_token_id``.
        max_seq_len: Fixed length every returned sequence is padded or
            truncated to.
        phase: ``"train"`` / ``"valid"`` return encoder inputs, decoder
            inputs and labels; any other value returns encoder inputs only.
        ignore_index: Value used to pad ``labels`` up to ``max_seq_len``.
            The default is kept for backward compatibility.
            NOTE(review): ``torch.nn.CrossEntropyLoss`` ignores -100 by
            default; confirm the training code ignores this value, or pass
            ``ignore_index=-100``.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "transformers.PreTrainedTokenizerFast",
        max_seq_len: int = 512,
        phase: str = "train",
        ignore_index: int = -999999,
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.phase = phase
        self.ignore_index = ignore_index

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the example at positional index ``idx`` as a dict of arrays.

        Every array has length ``max_seq_len``. Id arrays are ``int64`` (a
        fixed width, unlike the platform-dependent ``np.int_``) and attention
        masks are ``float32``.
        """
        item = self.data.iloc[idx]
        # TODO: adjust the column names to match the data
        encoder_input_id, encoder_attention_mask = self.encode_and_pad(item["total"])
        sample = {
            "input_ids": np.array(encoder_input_id, dtype=np.int64),
            "attention_mask": np.array(encoder_attention_mask, dtype=np.float32),
        }
        if self.phase not in ("train", "valid"):
            return sample

        # The summary is only looked up when targets are needed, so test
        # data lacking a "summary" column does not raise KeyError.
        summary = item["summary"]
        decoder_input_id, decoder_attention_mask = self.encode_and_pad(summary)
        sample["decoder_input_ids"] = np.array(decoder_input_id, dtype=np.int64)
        sample["decoder_attention_mask"] = np.array(
            decoder_attention_mask, dtype=np.float32
        )
        sample["labels"] = np.array(self._encode_labels(summary), dtype=np.int64)
        return sample

    def _encode_labels(self, summary: str):
        """Encode ``summary`` plus EOS, padded with ``ignore_index`` or truncated."""
        eos_id = self.tokenizer.eos_token_id
        output_id = self.tokenizer.encode(summary) + [eos_id]
        if len(output_id) < self.max_seq_len:
            pad_len = self.max_seq_len - len(output_id)
            return output_id + [self.ignore_index] * pad_len
        # Too long (or exactly full): cut and force EOS into the last slot.
        return output_id[: self.max_seq_len - 1] + [eos_id]

    def encode_and_pad(self, text: str):
        """Tokenize ``text`` with BOS/EOS and fit it to ``max_seq_len``.

        Returns:
            A ``(input_id, attention_mask)`` pair of lists, both of length
            ``max_seq_len``. The mask is 1 over real tokens and 0 over
            padding.
        """
        tokens = (
            [self.tokenizer.bos_token]
            + self.tokenizer.tokenize(text)
            + [self.tokenizer.eos_token]
        )
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)

        if len(input_id) < self.max_seq_len:
            pad_len = self.max_seq_len - len(input_id)
            input_id += [self.tokenizer.pad_token_id] * pad_len
            attention_mask += [0] * pad_len
        else:
            # Truncate while keeping EOS as the final token.
            input_id = input_id[: self.max_seq_len - 1] + [self.tokenizer.eos_token_id]
            attention_mask = attention_mask[: self.max_seq_len]
        return input_id, attention_mask

In [7]:
# Settings for a quick manual check of SummaryDataset.
train_path, tokenizer_path = "../data/train.csv", "../kobart"
max_seq_len, phase = 512, "train"

# Load the raw table and the tokenizer, then wrap both in the dataset.
train = pd.read_csv(train_path)
tokenizer = get_kobart_tokenizer(tokenizer_path)
train_dataset = SummaryDataset(train, tokenizer, max_seq_len, phase)

using cached model


In [8]:
# Fetch a single example to check that __getitem__ runs end to end.
# Index 100 requires the training table to have at least 101 rows.
sample = train_dataset[100]
# sample  (uncomment to display the returned dict)

## 2. DataModule

In [18]:
class SummaryDataModule(pl.LightningDataModule):
    """LightningDataModule that builds train/valid/test loaders from CSV files.

    The training CSV is split into train and validation parts; the test CSV
    backs the test loader. Both files must contain every column that
    ``SummaryDataset.__getitem__`` reads for the corresponding phase.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        tokenizer_path: Location passed to ``get_kobart_tokenizer``.
        max_seq_len: Fixed sequence length handed to ``SummaryDataset``.
        valid_size: Fraction of the training rows held out for validation.
        batch_size: Batch size used by all three loaders.
        num_workers: Number of worker processes used by all three loaders.
    """

    def __init__(
        self,
        train_path: str,
        test_path: str,
        tokenizer_path: str,
        max_seq_len: int,
        valid_size: float = 0.2,
        batch_size: int = 8,
        num_workers: int = 4,
    ):
        super().__init__()

        self.train_path = train_path
        self.test_path = test_path
        self.tokenizer_path = tokenizer_path
        self.max_seq_len = max_seq_len
        self.valid_size = valid_size
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Load the CSVs and tokenizer and create the three datasets.

        ``stage`` is accepted for the Lightning hook signature but not used:
        all three datasets are rebuilt on every call. No ``random_state`` is
        passed to ``train_test_split``, so the split is not seeded here.
        """
        # load data & tokenizer
        train = pd.read_csv(self.train_path)
        test = pd.read_csv(self.test_path)
        tokenizer = get_kobart_tokenizer(self.tokenizer_path)

        # split train/valid
        train, valid = train_test_split(train, test_size=self.valid_size, shuffle=True)

        # train/valid/test Dataset
        self.trainset = SummaryDataset(
            train, tokenizer, self.max_seq_len, phase="train"
        )
        self.validset = SummaryDataset(
            valid, tokenizer, self.max_seq_len, phase="valid"
        )
        # The test set is built from the test CSV, not the validation split.
        self.testset = SummaryDataset(test, tokenizer, self.max_seq_len, phase="test")

    def _make_loader(self, dataset, shuffle: bool):
        """Build a DataLoader with the module's batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._make_loader(self.trainset, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader; row order is preserved."""
        return self._make_loader(self.validset, shuffle=False)

    def test_dataloader(self):
        """Return the test loader; row order is preserved so outputs line up with the CSV."""
        return self._make_loader(self.testset, shuffle=False)

In [19]:
# Paths and hyper-parameters for a small end-to-end check of the data module.
train_path, test_path = "../data/train.csv", "../data/test.csv"
tokenizer_path = "../kobart"
max_seq_len, valid_size = 512, 0.2
batch_size, num_workers = 2, 4

data_module = SummaryDataModule(
    train_path,
    test_path,
    tokenizer_path,
    max_seq_len,
    valid_size=valid_size,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [20]:
# Build the datasets, then pull out one loader per phase.
data_module.setup()
train_loader, valid_loader, test_loader = (
    data_module.train_dataloader(),
    data_module.val_dataloader(),
    data_module.test_dataloader(),
)

using cached model


In [25]:
# Pull just the first batch from the test loader for inspection.
batch = next(iter(test_loader))

In [26]:
# Show which fields the fetched batch contains.
batch.keys()

dict_keys(['input_ids', 'attention_mask'])