# Setting

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes ``(text, label)`` pairs on access.

    The JSON file is read once at construction time; tokenization happens
    lazily, one sample at a time, inside ``__getitem__``.

    Args:
        data_path: Path to a UTF-8 JSON file containing an iterable of
            objects that each have a ``'text'`` and a ``'label'`` key.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data_path, tokenizer, max_length):
        self.data = self.load_data(data_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_data(self, data_path):
        """Read the JSON file and return a list of ``(text, label)`` tuples."""
        with open(data_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)

        pairs = []
        for record in records:
            pairs.append((record['text'], record['label']))
        return pairs

    def __len__(self):
        """Return the number of loaded ``(text, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors in a dict.

        Returns:
            A dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'label'`` (the label as a ``torch.long`` tensor).
        """
        sample_text, sample_label = self.data[idx]

        tokenizer_kwargs = {
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': "pt",
        }
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)

        # squeeze(0) drops the leading dimension -- presumably the size-1
        # batch axis added by return_tensors="pt"; confirm for the tokenizer
        # in use.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        label_tensor = torch.tensor(sample_label, dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label_tensor,
        }

In [1]:
def text_loader(data_path, tokenizer_name, batch_size, max_length, num_workers=4, shuffle=True):
    """Build a ``DataLoader`` over a tokenized ``TextDataset``.

    Args:
        data_path: Path to the JSON data file, passed to ``TextDataset``.
        tokenizer_name: Identifier passed to
            ``AutoTokenizer.from_pretrained``.
        batch_size: Number of samples per batch.
        max_length: Tokenizer ``max_length``, passed to ``TextDataset``.
        num_workers: Number of ``DataLoader`` worker processes.
        shuffle: Whether the loader reshuffles the data. Defaults to
            ``True`` (the previous hard-coded behavior); pass ``False`` to
            iterate in file order, e.g. for evaluation.

    Returns:
        A ``DataLoader`` wrapping the constructed ``TextDataset``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset = TextDataset(data_path, tokenizer, max_length)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
# Configuration values passed to text_loader below.
DATASET = 'text_data.json'  # path to the text data
TOKENIZER_NAME = 'bert-base-uncased'  # tokenizer to load; TODO(review): confirm this is the right tokenizer to use
BATCH_SIZE = 64  # samples per batch
MAX_LENGTH = 128  # tokenizer max_length (sequences are padded/truncated to this)

In [None]:
# Build the training loader from the configuration constants.
train_text_loader = text_loader(
    data_path=DATASET,
    tokenizer_name=TOKENIZER_NAME,
    batch_size=BATCH_SIZE,
    max_length=MAX_LENGTH,
)