In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train and test splits; both CSV files are read with the same encoding.
df_train, df_test = (
    pd.read_csv(csv_path, encoding="utf-8-sig")
    for csv_path in ("./train.csv", "./test.csv")
)

# Load the pretrained tokenizer, keeping downloaded files under ./cache.
tokenizer = T5Tokenizer.from_pretrained(
    "t5-small",
    cache_dir="./cache",
)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Dataset definition
class TextDataset(Dataset):
    """Map-style dataset that tokenizes (input, output) text pairs.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    each padded/truncated to ``max_length`` tokens.

    Args:
        dataframe: Table with an ``input`` column (source text) and an
            ``output`` column (target text).
        tokenizer: Callable accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors carrying a leading
            batch dimension of 1. Its ``pad_token_id`` attribute, when set,
            is used to mask padding in the labels.
        max_length: Fixed token length for both source and target sequences.
    """

    # Label value that the loss skips; padding positions in the target are
    # rewritten to this so they do not contribute to training.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the underlying table."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string to a fixed-length, batch-of-one encoding."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized example at positional index ``idx``."""
        row = self.data.iloc[idx]  # single positional lookup for both columns
        inputs = self._encode(row['input'])
        targets = self._encode(row['output'])

        # squeeze(0) drops only the batch axis added by return_tensors="pt".
        labels = targets["input_ids"].squeeze(0).clone()
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            # Without this, the model is trained to predict padding tokens.
            labels[labels == pad_id] = self.IGNORE_INDEX

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [4]:
# Wrap the training frame in the dataset and a shuffling batch loader
dataset = TextDataset(dataframe=df_train, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

# Load the pretrained model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Use the GPU when one is available, otherwise the CPU, and move the model there
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [5]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this option;
    # there is no evaluation split, so evaluation stays disabled.
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    # Mixed precision only when a CUDA GPU is present.
    # NOTE(review): T5 checkpoints are often reported to be numerically
    # unstable under fp16 - confirm the loss stays finite, or prefer bf16.
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=None,  # explicitly no evaluation dataset
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class`
    # (needs a transformers release that already accepts this argument).
    processing_class=tokenizer
)

# Fine-tune the model
trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


KeyboardInterrupt: 

In [6]:
# Test-set prediction
def generate_predictions(test_df, model, tokenizer, max_length=128):
    """Generate one decoded output string per row of ``test_df["input"]``.

    Args:
        test_df: Table with an ``input`` column holding the source texts.
        model: Model exposing ``eval()`` and ``generate(**inputs, max_length=...)``.
        tokenizer: Tokenizer used to encode each text (its result must support
            ``.to(device)``) and to ``decode`` the generated ids.
        max_length: Limit applied both when truncating the input and when
            generating the output. Defaults to 128.

    Returns:
        list[str]: Decoded predictions, in the same order as the input rows.
    """
    model.eval()

    # Send inputs to wherever the model actually lives; guessing from CUDA
    # availability breaks when the model was left on the CPU.
    try:
        device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    predictions = []
    # Inference only: no gradients need to be tracked.
    with torch.no_grad():
        for text in test_df["input"]:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)
            output = model.generate(**inputs, max_length=max_length)
            # output[0]: each call encodes and generates for a single text.
            predictions.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return predictions


In [7]:
# Predict on the test set, attach the results as the "output" column, and save
predictions = generate_predictions(df_test, model, tokenizer)
df_test["output"] = predictions
df_test.to_csv(
    "./submission.csv",
    index=False,
    encoding="utf-8-sig",
)
