In [None]:
# Mount Google Drive so later cells can read the CSV files and write
# checkpoints / submissions under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')


In [None]:
pip install datasets


In [None]:
import pandas as pd

# Location of the training CSV on the mounted Google Drive.
csv_file_path = '/content/drive/MyDrive/data_file/train.csv'

# Read the full training set into a DataFrame.
data = pd.read_csv(csv_file_path)

# Show the first rows as a quick sanity check of the load.
head_rows = data.head()
print(head_rows)


In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Hub identifier of the pretrained checkpoint used throughout the notebook.
model_name = "google/byt5-base"

# Fetch the tokenizer and the pretrained seq2seq model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


In [None]:
# Hold out 10% of the tokenized data for evaluation using the Hugging Face
# Datasets `train_test_split` method; the fixed seed keeps the split repeatable.
# NOTE(review): `tokenized_datasets` is not defined in any cell visible here —
# confirm the tokenization cell exists and runs before this one.
dataset = tokenized_datasets  # the full dataset
split = dataset.train_test_split(test_size=0.1, seed=42)

# dataset1 is the training portion, dataset2 the held-out portion.
dataset1, dataset2 = split['train'], split['test']



In [None]:
import torch
from transformers import TrainerCallback, Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, AutoConfig

model_name = "google/byt5-base"

# Dropout has to be chosen when the model is constructed: T5 reads
# `config.dropout_rate` once while building its layers, so assigning to
# `model.config` after `from_pretrained` leaves the loaded model unchanged.
# Passing the value as a keyword argument overrides the config attribute
# before the layers are created.
# `dropout_rate` is the single T5 setting that governs its dropout layers;
# `attention_probs_dropout_prob` and `activation_dropout` are not T5 config
# fields, so they are no longer set.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, dropout_rate=0.4)
print(model.config)



class CustomLoggerCallback(TrainerCallback):
    """Print a one-line summary each time the Trainer emits a log record.

    A metric that is absent from a record (for example a record that carries
    only ``eval_loss``) is printed as ``N/A`` rather than as a made-up ``0.0``,
    so the output never shows a loss, learning rate or gradient norm that was
    not actually reported.
    """

    @staticmethod
    def _fmt(value, spec=""):
        """Return ``value`` formatted as a float with ``spec``, or "N/A" if it is None."""
        if value is None:
            return "N/A"
        return format(float(value), spec)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step, epoch and whichever metrics are present in ``logs``.

        Does nothing when ``logs`` is empty or None.
        """
        if not logs:
            return

        # state.epoch may be unset; _fmt turns None into "N/A" instead of
        # letting the float format spec raise.
        epoch = self._fmt(state.epoch, ".2f")
        loss = self._fmt(logs.get("loss"), ".4f")
        eval_loss = self._fmt(logs.get("eval_loss"))
        lr = self._fmt(logs.get("learning_rate"), ".6f")
        grad_norm = self._fmt(logs.get("grad_norm"), ".4f")

        print(f"Step {state.global_step} | Epoch {epoch} | Loss: {loss} | Eval Loss: {eval_loss} | LR: {lr} | Grad Norm: {grad_norm}")




# Drive folder that receives the checkpoints and the final model of this run.
new_output_dir = "/content/drive/MyDrive/byt5base_dropout"

# Hyperparameters for the dropout run at the lower learning rate (5e-5).
# NOTE(review): `evaluation_strategy` has been renamed in newer transformers
# releases — confirm against the installed version.
training_args = TrainingArguments(
    # Where and how often to evaluate / checkpoint.
    output_dir=new_output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=7,
    learning_rate=5e-5,
    weight_decay=0.02,
    max_grad_norm=1.0,
    # Batch sizes; two accumulation steps per optimizer update.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Mixed precision.
    bf16=True,
    # Logging.
    logging_steps=100,
    log_level="info",
    logging_dir="./logs",
)


# `os` is needed for the path join below and is not imported by any earlier
# cell, so import it here to keep this cell runnable on a fresh kernel.
import os

# Wire the model, the train/eval splits and the console logger into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset1,
    eval_dataset=dataset2,
    callbacks=[CustomLoggerCallback()]
)

# Run training, then persist the final model and the trainer state
# to the run's output folder.
trainer.train()
trainer.save_model(new_output_dir)
trainer.state.save_to_json(os.path.join(new_output_dir, "trainer_state.json"))


dropout을 적용시킨 후 learning rate(lr)도 올리기 (5e-5 → 2e-4)

In [None]:
import torch
from transformers import TrainerCallback, Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, AutoConfig

model_name = "google/byt5-base"

# Dropout has to be chosen when the model is constructed: T5 reads
# `config.dropout_rate` once while building its layers, so assigning to
# `model.config` after `from_pretrained` leaves the loaded model unchanged.
# Passing the value as a keyword argument overrides the config attribute
# before the layers are created.
# `dropout_rate` is the single T5 setting that governs its dropout layers;
# `attention_probs_dropout_prob` and `activation_dropout` are not T5 config
# fields, so they are no longer set.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, dropout_rate=0.4)
print(model.config)



class CustomLoggerCallback(TrainerCallback):
    """Print a one-line summary each time the Trainer emits a log record.

    A metric that is absent from a record (for example a record that carries
    only ``eval_loss``) is printed as ``N/A`` rather than as a made-up ``0.0``,
    so the output never shows a loss, learning rate or gradient norm that was
    not actually reported.
    """

    @staticmethod
    def _fmt(value, spec=""):
        """Return ``value`` formatted as a float with ``spec``, or "N/A" if it is None."""
        if value is None:
            return "N/A"
        return format(float(value), spec)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step, epoch and whichever metrics are present in ``logs``.

        Does nothing when ``logs`` is empty or None.
        """
        if not logs:
            return

        # state.epoch may be unset; _fmt turns None into "N/A" instead of
        # letting the float format spec raise.
        epoch = self._fmt(state.epoch, ".2f")
        loss = self._fmt(logs.get("loss"), ".4f")
        eval_loss = self._fmt(logs.get("eval_loss"))
        lr = self._fmt(logs.get("learning_rate"), ".6f")
        grad_norm = self._fmt(logs.get("grad_norm"), ".4f")

        print(f"Step {state.global_step} | Epoch {epoch} | Loss: {loss} | Eval Loss: {eval_loss} | LR: {lr} | Grad Norm: {grad_norm}")




# Drive folder that receives the checkpoints and the final model of this run.
new_output_dir = "/content/drive/MyDrive/byt5base_dropout3"

# Hyperparameters for the dropout run at the higher learning rate (2e-4).
# NOTE(review): `evaluation_strategy` has been renamed in newer transformers
# releases — confirm against the installed version.
training_args = TrainingArguments(
    # Where and how often to evaluate / checkpoint.
    output_dir=new_output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=7,
    learning_rate=2e-4,
    weight_decay=0.02,
    max_grad_norm=1.0,
    # Batch sizes; two accumulation steps per optimizer update.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Mixed precision.
    bf16=True,
    # Logging.
    logging_steps=100,
    log_level="info",
    logging_dir="./logs",
)


# `os` is needed for the path join below and is not imported by any earlier
# cell, so import it here to keep this cell runnable on a fresh kernel.
import os

# Wire the model, the train/eval splits and the console logger into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset1,
    eval_dataset=dataset2,
    callbacks=[CustomLoggerCallback()]
)

# Run training, then persist the final model and the trainer state
# to the new output folder.
trainer.train()
trainer.save_model(new_output_dir)
trainer.state.save_to_json(os.path.join(new_output_dir, "trainer_state.json"))


In [None]:
import torch
from transformers import T5ForConditionalGeneration, ByT5Tokenizer
import pandas as pd
import os

# Enable cuDNN benchmark mode and select the GPU when one is available.
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the test set, failing early if the file is not on Drive.
test_data_path = "/content/drive/MyDrive/data_file/test.csv"
if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"파일을 찾을 수 없습니다: {test_data_path}")

df_test = pd.read_csv(test_data_path)

# Both the text column and the row identifier are required downstream.
missing_columns = {"input", "ID"} - set(df_test.columns)
if missing_columns:
    raise ValueError("`test.csv` 파일에 `input` 또는 `ID` 컬럼이 없습니다.")

# Load the fine-tuned checkpoint, move it to the selected device and cast
# its weights to bfloat16. The tokenizer comes from the base checkpoint.
model_path = "/content/drive/MyDrive/byt5base_dropout3/checkpoint-12670"
model = T5ForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)
model = model.to(torch.bfloat16)
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-base")

# Wrap the model with torch.compile and switch to evaluation mode.
model = torch.compile(model)
model.eval()

# Number of test rows passed to each generate() call.
batch_size = 8
total_size = len(df_test)

# Decoded predictions, appended in the same row order as df_test.
predictions = []

# Process the test set batch by batch with beam search.
for i in range(0, total_size, batch_size):
    batch_texts = df_test["input"].iloc[i: i + batch_size].tolist()

    inputs = tokenizer(
        batch_texts, return_tensors="pt", padding="longest", truncation=True, max_length=2096
    ).to(device)

    # Autocast follows the device actually selected, instead of always
    # naming "cuda" even when the notebook fell back to the CPU.
    with torch.inference_mode(), torch.autocast(device_type=device.type, dtype=torch.bfloat16):
        outputs = model.generate(
            **inputs,
            num_beams=5,
            early_stopping=True,
            length_penalty=1.0,
            # Cap the output at this batch's padded input length.
            max_length=inputs["input_ids"].shape[1],
        )

    # Decode each generated sequence and collect the text.
    batch_results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    predictions.extend(batch_results)

    # Report progress every 10 batches and on the last batch. The processed
    # count is clamped so a partial final batch never reports more than 100%.
    processed = min(i + batch_size, total_size)
    progress = processed / total_size * 100
    if i % (batch_size * 10) == 0 or processed >= total_size:
        print(f"🚀 Progress: {progress:.2f}% completed.")

# Attach the predictions to the test frame and write the two-column
# (ID, output) submission CSV to Drive.
submission_path = "/content/drive/MyDrive/submission11.csv"
df_test["output"] = predictions
submission = df_test[["ID", "output"]]
submission.to_csv(submission_path, index=False, encoding="utf-8")

print(f"✅ Submission file saved successfully: {submission_path}")
