In [None]:
import os
import pandas as pd
import numpy as np
import torch
import warnings
import gc
import csv
import random
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback 
)
from datasets import Dataset

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# =========================
# [Configuration variables]
# =========================
BASE_DIR = '/'  # parent directory of OUTPUT_DIR
DATA_DIR = '/'  # directory searched for the train/valid/test CSV files
MODEL_NAME = "klue/roberta-small"  # checkpoint id passed to from_pretrained()
OUTPUT_DIR = os.path.join(BASE_DIR, 'klue_roberta_small_result')  # checkpoints, saved model and prediction CSV
TRAIN_FILE_NAME = 'train_fixed.csv'
VAL_FILE_NAME = 'valid_fixed.csv'
TEST_FILE_NAME = 'local_test_fixed.csv'

MAX_LEN = 512  # tokenizer truncation length (max_length)
BATCH_SIZE = 128  # per-device batch size for both training and evaluation
EPOCHS = 10  # num_train_epochs
LEARNING_RATE = 5e-5
PATIENCE = 3  # early_stopping_patience given to EarlyStoppingCallback
SEED = 42  # used by set_seeds() and TrainingArguments(seed=...)

# CSV dialect settings forwarded to pd.read_csv in load_and_fix_data().
detected_delimiter = ','
detected_quotechar = '"'

# ===================
# [Utility functions]
# ===================

def set_seeds(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to each generator in turn: Python's ``random``,
    # NumPy, and the PyTorch CPU / current-GPU / all-GPU generators.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print(f"random seed set to {seed}")

def find_column_name(columns, candidates):
    """Return the first column whose normalised name is one of *candidates*.

    Both the column labels and the candidates are compared after
    ``str()``, lower-casing and stripping surrounding whitespace, so the
    match is case-insensitive on both sides and non-string labels (for
    example integer column names) no longer raise.

    Args:
        columns: Iterable of column labels, scanned in order.
        candidates: Iterable of acceptable names.

    Returns:
        The original (un-normalised) label of the first matching column,
        or ``None`` when nothing matches.
    """
    wanted = {str(name).lower().strip() for name in candidates}
    for col in columns:
        if str(col).lower().strip() in wanted:
            return col
    return None

def load_and_fix_data(path, is_test=False):
    """Load a CSV file and normalise its column names.

    The text column is renamed to ``text``. When ``is_test`` is true an
    ``id`` column is ensured (renamed from a known alias or created from
    the row index); otherwise the target column is renamed to ``label``
    and cast to ``int`` when possible.

    Args:
        path: Path of the CSV file to read.
        is_test: ``True`` for unlabeled data that needs an ``id`` column,
            ``False`` for labeled data that needs a ``label`` column.

    Returns:
        The normalised ``pandas.DataFrame``, or ``None`` when the file is
        missing, cannot be parsed, or lacks a usable text/label column.
    """
    if not os.path.exists(path):
        print(f"파일이 없습니다: {path}")
        return None

    encodings_to_try = ['utf-8-sig', 'utf-8', 'cp949']
    # Decode strictly first: with errors ignored the first encoding can never
    # fail, so the later encodings would never be tried and a cp949 file would
    # be read with its undecodable bytes silently dropped. Only when every
    # strict attempt fails do we fall back to the lenient read.
    attempts = [(enc, 'strict') for enc in encodings_to_try]
    attempts.append((encodings_to_try[0], 'ignore'))

    df = None
    last_error = None
    for encoding, errors in attempts:
        try:
            df = pd.read_csv(
                path,
                encoding=encoding,
                engine='python',
                on_bad_lines='skip',
                encoding_errors=errors,
                delimiter=detected_delimiter,
                quotechar=detected_quotechar,
                quoting=csv.QUOTE_MINIMAL,
            )
            break
        except Exception as exc:
            # Best-effort boundary: remember the reason and try the next
            # combination; callers rely on None instead of an exception.
            last_error = exc

    if df is None:
        print(f"데이터 로드 실패: {path}")
        if last_error is not None:
            print(f"   ({type(last_error).__name__}: {last_error})")
        return None

    text_candidates = ['paragraph_text', 'text', 'sentence', 'content', 'full_text']
    text_col = find_column_name(df.columns, text_candidates)
    if text_col is None:
        # No known name: fall back to the first object-dtype column.
        obj_cols = df.select_dtypes(include=['object']).columns
        if len(obj_cols) == 0:
            return None
        text_col = obj_cols[0]
    df.rename(columns={text_col: 'text'}, inplace=True)

    if is_test:
        id_candidates = ['id', 'idx', 'index', 'no', 'ID']
        id_col = find_column_name(df.columns, id_candidates)
        if id_col is not None:
            df.rename(columns={id_col: 'id'}, inplace=True)
        else:
            df['id'] = df.index
        return df

    target_candidates = ['generated', 'label', 'target', 'class']
    target_col = find_column_name(df.columns, target_candidates)
    if target_col is None:
        print("타겟(Label) 컬럼을 찾을 수 없습니다.")
        return None

    df.rename(columns={target_col: 'label'}, inplace=True)
    try:
        df['label'] = df['label'].astype(int)
    except (ValueError, TypeError, OverflowError) as exc:
        # Keep the column as loaded (previous behaviour) but say so instead
        # of hiding the failed cast.
        print(f"label 컬럼을 정수로 변환하지 못했습니다: {exc}")

    return df

def run_process():
    """Fine-tune MODEL_NAME on the fixed split and evaluate on the local test set.

    Steps: seed the RNGs, load the train/valid/test CSVs from DATA_DIR,
    tokenize them, train with per-epoch evaluation and early stopping on
    macro F1, save the best model and tokenizer to OUTPUT_DIR, then run a
    single prediction pass over the test set to print metrics and a
    classification report and to write two CSVs (class-1 probabilities for
    ensembling, and the misclassified rows).

    Returns:
        None. Returns early, after printing a message, when a dataset file
        is missing or cannot be loaded.
    """
    set_seeds(SEED)

    torch.cuda.empty_cache()
    gc.collect()

    print(f"\n[{MODEL_NAME}] 학습 프로세스 시작 (Fixed Dataset Mode)")

    if not os.path.exists(DATA_DIR):
        print(f"'{DATA_DIR}' 경로가 없음 / 실행")
        current_data_dir = '/'
    else:
        current_data_dir = DATA_DIR

    train_path = os.path.join(current_data_dir, TRAIN_FILE_NAME)
    val_path = os.path.join(current_data_dir, VAL_FILE_NAME)
    test_path = os.path.join(current_data_dir, TEST_FILE_NAME)

    if not all(os.path.exists(p) for p in (train_path, val_path, test_path)):
        print("데이터셋 파일을 찾을 수 없습니다.")
        print(f"   확인 경로:\n   - {train_path}\n   - {val_path}\n   - {test_path}")
        print("   파일명 설정 변수를 확인하거나 데이터 분할 코드를 실행하세요.")
        return

    print(">>> 데이터셋 로드 중...")
    print(f"   Train: {TRAIN_FILE_NAME}")
    print(f"   Valid: {VAL_FILE_NAME}")
    print(f"   Test : {TEST_FILE_NAME}")

    # Load data. The local test split is labeled, hence is_test=False.
    train_df = load_and_fix_data(train_path, is_test=False)
    val_df = load_and_fix_data(val_path, is_test=False)
    test_df = load_and_fix_data(test_path, is_test=False)

    if train_df is None or val_df is None or test_df is None:
        print("데이터 로드 중 오류 발생. 종료합니다.")
        return

    print(f" - Train Set : {len(train_df)}개")
    print(f" - Valid Set : {len(val_df)}개")
    print(f" - Test Set  : {len(test_df)}개")

    # Build datasets from the two columns the model needs.
    train_ds = Dataset.from_pandas(train_df[['text', 'label']])
    val_ds = Dataset.from_pandas(val_df[['text', 'label']])
    test_ds = Dataset.from_pandas(test_df[['text', 'label']])

    print(f">>> 토크나이저 로드 ({MODEL_NAME})...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def preprocess(examples):
        # No padding here: DataCollatorWithPadding pads each batch later.
        return tokenizer(examples["text"], truncation=True, max_length=MAX_LEN, padding=False)

    encoded_train = train_ds.map(preprocess, batched=True)
    encoded_val = val_ds.map(preprocess, batched=True)
    encoded_test = test_ds.map(preprocess, batched=True)

    print(f">>> 모델 로드 ({MODEL_NAME})...")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        fp16=True,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to="none",
        seed=SEED
    )

    def compute_metrics(p):
        # Accuracy and macro-averaged F1 from the argmax over the logits.
        preds = np.argmax(p.predictions, axis=1)
        labels = p.label_ids
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average='macro')
        return {"accuracy": acc, "f1": f1}

    trainer = Trainer(
        model=model, args=args,
        train_dataset=encoded_train, eval_dataset=encoded_val,
        tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
    )

    print(">>> 학습 시작...")
    trainer.train()

    print(f">>> Best 모델 저장 중({OUTPUT_DIR})...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(">>> 최종 성능 평가 (Test Set)...")
    # One pass over the test set: predict() returns the logits and, because
    # the dataset carries labels, the compute_metrics results as well (keys
    # prefixed with metric_key_prefix). This replaces a separate evaluate()
    # call that ran the same inference a second time.
    preds_output = trainer.predict(encoded_test, metric_key_prefix="test")
    metrics = preds_output.metrics
    print("\n" + "="*40)
    print(f"Final Test Accuracy : {metrics['test_accuracy']:.4f}")
    print(f"Final Test F1 Score : {metrics['test_f1']:.4f}")
    print("="*40 + "\n")

    print(">>> 상세 예측 리포트 생성 중...")
    pred_labels = np.argmax(preds_output.predictions, axis=1)

    print(">>> 앙상블/상관계수 분석용 CSV 생성 중...")
    logits = torch.tensor(preds_output.predictions)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    prob_class_1 = probs[:, 1].numpy()

    # Work on a copy so this CSV does not pick up the 'predicted' column
    # that is added to test_df below.
    correlation_df = test_df.copy()
    correlation_df['prob_class_1'] = prob_class_1
    correlation_df['pred_label'] = pred_labels

    corr_save_path = os.path.join(OUTPUT_DIR, 'preds_for_correlation.csv')
    correlation_df.to_csv(corr_save_path, index=False)
    print(f"저장 완료: {corr_save_path}")

    true_labels = test_df['label'].values

    print(classification_report(true_labels, pred_labels, target_names=['Class 0', 'Class 1']))

    print(">>> 오답 분석용 CSV 저장 중...")
    test_df['predicted'] = pred_labels
    wrong_df = test_df[test_df['label'] != test_df['predicted']]
    wrong_save_path = '/content/wrong_predictions.csv'
    # The target directory is hard-coded and not created anywhere else, so
    # make sure it exists before writing.
    os.makedirs(os.path.dirname(wrong_save_path), exist_ok=True)
    wrong_df.to_csv(wrong_save_path, index=False)
    print(f"   오답 데이터 {len(wrong_df)}개 저장 완료: {wrong_save_path}")

# Script entry point: training is only started when a CUDA device is visible.
if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("GPU 감지가 안됨.")
    else:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"GPU 연결 성공: {gpu_name}")
        run_process()

**submission 생성**

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset

# Submission cell: load the fine-tuned model from disk, score test.csv and
# write an ID / generated-probability CSV. Relies on load_and_fix_data()
# defined in the training cell.
BASE_DIR = '/model'
DATA_DIR = '/Data'
SAVED_MODEL_NAME = 'klue_roberta_small_result'
OUTPUT_DIR = os.path.join(BASE_DIR, SAVED_MODEL_NAME)
save_csv_path = '/temp_submission.csv'

MAX_LEN = 512
BATCH_SIZE = 64

print(f"모델 로드 경로: {OUTPUT_DIR}")
saved_model_path = OUTPUT_DIR
test_data_path = os.path.join(DATA_DIR, 'test.csv')

if not os.path.exists(test_data_path):
    print(f"'{test_data_path}' 파일이 없음")
    test_data_path = '/test.csv'

test_df = load_and_fix_data(test_data_path, is_test=True)

if test_df is not None:
    print(f"테스트 데이터 로드 성공: {len(test_df)}행")

    # Every row must receive a prediction, so missing text becomes an empty
    # string rather than being passed to the tokenizer as a non-string value.
    test_df['text'] = test_df['text'].fillna('').astype(str)

    try:
        loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_path)
        loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
        print("모델 및 토크나이저 로드 성공")
    except Exception as e:
        print(f"모델 로드 실패: {e}")
        print("   -> 경로 및 모델 파일이 존재하는지 확인.")
        loaded_model = None

    # Explicit None check instead of relying on the model object's truthiness.
    if loaded_model is not None:
        test_ds = Dataset.from_pandas(test_df[['text']])

        def token_func(examples):
            # No padding here: DataCollatorWithPadding pads each batch later.
            return loaded_tokenizer(examples["text"], truncation=True, max_length=MAX_LEN, padding=False)

        encoded_test = test_ds.map(token_func, batched=True)

        temp_inference_dir = os.path.join(BASE_DIR, 'temp_inference')

        inference_args = TrainingArguments(
            output_dir=temp_inference_dir,
            per_device_eval_batch_size=BATCH_SIZE,
            # This cell has no GPU guard, so only request half precision
            # when a CUDA device is actually available.
            fp16=torch.cuda.is_available(),
            report_to="none"
        )

        inference_trainer = Trainer(
            model=loaded_model,
            args=inference_args,
            tokenizer=loaded_tokenizer,
            data_collator=DataCollatorWithPadding(loaded_tokenizer)
        )

        print(">>> 예측 수행 중...")
        pred_output = inference_trainer.predict(encoded_test)

        # Softmax over the logits; column 1 is the probability of class 1.
        logits = torch.tensor(pred_output.predictions)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        prob_class_1 = probs[:, 1].numpy()

        submission = pd.DataFrame({
            'ID': test_df['id'],
            'generated': prob_class_1
        })

        submission.to_csv(save_csv_path, index=False)

        print(f"\n파일 생성 완료: {save_csv_path}")
        print(submission.head())

        # Optional convenience when running inside Google Colab; outside
        # Colab the import fails and the file simply stays on disk.
        try:
            from google.colab import files
            files.download(save_csv_path)
        except Exception:
            print("자동 다운로드 실패")
else:
    print("test 데이터 로드 실패")