# 1. import

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset


# 2. Load Data

In [None]:
# Load the dataset's "train" split and convert it to a pandas DataFrame.
# `load_dataset` without `split=` returns a DatasetDict, which has no `.iloc`,
# so the positional column selection below would fail on it.
data = load_dataset("gsingh1-py/train", split="train").to_pandas()

# Column 1 is used as the human-written text (label 0). Every column from
# index 2 onward is treated as AI-generated text and flattened into one Series
# (`stack` drops missing cells).
human_df = pd.DataFrame({'text': data.iloc[:, 1], 'label': 0})
ai_series = data.iloc[:, 2:].stack().reset_index(drop=True)

# Down-sample the AI texts to the number of human texts so classes are balanced.
ai_subset = ai_series.sample(n=len(human_df), random_state=42)
ai_df = pd.DataFrame({'text': ai_subset, 'label': 1})

# Concatenate both classes, then shuffle the rows with a fixed seed.
final_data = pd.concat([human_df, ai_df], axis=0, ignore_index=True)
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Replace missing texts with "" and force every entry to `str`, so the
# tokenizer never receives NaN or a non-string value.
final_data['text'] = final_data['text'].fillna("").astype(str)

# First split: hold out 20% of all rows as the test set.
X_remaining, X_test_text, y_remaining, y_test = train_test_split(
    final_data['text'].values,
    final_data['label'].values,
    test_size=0.2,
    random_state=42
)

# Second split: divide the remaining 80% into train (60% of the total) and
# validation (20% of the total); 0.25 of the remainder equals 20% overall.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_remaining,
    y_remaining,
    test_size=0.25,
    random_state=42
)
print(f"전체 데이터 개수: {len(final_data)}")
print(f"Train 개수 (60%): {len(X_train_text)}")
print(f"Valid 개수 (20%): {len(X_val_text)}")
print(f"Test  개수 (20%): {len(X_test_text)}")

전체 데이터 개수: 14642
전체 데이터 개수: 14642
Train 개수 (60%): 8784
Valid 개수 (20%): 2929
Test  개수 (20%): 2929


In [3]:
from transformers import DistilBertTokenizer
from tqdm import tqdm
# Fixed sequence length used for padding/truncation. The original note says it
# must stay within the model limit (512) rather than follow the data length,
# and recommends 128-256 for speed and memory.
maxlen = 256

# Name of the pretrained checkpoint to load.
distil_bert = 'distilbert-base-uncased'

# Only the casing option is passed at construction time; length-related
# options such as max_length are supplied per call instead.
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=True)

# Batch tokenization helper: encodes all sentences in one tokenizer call.
def tokenize(sentences, tokenizer, max_length=None):
    """Encode a batch of texts into fixed-length id and attention-mask arrays.

    Args:
        sentences: Texts to encode. A NumPy array / pandas Series (anything
            with ``.tolist()``) or any other iterable of strings.
        tokenizer: A callable Hugging Face tokenizer.
        max_length: Length every sequence is padded/truncated to. When
            ``None`` (the default) the notebook-level ``maxlen`` is used, which
            matches the previous behavior.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output
        (requested as NumPy arrays via ``return_tensors='np'``).
    """
    if max_length is None:
        max_length = maxlen

    # The tokenizer expects a plain list of strings.
    texts = sentences.tolist() if hasattr(sentences, "tolist") else list(sentences)

    encoded_dict = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',    # pad every sequence up to max_length
        truncation=True,         # cut sequences longer than max_length
        return_attention_mask=True,
        return_tensors='np'
    )

    # token_type_ids are not returned: per the original note, DistilBERT does
    # not use segment ids.
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

  from .autonotebook import tqdm as notebook_tqdm


## Pre-Processing

In [4]:
# Encode the training texts first and the validation texts second; each call
# yields an (input_ids, attention_mask) pair.
print("Tokenizing Train set...")
(X_train_ids, X_train_masks), (X_val_ids, X_val_masks) = (
    tokenize(split_texts, tokenizer) for split_texts in (X_train_text, X_val_text)
)
print(f"완료! Train shape: {X_train_ids.shape}, Label shape: {y_train.shape}")

Tokenizing Train set...
완료! Train shape: (8784, 256), Label shape: (8784,)


In [5]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Choose the compute device: CUDA when a GPU is visible, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

batch_size = 16


def _as_tensor_dataset(ids, masks, labels):
    """Wrap token ids, attention masks and labels in a single TensorDataset."""
    return TensorDataset(torch.tensor(ids), torch.tensor(masks), torch.tensor(labels))


train_data = _as_tensor_dataset(X_train_ids, X_train_masks, y_train)
validation_data = _as_tensor_dataset(X_val_ids, X_val_masks, y_val)

# Training batches are drawn in random order so the model does not see the
# examples in a fixed sequence.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=RandomSampler(train_data),
)

# Evaluation does not need shuffling, so batches are read sequentially.
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=SequentialSampler(validation_data),
)

print("데이터 로더 준비 완료!")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3050 4GB Laptop GPU
데이터 로더 준비 완료!


In [6]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array-like of per-class scores; the arg-max is taken
            along axis 1.
        labels: NumPy array of class ids; flattened before the comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)

## Naive Baseline implementation


In [None]:
def naive_baseline(sentence, markers=('**', '##', 'title')):
    """Rule-based baseline: label a text as AI when it contains any marker.

    Args:
        sentence: Text to classify.
        markers: Substrings whose presence marks the text as AI-written.
            Matching is case-insensitive. The default is the original
            hard-coded list: '**', '##' and the word 'title'.

    Returns:
        1 (AI) if any marker occurs in the text, otherwise 0 (Human).
    """
    lowered = sentence.lower()
    # There is no random fallback: a text without any marker is always
    # labelled Human (0).
    return 1 if any(marker.lower() in lowered for marker in markers) else 0

In [19]:
from sklearn.metrics import accuracy_score

# 5. Score the rule-based baseline on the held-out test texts.
naive_preds = list(map(naive_baseline, X_test_text))
print("나이브 베이스라인 평가 결과:")
baseline_accuracy = accuracy_score(y_test, naive_preds)
print(f"정확도: {baseline_accuracy:.4f}")

나이브 베이스라인 평가 결과:
정확도: 0.9249


## AI PIPELINE IMPLEMENTATION

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

# Seed PyTorch's global RNG before the model is built so the randomly
# initialised classifier head and the shuffling order of the training sampler
# do not change from run to run (GPU kernels may still add small
# nondeterminism).
torch.manual_seed(42)

epochs = 3
learning_rate = 2e-5

# Load the pretrained encoder with a fresh 2-way classification head.
# `distil_bert` is the same checkpoint name the tokenizer was loaded from,
# which keeps the model and tokenizer in sync.
model = DistilBertForSequenceClassification.from_pretrained(
    distil_bert,
    num_labels=2, # 0: Human, 1: AI
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps, with no warm-up.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training (Model use)

In [11]:

# Fine-tune for `epochs` passes over the training set; after each pass,
# evaluate on the validation set.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    model.train()  # training mode
    total_train_loss = 0

    for batch in tqdm(train_dataloader):
        # 1. Move the batch to the selected device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # 2. Clear gradients left over from the previous step.
        model.zero_grad()

        # 3. Forward pass; passing `labels` makes the model return a loss.
        output = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )

        loss = output.loss
        total_train_loss += loss.item()

        # 4. Backward pass.
        loss.backward()

        # 5. Clip the gradient norm to 1.0 to guard against exploding
        #    gradients, then update the parameters and the learning rate.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"  Average training loss: {avg_train_loss:.2f}")

    # ========================================
    #               Validation
    # ========================================
    print("Running Validation...")

    model.eval()  # evaluation mode (dropout disabled)
    total_eval_correct = 0
    total_eval_examples = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():  # no gradients needed for evaluation
            output = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )

        # Accumulate per-example totals instead of per-batch averages: the
        # last batch can be smaller than the others, and averaging batch
        # means would give its few examples the weight of a full batch.
        batch_examples = b_labels.size(0)
        total_eval_loss += output.loss.item() * batch_examples
        total_eval_correct += (output.logits.argmax(dim=1) == b_labels).sum().item()
        total_eval_examples += batch_examples

    # max(..., 1) avoids a division by zero on an empty validation set.
    avg_val_loss = total_eval_loss / max(total_eval_examples, 1)
    avg_val_accuracy = total_eval_correct / max(total_eval_examples, 1)
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation Accuracy: {avg_val_accuracy:.2f}")

print("\nTraining complete!")


Training...


100%|██████████| 549/549 [09:24<00:00,  1.03s/it]


  Average training loss: 0.04
Running Validation...
  Validation Accuracy: 1.00

Training...


100%|██████████| 549/549 [15:38<00:00,  1.71s/it]


  Average training loss: 0.00
Running Validation...
  Validation Accuracy: 1.00

Training...


100%|██████████| 549/549 [15:42<00:00,  1.72s/it]


  Average training loss: 0.00
Running Validation...
  Validation Accuracy: 1.00

Training complete!


In [12]:
# Encode the held-out test texts and wrap them in a sequential DataLoader.
print("Tokenizing Test set...")
X_test_ids, X_test_masks = tokenize(X_test_text, tokenizer)

test_tensors = [torch.tensor(part) for part in (X_test_ids, X_test_masks, y_test)]
test_data = TensorDataset(*test_tensors)

# The batch size here does not have to match the one used for training.
test_dataloader = DataLoader(
    test_data,
    batch_size=16,
    sampler=SequentialSampler(test_data),
)

Tokenizing Test set...


In [13]:
print("\n======== 최종 테스트 데이터 평가 (Final Test) ========")

model.eval()  # evaluation mode

total_test_loss = 0
all_predictions = []
all_true_labels = []

for batch in test_dataloader:
    # Move the batch to the selected device.
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        output = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )

    # Bring logits and labels back to the CPU as NumPy arrays.
    logits = output.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Weight the batch loss by the batch size so that a smaller final batch
    # does not count as much as a full one.
    total_test_loss += output.loss.item() * len(label_ids)

    # Keep every prediction and label; they drive the accuracy below and are
    # reused by later cells.
    all_predictions.extend(np.argmax(logits, axis=1).flatten())
    all_true_labels.extend(label_ids.flatten())

# Final metrics are computed over individual examples, not as a mean of
# per-batch values. max(..., 1) avoids a division by zero on an empty test set.
num_test_examples = len(all_true_labels)
num_test_correct = int(np.sum(np.array(all_predictions) == np.array(all_true_labels)))
avg_test_accuracy = num_test_correct / max(num_test_examples, 1)
avg_test_loss = total_test_loss / max(num_test_examples, 1)

print(f"  Final Test Loss: {avg_test_loss:.4f}")
print(f"  Final Test Accuracy: {avg_test_accuracy:.4f}")
print("====================================================")


  Final Test Loss: 0.0001
  Final Test Accuracy: 1.0000


In [14]:
def show_qualitative_examples(X_text, y_true, naive_preds, model_preds, tokenizer, model, device, count=3, max_length=256):
    """Print a side-by-side comparison of baseline and model predictions.

    Examples are chosen in priority order: first those where the two methods
    disagree, then those the model got wrong, then any remaining examples in
    index order. At most ``count`` examples (and never more than are
    available) are shown.

    Args:
        X_text: Indexable collection of input texts.
        y_true: Ground-truth labels (0 = Human, 1 = AI), aligned with X_text.
        naive_preds: Baseline predictions, aligned with X_text.
        model_preds: Model predictions, aligned with X_text.
        tokenizer: Callable tokenizer returning a dict of PyTorch tensors.
        model: Classifier whose output exposes ``.logits``.
        device: Device the tokenized inputs are moved to before inference.
        count: Maximum number of examples to print.
        max_length: Padding/truncation length used when re-encoding a text.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\n{'='*20} Qualitative Analysis (Method Comparison) {'='*20}")

    n_examples = len(y_true)

    # 1. Cases where the two methods disagree are the most informative.
    candidates = [i for i in range(n_examples) if naive_preds[i] != model_preds[i]]

    # 2. If there are too few, add the cases the model got wrong.
    if len(candidates) < count:
        candidates.extend(i for i in range(n_examples) if model_preds[i] != y_true[i])

    # 3. If there are still too few, pad with the remaining indices in order.
    if len(candidates) < count:
        candidates.extend(range(n_examples))

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # priority established above is preserved (a set would not guarantee it).
    selected_indices = list(dict.fromkeys(candidates))[:max(count, 0)]

    label_map = {0: "Human", 1: "AI"}

    model.eval()
    for i, idx in enumerate(selected_indices):
        text_sample = X_text[idx]
        true_lbl = y_true[idx]
        naive_lbl = naive_preds[idx]
        ai_lbl = model_preds[idx]

        # Run inference again on this single text to obtain class probabilities.
        inputs = tokenizer(text_sample, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()[0]

        print(f"\n[Example Case #{i+1}] Index: {idx}")
        print(f"Text (Snippet): \"{text_sample[:150]}...\"")  # only the first 150 characters
        print("-" * 60)
        print(f"Ground Truth   : {label_map[true_lbl]} ({true_lbl})")
        print(f"Naive Baseline : {label_map[naive_lbl]} ({naive_lbl}) \t <- {'Correct' if naive_lbl == true_lbl else 'Wrong'}")
        print(f"AI Pipeline    : {label_map[ai_lbl]} ({ai_lbl}) \t <- {'Correct' if ai_lbl == true_lbl else 'Wrong'}")
        print(f"AI Confidence  : Human {probs[0]:.4f} vs AI {probs[1]:.4f}")

        # Short canned remark intended as a starting point for the write-up.
        print(f"Discussion Hint: ", end="")
        if naive_lbl != ai_lbl and ai_lbl == true_lbl:
            print("The AI model successfully captured nuances that the rule-based baseline missed.")
        elif naive_lbl == true_lbl and ai_lbl != true_lbl:
            print("The AI model failed on this example, while the simple baseline got it right.")
        else:
            print("Both models showed similar performance on this difficult/easy example.")
        print("="*70)

In [15]:
# Baseline predictions for the test texts, aligned index-by-index with the
# model predictions collected during the final test evaluation.
naive_preds_test = list(map(naive_baseline, X_test_text))

# `all_predictions` holds the model's predicted labels from the final test
# cell; `y_test` supplies the matching ground-truth labels.
show_qualitative_examples(
    X_test_text,
    y_test,
    naive_preds_test,
    all_predictions,
    tokenizer=tokenizer,
    model=model,
    device=device,
    count=3,
)




[Example Case #1] Index: 35
Text (Snippet): " 400000 A Tale of Three Housing Markets

New York Mississippi Maryland three states one price point vastly different realities 

The median home price..."
------------------------------------------------------------
Ground Truth   : AI (1)
Naive Baseline : Human (0) 	 <- Wrong
AI Pipeline    : AI (1) 	 <- Correct
AI Confidence  : Human 0.0001 vs AI 0.9999
Discussion Hint: The AI model successfully captured nuances that the rule-based baseline missed.

[Example Case #2] Index: 47
Text (Snippet): "Error: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Cau..."
------------------------------------------------------------
Ground Truth   : AI (1)
Naive Baseline : Human (0) 	 <- Wrong
AI Pipeline    : AI (1) 	 <- Correct
AI Confidence  : Human 0.0002 vs AI 0.9998
Discussion Hint: The AI model successfully captured nuances that the rule-based baseline misse