In [2]:
%pip install torch
%pip install transformers
%pip install pandas
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd



In [5]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [7]:
# Data loading (the file is read as-is; no preprocessing is applied here)
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)

In [8]:
# Load the BERT tokenizer from the pretrained 'bert-base-uncased' checkpoint.
# The same tokenizer instance is shared by the datasets and the prediction helper below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of raw texts (list, NumPy array, pandas Series, ...).
        labels: Iterable of labels aligned with ``texts``; each one is cast with ``int``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that returns
            ``'input_ids'`` and ``'attention_mask'`` tensors when called with
            ``return_tensors='pt'``.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise into plain lists so that ``self.texts[idx]`` is always a
        # positional lookup. A pandas Series with a shuffled index (as produced by
        # train_test_split) would otherwise be indexed by label, not by position.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            A dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and the ``'label'`` as a ``torch.long`` scalar.
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        # Calling the tokenizer directly replaces the deprecated ``encode_plus`` entry point.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # The tokenizer output carries a leading batch axis of size 1; drop it so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Path to the training data file
file_path = '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv'

# Load the data
df = load_data(file_path)

# Split the data: 20% is held out for validation; random_state fixes the shuffle
# so the same rows land in each split on every run.
# NOTE(review): the split is not stratified. If the 'generated' label used below is
# imbalanced, consider passing stratify=df['generated'] — confirm against the data.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
# Hyperparameters
MAX_LEN = 128         # every text is padded/truncated to this many tokens
BATCH_SIZE = 32       # examples per DataLoader batch
EPOCHS = 3            # full passes over the training loader
LEARNING_RATE = 2e-5  # learning rate handed to the optimizer

# Training dataset: raw texts and their 'generated' labels from the training split
train_dataset = CustomDataset(
    train_df['text'].values,
    train_df['generated'].values,
    tokenizer,
    MAX_LEN,
)

In [12]:
# Validation dataset, built the same way as the training dataset
val_dataset = CustomDataset(
    val_df['text'].values,
    val_df['generated'].values,
    tokenizer,
    MAX_LEN,
)

# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
# Initialize the model: pretrained 'bert-base-uncased' with a 2-label classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Optimizer and loss function.
# PyTorch's own AdamW is used here because the AdamW exported by transformers is
# deprecated in favour of it. eps and weight_decay are spelled out to match the
# defaults of the transformers implementation (eps=1e-6, weight_decay=0.0), since
# torch.optim.AdamW would otherwise default to eps=1e-8 and weight_decay=0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)
# The training loop in this notebook takes the loss from the model output
# (outputs.loss); this criterion is kept so any other user of the name still works.
criterion = torch.nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Fine-tune the model
for epoch in range(EPOCHS):
    model.train()
    # Running totals for the epoch summary printed after the last batch.
    total_loss, total_correct, total_samples = 0.0, 0, 0

    for batch_idx, batch in enumerate(train_loader):
        # Move the three tensors the model needs onto the compute device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        # Bookkeeping: loss and accuracy use the values computed before the step.
        total_loss += loss.item()
        predicted = logits.max(dim=1).indices
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

        # Report progress every 10 batches
        if batch_idx % 10 == 0:
            print(f'Epoch {epoch + 1}/{EPOCHS}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}')

    # Epoch summary: mean per-batch loss and accuracy over all samples seen.
    avg_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_samples
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

Epoch 1/3, Batch 0/35, Loss: 0.2584
Epoch 1/3, Batch 10/35, Loss: 0.0590
Epoch 1/3, Batch 20/35, Loss: 0.0339
Epoch 1/3, Batch 30/35, Loss: 0.0166
Epoch 1/3, Loss: 0.0585, Accuracy: 0.9982
Epoch 2/3, Batch 0/35, Loss: 0.0123
Epoch 2/3, Batch 10/35, Loss: 0.0069
Epoch 2/3, Batch 20/35, Loss: 0.0052
Epoch 2/3, Batch 30/35, Loss: 0.0048
Epoch 2/3, Loss: 0.0156, Accuracy: 0.9982
Epoch 3/3, Batch 0/35, Loss: 0.0035
Epoch 3/3, Batch 10/35, Loss: 0.0035
Epoch 3/3, Batch 20/35, Loss: 0.0031
Epoch 3/3, Batch 30/35, Loss: 0.0035
Epoch 3/3, Loss: 0.0125, Accuracy: 0.9982


In [15]:
# Evaluate the model on the validation loader
model.eval()
val_predictions, val_labels = [], []

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Predicted class = index of the largest logit in each row.
        predicted = logits.max(dim=1).indices

        # Collect predictions and ground truth on the CPU for scikit-learn.
        val_predictions.extend(predicted.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())


In [16]:
# Print the validation accuracy.
# Note: this rebinds `accuracy`, which the training loop also uses for its per-epoch value.
# NOTE(review): accuracy alone can look high when one label dominates — consider
# checking the label distribution and a per-class metric; confirm against the data.
accuracy = accuracy_score(val_labels, val_predictions)
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.9963768115942029


In [18]:
import torch.nn.functional as F

# Prediction helper
def predict_probabilities(model, tokenizer, text, max_len=128):
    """Return the softmax class probabilities the model assigns to one text.

    Args:
        model: Classification model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors when called with ``return_tensors='pt'``.
        text: The text to score. It is coerced with ``str`` so non-string cells
            (e.g. a missing value read from a CSV) do not break tokenization,
            matching what ``CustomDataset`` does for training data.
        max_len: Length the text is padded or truncated to.

    Returns:
        A list of floats, one probability per class, for the single input text.
    """
    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level ``device`` variable; a parameter-less model keeps inputs on CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = tokenizer(
            str(text),
            add_special_tokens=True,
            max_length=max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(target_device)
        attention_mask = encoding['attention_mask'].to(target_device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Normalise across the class axis; row 0 is the only example in the batch.
        probabilities = F.softmax(logits, dim=1)
        return probabilities[0].tolist()

# Path to the file with the texts to score
test_file_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'

# Load the test data
test_df = load_data(test_file_path)

# Score every text, keeping only the probability of label 1 for the 'generated' column
probabilities_list = [
    predict_probabilities(model, tokenizer, essay, max_len=MAX_LEN)[1]
    for essay in test_df['text']
]

# Build the result frame with just 'id' and 'generated'
result_df = pd.DataFrame({'id': test_df['id'], 'generated': probabilities_list})

# Write the predictions to 'submission.csv'
result_df.to_csv('/kaggle/working/submission.csv', index=False)