In [1]:
import pandas as pd
import re
import nltk
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the data: training set, test set, and the submission template,
# read in that order from the working directory.
train_df, test_df, submit = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Preprocessing setup: fetch the NLTK resources the cleaning function needs.
# `nltk.word_tokenize` (used in data_text_cleaning) relies on the Punkt
# tokenizer data in addition to the stopword corpus; without it a fresh
# environment fails with LookupError.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases look up 'punkt_tab' instead; on older releases this
# call just reports that the package is unknown and returns False.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
# English stopwords as a set for fast membership tests during filtering.
stop_words = set(stopwords.words('english'))

def data_text_cleaning(data):
    """Normalize one raw text value into a space-joined string of tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    tokenize with ``nltk.word_tokenize``, and drop tokens found in the
    module-level ``stop_words`` set.

    Args:
        data: The raw text. Non-string values (e.g. the float NaN that pandas
            uses for missing cells, or None) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string.
    """
    # Missing cells reach this function as non-strings when called through
    # Series.apply; re.sub would raise TypeError on them and abort the run.
    if not isinstance(data, str):
        return ''
    # Replace anything that is not an English letter with a space
    only_english = re.sub('[^a-zA-Z]', ' ', data)
    # Lowercase
    no_capitals = only_english.lower()
    # Tokenize, then remove stopwords
    tokens = nltk.word_tokenize(no_capitals)
    return ' '.join(word for word in tokens if word not in stop_words)

# Preprocess the data: add a cleaned copy of the 'text' column to both
# the training and the test frame.
for frame in (train_df, test_df):
    frame['text_cleaned'] = frame['text'].apply(data_text_cleaning)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daehyunkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# BERT tokenizer and encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Convert the cleaned text into model inputs. Both splits share the same
# encoding options: pad, truncate to at most 128 tokens, return PyTorch tensors.
encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt', max_length=128)
train_inputs = tokenizer(train_df['text_cleaned'].tolist(), **encode_kwargs)
test_inputs = tokenizer(test_df['text_cleaned'].tolist(), **encode_kwargs)

# Label data
# NOTE(review): assumes the 'sentiment' column is already numeric — confirm.
train_labels = torch.tensor(train_df['sentiment'].values)


In [4]:
# DataLoader settings (batch_size is reused for the test loader as well)
batch_size = 32

# Build the training dataset: the three encoded input tensors followed by
# the labels, in the order the training loop indexes them.
train_dataset = TensorDataset(
    *(train_inputs[key] for key in ('input_ids', 'attention_mask', 'token_type_ids')),
    train_labels,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)



In [5]:
# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # set 'num_labels' to the number of classes
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer setup
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer.
# weight_decay=0.0 keeps the default of the previous implementation
# (torch.optim.AdamW would otherwise apply 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
# The scheduler is stepped once per batch, so the schedule spans every
# batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = tuple(t.to(device) for t in batch)
        # Tensor order matches how train_dataset was assembled.
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'labels': batch[3]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        # Labels are passed in, so the loss is read from the model output.
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Advance the learning-rate schedule once per batch.
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    # Report the epoch's mean loss; it was previously computed but discarded.
    print(f"Epoch {epoch + 1}/{epochs} - average training loss: {avg_train_loss:.4f}")

    # Validation and evaluation code can be added here.


In [23]:
# Wrap the encoded test data in a DataLoader (no labels, no shuffling so
# predictions stay in row order).
test_tensors = [test_inputs[field] for field in ('input_ids', 'attention_mask', 'token_type_ids')]
test_dataset = TensorDataset(*test_tensors)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [24]:
# Predict on the test data
model.eval()
predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids in test_dataloader:
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            token_type_ids=token_type_ids.to(device),
        ).logits
        # The predicted class is the index of the largest logit per row.
        predictions.extend(logits.argmax(dim=1).tolist())



In [25]:
# Store the predictions in the submission frame and write it to submit.csv
# (without the index column).
output_path = 'submit.csv'
submit['sentiment'] = predictions
submit.to_csv(output_path, index=False)