In [16]:
import pandas as pd

# Load both splits and keep only the tweet text and its sentiment label.
KEEP_COLUMNS = ['text', 'sentiment']

train = pd.read_csv('train.csv', encoding='latin1')[KEEP_COLUMNS]
test = pd.read_csv('test.csv', encoding='latin1')[KEEP_COLUMNS]

In [17]:
# Replace missing tweet text with an empty string.
# The column is rebuilt with `assign` instead of calling
# `train['text'].fillna('', inplace=True)`: that chained in-place call acts on
# a temporary Series, so under pandas Copy-on-Write the frame is left
# unchanged (and pandas 2.2 already warns about it).
train = train.assign(text=train['text'].fillna(''))
test = test.assign(text=test['text'].fillna(''))
print(train.shape, test.shape)

(27481, 2) (4815, 2)


In [18]:
# Keep only rows that carry a non-neutral sentiment label: first discard the
# "neutral" rows, then discard rows whose label is missing altogether.
train = train.loc[train['sentiment'].ne("neutral")].dropna(subset=['sentiment'])
test = test.loc[test['sentiment'].ne("neutral")].dropna(subset=['sentiment'])

def func(sentiment):
    """Encode a sentiment label as an integer class id.

    Args:
        sentiment: The raw label value.

    Returns:
        0 when ``sentiment`` equals ``'positive'``; 1 for every other value.
    """
    return 0 if sentiment == 'positive' else 1
# Swap the string labels for their integer class ids (see `func`).
train = train.assign(sentiment=train['sentiment'].apply(func))
test = test.assign(sentiment=test['sentiment'].apply(func))
print(train.shape, test.shape)

(16363, 2) (2104, 2)


In [19]:
# Display the training frame after neutral rows were removed and the
# sentiment labels were encoded as integers.
train

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,1
2,my boss is bullying me...,1
3,what interview! leave me alone,1
4,"Sons of ****, why couldn`t they put them on t...",1
6,2am feedings for the baby are fun when he is a...,0
...,...,...
27475,enjoy ur night,0
27476,wish we could come see u on Denver husband l...,1
27477,I`ve wondered about rake to. The client has ...,1
27478,Yay good for both of you. Enjoy the break - y...,0


In [20]:
# !pip install datasets

In [21]:
from datasets import Dataset

# Convert the pandas DataFrames into Hugging Face Dataset objects.
# preserve_index=False: rows were filtered out earlier, so the frames no
# longer have a default RangeIndex and from_pandas would otherwise store the
# leftover index as an extra `__index_level_0__` column.
train = Dataset.from_pandas(train, preserve_index=False)
test = Dataset.from_pandas(test, preserve_index=False)

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the GPT-2 tokenizer and reuse its end-of-sequence token as the padding
# token, so that the `padding='max_length'` tokenizer calls below have a token
# to pad with.
TOKENIZER_CHECKPOINT = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(text):
    """Tokenize a batch of examples and attach their labels.

    Args:
        text: A batch as passed by ``Dataset.map(..., batched=True)``; it is
            indexed by the ``'text'`` and ``'sentiment'`` keys.

    Returns:
        The tokenizer output (every sequence truncated/padded to 128 tokens)
        with the batch's ``'sentiment'`` values added under ``'labels'``.
    """
    encoded = tokenizer(
        text['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    encoded['labels'] = text['sentiment']
    return encoded
# Tokenize both splits batch-wise; train is processed first, then test.
train, test = [
    split.map(preprocess_data, batched=True)
    for split in (train, test)
]

Map:   0%|          | 0/16363 [00:00<?, ? examples/s]

Map:   0%|          | 0/2104 [00:00<?, ? examples/s]

In [26]:
# Drop the raw text column, which is no longer needed, and expose only the
# model inputs as torch tensors.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'labels']

train = train.remove_columns(['text'])
test = test.remove_columns(['text'])
for split in (train, test):
    # set_format changes the dataset's output format in place.
    split.set_format(type='torch', columns=TORCH_COLUMNS)

In [27]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [30]:
from transformers import Trainer, TrainingArguments
# Load GPT-2 with a two-class classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "openai-community/gpt2",
    num_labels=2,
)
# Mirror the tokenizer's padding id into the model config so the model knows
# which token id marks padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Training configuration for fine-tuning.
training_args = TrainingArguments(
    output_dir="./results",
    # Schedule: a single pass over the data, evaluated at the end of the epoch.
    num_train_epochs=1,
    evaluation_strategy="epoch",
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

# Create the Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Train the model: run fine-tuning with the Trainer configured above.
# The call's return value is the cell's displayed output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4472,0.393347


TrainOutput(global_step=8182, training_loss=0.4719742313628382, metrics={'train_runtime': 1350.4561, 'train_samples_per_second': 12.117, 'train_steps_per_second': 6.059, 'total_flos': 1100777886056448.0, 'train_loss': 0.4719742313628382, 'epoch': 1.0})

In [41]:
import torch

# Choose the device used for inference: CUDA when it is available, otherwise
# the CPU, and move the model there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

def analyze_sentiment(text, max_length=512):
    """Predict the sentiment class id of a single piece of text.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``; the
    model is expected to already live on ``device`` (it is moved there right
    after ``device`` is created).

    Args:
        text: The string to classify.
        max_length: Maximum number of tokens kept; longer inputs are
            truncated. Defaults to 512, the limit this function has always
            used (training itself truncated to 128 tokens).

    Returns:
        int: Index of the largest logit. With the label encoding used for
        training, 0 means positive and 1 means negative.
    """
    # A single sequence needs no padding, so it is no longer padded out to
    # max_length; only truncation is applied.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    # Move the input tensors to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Make sure dropout is disabled: nothing guarantees that the model is
    # still in evaluation mode when this function is called.
    model.eval()
    with torch.no_grad():  # no gradients are needed for inference
        outputs = model(**inputs)
    return torch.argmax(outputs.logits, dim=1).item()

# Prediction 1: a negative review. The training labels encode 'positive' as 0
# and every other sentiment as 1.
text = "The movie was bad bad bad, i will not recommend this movie to anyone"

result = analyze_sentiment(text)
print(result)

1


In [43]:
# Prediction 2: a positive review. The training labels encode 'positive' as 0
# and every other sentiment as 1.
text = "The movie was good, i will recommend this movie to anyone"

result = analyze_sentiment(text)
print(result)

0
