In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import csv
import time
import logging
import pandas as pd  # Add this import
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import numpy as np  # Add this import
# Convert DataFrame to DatasetDict format
from datasets import Dataset

# 로깅 설정: 프로그램이 실행되는 동안 정보를 출력하는 설정입니다.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', filename='./logs/imdb.log')

# 콘솔 핸들러 추가
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

# 시간 측정을 위한 변수들: 각 과정에 걸린 시간을 저장합니다.
start_time = time.time()
times = {}

# IMDB 데이터셋을 불러옵니다.
logging.info('IMDB 데이터셋 로드 시작')
data_start_time = time.time()

# Load specific columns from CSV
# add row number
df = pd.read_csv('./csv/imdb.csv', usecols=['review', 'sentiment'])

# Add row number as 'id'
df['input_ids'] = range(1, len(df) + 1)

# Reorder columns to make 'id' the first column
df = df[['input_ids', 'review', 'sentiment']]

print(df.columns)
print(df.head())



# Assuming 'sentiment' is the label and 'review' is the text
df.rename(columns={'review': 'text', 'sentiment': 'label'}, inplace=True)

# random 으로 데이터 나누기
df['split'] = df['label'].apply(lambda x: 'train' if x == 1 else 'test')

# Split the DataFrame into train and test sets
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

ds = {'train': train_dataset, 'test': test_dataset}


data_end_time = time.time()
times['데이터 로드'] = data_end_time - data_start_time
logging.info(f'데이터 로드 시간: {times["데이터 로드"]} 초')


2024-08-07 03:46:49,267 - IMDB 데이터셋 로드 시작
2024-08-07 03:46:49,267 - IMDB 데이터셋 로드 시작
2024-08-07 03:46:49,847 - 데이터 로드 시간: 0.5780479907989502 초
2024-08-07 03:46:49,847 - 데이터 로드 시간: 0.5780479907989502 초


Index(['input_ids', 'review', 'sentiment'], dtype='object')
   input_ids                                             review sentiment
0          1  One of the other reviewers has mentioned that ...  positive
1          2  A wonderful little production. <br /><br />The...  positive
2          3  I thought this was a wonderful way to spend ti...  positive
3          4  Basically there's a family where a little boy ...  negative
4          5  Petter Mattei's "Love in the Time of Money" is...  positive


2024-08-07 03:47:44,973 - 데이터 전처리 및 토큰화 시작
2024-08-07 03:47:44,973 - 데이터 전처리 및 토큰화 시작


In [26]:


# 데이터 전처리 및 토큰화
logging.info('데이터 전처리 및 토큰화 시작')
preprocessing_start_time = time.time()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

# Apply tokenization and remove the 'split' column
train_data = ds['train'].map(tokenize_function, batched=True, remove_columns=['split'])
test_data = ds['test'].map(tokenize_function, batched=True, remove_columns=['split'])

print(train_data)
print(test_data)


2024-08-07 03:57:48,624 - 데이터 전처리 및 토큰화 시작
2024-08-07 03:57:48,624 - 데이터 전처리 및 토큰화 시작
Map: 100%|██████████| 50000/50000 [01:21<00:00, 613.24 examples/s]

Dataset({
    features: ['input_ids', 'text', 'label', '__index_level_0__'],
    num_rows: 0
})
Dataset({
    features: ['input_ids', 'text', 'label', 'token_type_ids', 'attention_mask'],
    num_rows: 50000
})





In [17]:
# Define a function to add attention_mask
def add_attention_mask(example):
    example['attention_mask'] = [1] * len(example['input_ids'])
    return example

# Apply the function to the dataset
train_data = train_data.map(add_attention_mask)
test_data = test_data.map(add_attention_mask)

Map: 100%|██████████| 50000/50000 [00:08<00:00, 5929.43 examples/s]


In [25]:
# Define a function to add attention_mask
print(train_data)


Dataset({
    features: ['input_ids', 'text', 'label', '__index_level_0__'],
    num_rows: 0
})


In [None]:
def add_attention_mask(example):
    example['attention_mask'] = [1] * len(example['input_ids'])
    return example

# Apply the function to the dataset
train_data = train_data.map(add_attention_mask)
test_data = test_data.map(add_attention_mask)


In [24]:
# train_data print
print(train_data)

Dataset({
    features: ['input_ids', 'text', 'label', '__index_level_0__'],
    num_rows: 0
})


In [22]:

# Set format to 'torch'
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


ValueError: Columns ['attention_mask'] not in the dataset. Current columns in the dataset: ['input_ids', 'text', 'label', '__index_level_0__']

In [None]:

# Convert to TensorDataset
train_dataset = TensorDataset(
    torch.tensor(np.asarray(train_data['input_ids'])), 
    torch.tensor(np.asarray(train_data['attention_mask'])),
    torch.tensor(np.asarray(train_data['label']))
)
test_dataset = TensorDataset(
    torch.tensor(np.asarray(test_data['input_ids'])), 
    torch.tensor(np.asarray(test_data['attention_mask'])),
    torch.tensor(np.asarray(test_data['label']))
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

preprocessing_end_time = time.time()
times['데이터 전처리 및 토큰화'] = preprocessing_end_time - preprocessing_start_time
logging.info(f'데이터 전처리 및 토큰화 시간: {times["데이터 전처리 및 토큰화"]} 초')

# 모델 정의
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cpu')

# 손실 함수 및 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# 모델 학습
logging.info('모델 학습 시작')
train_start_time = time.time()

def train(dataloader):
    model.train()
    total_loss = 0
    total_acc = 0

    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        total_acc += accuracy_score(labels.numpy(), preds.numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)

train_loss, train_acc = train(train_dataloader)

train_end_time = time.time()
times['모델 학습'] = train_end_time - train_start_time
logging.info(f'모델 학습 시간: {times["모델 학습"]} 초')

# 모델 평가
logging.info('모델 평가 시작')
test_start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            total_acc += accuracy_score(labels.numpy(), preds.numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)

test_loss, test_acc = evaluate(test_dataloader)

test_end_time = time.time()
times['모델 평가'] = test_end_time - test_start_time
logging.info(f'모델 평가 시간: {times["모델 평가"]} 초')

# 결과 저장
total_time = time.time() - start_time
logging.info(f'전체 소요 시간: {total_time} 초')

with open('./csv/imdb_result.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Total Time', total_time])
    writer.writerow(['Step', 'Time (seconds)'])
    for step, time_taken in times.items():
        writer.writerow([step, time_taken])
    writer.writerow(['Train Loss', train_loss])
    writer.writerow(['Train Accuracy', train_acc])
    writer.writerow(['Test Loss', test_loss])
    writer.writerow(['Test Accuracy', test_acc])

# 여러 개의 플롯을 함께 표시합니다.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

steps = list(times.keys())
times_taken = list(times.values())

ax1.bar(steps, times_taken)
ax1.set_title('Step Times')
ax1.set_xlabel('Step')
ax1.set_ylabel('Time (seconds)')

ax2.bar(['Train Accuracy', 'Test Accuracy'], [train_acc, test_acc])
ax2.set_title('Accuracy')
ax2.set_ylabel('Accuracy')

plt.show()

In [None]:
# Define a function to add attention_mask
def add_attention_mask(example):
    example['attention_mask'] = [1] * len(example['input_ids'])
    return example

# Apply the function to the dataset
train_data = train_data.map(add_attention_mask)
test_data = test_data.map(add_attention_mask)

# Set format to 'torch'
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Convert to TensorDataset
train_dataset = TensorDataset(
    torch.tensor(np.asarray(train_data['input_ids'])), 
    torch.tensor(np.asarray(train_data['attention_mask'])),
    torch.tensor(np.asarray(train_data['label']))
)
test_dataset = TensorDataset(
    torch.tensor(np.asarray(test_data['input_ids'])), 
    torch.tensor(np.asarray(test_data['attention_mask'])),
    torch.tensor(np.asarray(test_data['label']))
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

preprocessing_end_time = time.time()
times['데이터 전처리 및 토큰화'] = preprocessing_end_time - preprocessing_start_time
logging.info(f'데이터 전처리 및 토큰화 시간: {times["데이터 전처리 및 토큰화"]} 초')

# 모델 정의
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cpu')

# 손실 함수 및 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# 모델 학습
logging.info('모델 학습 시작')
train_start_time = time.time()

def train(dataloader):
    model.train()
    total_loss = 0
    total_acc = 0

    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        total_acc += accuracy_score(labels.numpy(), preds.numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)

train_loss, train_acc = train(train_dataloader)

train_end_time = time.time()
times['모델 학습'] = train_end_time - train_start_time
logging.info(f'모델 학습 시간: {times["모델 학습"]} 초')

# 모델 평가
logging.info('모델 평가 시작')
test_start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            total_acc += accuracy_score(labels.numpy(), preds.numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)

test_loss, test_acc = evaluate(test_dataloader)

test_end_time = time.time()
times['모델 평가'] = test_end_time - test_start_time
logging.info(f'모델 평가 시간: {times["모델 평가"]} 초')

# 결과 저장
total_time = time.time() - start_time
logging.info(f'전체 소요 시간: {total_time} 초')

with open('./csv/imdb_result.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Total Time', total_time])
    writer.writerow(['Step', 'Time (seconds)'])
    for step, time_taken in times.items():
        writer.writerow([step, time_taken])
    writer.writerow(['Train Loss', train_loss])
    writer.writerow(['Train Accuracy', train_acc])
    writer.writerow(['Test Loss', test_loss])
    writer.writerow(['Test Accuracy', test_acc])

# 여러 개의 플롯을 함께 표시합니다.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

steps = list(times.keys())
times_taken = list(times.values())

ax1.bar(steps, times_taken)
ax1.set_title('Step Times')
ax1.set_xlabel('Step')
ax1.set_ylabel('Time (seconds)')

ax2.bar(['Train Accuracy', 'Test Accuracy'], [train_acc, test_acc])
ax2.set_title('Accuracy')
ax2.set_ylabel('Accuracy')

plt.show()

In [None]:
n_mask
def add_attention_mask(example):
    example['attention_mask'] = [1] * len(example['input_ids'])
    return example

# Apply the function to the dataset
train_data = train_data.map(add_attention_mask)
test_data = test_data.map(add_attention_mask)

# Set format to 'torch'
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Convert to TensorDataset
train_dataset = TensorDataset(
    torch.tensor(np.asarray(train_data['input_ids'])), 
    torch.tensor(np.asarray(train_data['attention_mask'])),
    torch.tensor(np.asarray(train_data['label']))
)
test_dataset = TensorDataset(
    torch.tensor(np.asarray(test_data['input_ids'])), 
    torch.tensor(np.asarray(test_data['attention_mask'])),
    torch.tensor(np.asarray(test_data['label']))
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

preprocessing_end_time = time.time()
times['데이터 전처리 및 토큰화'] = preprocessing_end_time - preprocessing_start_time
logging.info(f'데이터 전처리 및 토큰화 시간: {times["데이터 전처리 및 토큰화"]} 초')

# 모델 정의
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cpu')

# 손실 함수 및 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# 모델 학습
logging.info('모델 학습 시작')
train_start_time = time.time()

def train(dataloader):
    model.train()
    total_loss = 0
    total_acc = 0

    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        total_acc += accuracy_score(labels.numpy(), preds.numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)

train_loss, train_acc = train(train_dataloader)

train_end_time = time.time()
times['모델 학습'] = train_end_time - train_start_time
logging.info(f'모델 학습 시간: {times["모델 학습"]} 초')

# 모델 평가
logging.info('모델 평가 시작')
test_start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            total_acc += accuracy_score(labels.numpy(), preds.numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)

test_loss, test_acc = evaluate(test_dataloader)

test_end_time = time.time()
times['모델 평가'] = test_end_time - test_start_time
logging.info(f'모델 평가 시간: {times["모델 평가"]} 초')

# 결과 저장
total_time = time.time() - start_time
logging.info(f'전체 소요 시간: {total_time} 초')

with open('./csv/imdb_result.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Total Time', total_time])
    writer.writerow(['Step', 'Time (seconds)'])
    for step, time_taken in times.items():
        writer.writerow([step, time_taken])
    writer.writerow(['Train Loss', train_loss])
    writer.writerow(['Train Accuracy', train_acc])
    writer.writerow(['Test Loss', test_loss])
    writer.writerow(['Test Accuracy', test_acc])

# 여러 개의 플롯을 함께 표시합니다.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

steps = list(times.keys())
times_taken = list(times.values())

ax1.bar(steps, times_taken)
ax1.set_title('Step Times')
ax1.set_xlabel('Step')
ax1.set_ylabel('Time (seconds)')

ax2.bar(['Train Accuracy', 'Test Accuracy'], [train_acc, test_acc])
ax2.set_title('Accuracy')
ax2.set_ylabel('Accuracy')

plt.show()