In [None]:
import pandas as pd
from transformers import AutoTokenizer

# Read the review table and pull the text and sentiment columns out as arrays.
df = pd.read_csv('imdb_data.csv')
reviews, sentiments = df['review'].values, df['sentiment'].values
# Integer labels: the element-wise comparison with 'positive' cast to int
# (1 where the sentiment equals 'positive', 0 otherwise).
labels = (sentiments == 'positive').astype('int')

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Small demonstration on the first two reviews only: truncate to at most
# 10 tokens, pad to the longer of the two, and return PyTorch tensors.
input_datas = tokenizer(
    reviews[:2].tolist(),
    max_length=10,
    truncation=True,
    padding="longest",
    return_tensors='pt',
)

# Header line followed by the tokenizer output on its own line.
print('Tokenizer輸出:', input_datas, sep='\n')

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch

class IMDB(Dataset):
    """Dataset of (text, label) pairs with a tokenizing batch collator.

    Items are returned untokenized; tokenization happens per batch in
    ``collate_fn`` so each batch is padded only to its own longest sequence.

    Args:
        x: Indexable collection of review texts.
        y: Indexable collection of integer labels, aligned with ``x``.
        tokenizer: Callable tokenizer; it is called with a list of texts plus
            ``max_length``, ``truncation``, ``padding`` and ``return_tensors``
            keyword arguments, and its result must support key access to
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Maximum sequence length passed to the tokenizer
            (defaults to 512, the value that was previously hard-coded).
    """

    def __init__(self, x, y, tokenizer, max_length=512):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return the raw ``(text, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def collate_fn(self, batch):
        """Tokenize a list of ``(text, label)`` pairs into a batch dict.

        Args:
            batch: Sequence of ``(text, label)`` pairs as produced by
                ``__getitem__``.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            (a ``torch.long`` tensor).
        """
        batch_x, batch_y = zip(*batch)
        encoded = self.tokenizer(
            list(batch_x),
            max_length=self.max_length,
            truncation=True,
            padding="longest",
            return_tensors='pt',
        )
        labels = torch.tensor(batch_y, dtype=torch.long)
        # The batch is padded to its longest sequence, so the attention mask
        # must travel with the ids; otherwise the padding positions are
        # treated as real tokens by a model that only receives input_ids.
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': labels,
        }

# Hold out 20% of the reviews for validation; the fixed random_state keeps
# the split reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(reviews, labels, train_size=0.8, random_state=46, shuffle=True)
trainset = IMDB(x_train, y_train, tokenizer)
validset = IMDB(x_valid, y_valid, tokenizer)

# Each dataset's own collate_fn tokenizes and pads every batch.
train_loader = DataLoader(trainset, batch_size=8, shuffle=True, collate_fn=trainset.collate_fn)
# Validation data is only evaluated, never learned from, so it is iterated in
# a fixed order: shuffling adds nothing and makes batch order non-reproducible.
valid_loader = DataLoader(validset, batch_size=8, shuffle=False, collate_fn=validset.collate_fn)

In [None]:
from transformers import BertForSequenceClassification
import torch.optim as optim
# Pretrained BERT checkpoint configured with a two-label classification output.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
# AdamW over all of the model's parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

In [None]:
from Trainer import Trainer
# Gather the run settings in one place so they are easy to scan and tweak.
trainer_settings = dict(
    epochs=10,
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    # Passed as a one-element list.
    # NOTE(review): presumably Trainer accepts several optimizers — confirm.
    optimizer=[optimizer],
    # NOTE(review): looks like a patience value for early stopping — confirm
    # against the project-local Trainer implementation.
    early_stopping=3,
)
trainer = Trainer(**trainer_settings)
trainer.train()