In [1]:
pip install transformers datasets


Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Memuat dataset IMDB
dataset = load_dataset("imdb")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
from transformers import AutoTokenizer

# Memuat tokenizer pre-trained
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Fungsi preprocessing
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Preprocessing dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(5000))  # Gunakan subset kecil untuk latihan
eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))    # Gunakan subset kecil untuk evaluasi


In [18]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Membuat data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Membuat DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
eval_loader = DataLoader(eval_dataset, batch_size=8, collate_fn=data_collator)


In [19]:
from transformers import AutoModelForSequenceClassification

# Memuat model pre-trained
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to("cuda")  # Pindahkan model ke GPU


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [20]:
import torch

# Optimizer dan loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()


In [27]:
# Fungsi preprocessing tanpa padding global
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Tokenisasi dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hapus kolom asli yang tidak diperlukan
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
from torch.utils.data import DataLoader

# Membuat DataLoader tanpa padding
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=1)
# Training loop
model.train()
for epoch in range(3):  # Jumlah epochs
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Pindahkan input dan label ke GPU
        inputs = {key: torch.tensor(val).squeeze(0).to("cuda") for key, val in batch.items() if key in tokenizer.model_input_names}
        labels = torch.tensor(batch["label"]).to("cuda").squeeze(0)
        
        # Forward pass
        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()
        
        # Backward pass dan optimasi
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} selesai. Loss rata-rata: {total_loss / len(train_loader):.4f}")



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  labels = torch.tensor(batch["label"]).to("cuda").squeeze(0)


ValueError: not enough values to unpack (expected 2, got 1)

In [28]:
# Evaluasi
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in eval_loader:
        inputs = {key: val.to("cuda") for key, val in batch.items() if key in tokenizer.model_input_names}
        labels = batch["label"].to("cuda")
        
        # Forward pass
        outputs = model(**inputs)
        _, predicted = torch.max(outputs.logits, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Akurasi pada dataset validasi: {accuracy * 100:.2f}%")


AttributeError: 'list' object has no attribute 'to'

In [None]:
# Menyimpan model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


In [None]:
from transformers import pipeline

# Memuat pipeline klasifikasi
classifier = pipeline("text-classification", model="./fine_tuned_model", tokenizer="./fine_tuned_model")

# Prediksi teks
texts = ["The movie was fantastic!", "I didn't enjoy the movie at all."]
results = classifier(texts)

# Menampilkan hasil prediksi
print(results)
