In [14]:
from transformers import pipeline

In [15]:
# Load a ready-made sentiment-analysis pipeline.
# The checkpoint and revision are pinned explicitly (they are the ones the
# library would otherwise pick as its default) so the cell keeps producing the
# same results if that default ever changes.
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Quick smoke test: one clearly positive and one clearly negative review.
data = ["This movie is fantastic, I love it!", "The plot was boring and the acting was terrible."]
print(sentiment_pipeline(data))


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998769760131836}, {'label': 'NEGATIVE', 'score': 0.9997890591621399}]


In [2]:
from datasets import load_dataset
# Load the dataset registered under the name "imdb". Later cells tokenize it
# and index the result by split name ("train" and "test").
imdb = load_dataset("imdb")

In [3]:
from transformers import AutoTokenizer
# Tokenizer matching the base checkpoint that is fine-tuned in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: A batch from the dataset; its "text" entry holds the raw
            review strings.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 256 tokens.
    """
    # padding="max_length" (rather than padding=True) guarantees that every
    # row has the same length. padding=True only pads to the longest sequence
    # inside each map() batch, so rows from different batches could end up
    # with different lengths; the Trainer objects in this notebook are built
    # without a tokenizer/padding collator and therefore need uniform rows.
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Apply the tokenizer to every split, processing the examples in batches.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForSequenceClassification

# Human-readable class names (0 = negative review, 1 = positive review).
# Storing them in the model config at creation time means they are saved
# together with the fine-tuned weights, so downstream pipelines report
# "NEGATIVE"/"POSITIVE" instead of the generic "LABEL_0"/"LABEL_1".
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Base DistilBERT encoder with a freshly initialised 2-class classification
# head; the head must be trained before the model is usable for prediction.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import TrainingArguments, Trainer

# Only the output directory is set; every other training hyperparameter is
# left at the library default.
training_args = TrainingArguments(output_dir="test_trainer")

# Fine-tune on the "train" split and keep the "test" split attached for
# evaluate(). No compute_metrics callback is passed, so evaluation with this
# Trainer reports loss and timing figures only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
)

# Long-running step: the recorded run reports a train_runtime of ~39,778 s.
# Left as the last expression so the returned TrainOutput is displayed.
trainer.train()



Step,Training Loss
500,0.4251
1000,0.373
1500,0.3394
2000,0.3224
2500,0.3122
3000,0.3061
3500,0.2298
4000,0.1885
4500,0.1991
5000,0.1985




TrainOutput(global_step=9375, training_loss=0.206001811319987, metrics={'train_runtime': 39778.5009, 'train_samples_per_second': 1.885, 'train_steps_per_second': 0.236, 'total_flos': 4967527449600000.0, 'train_loss': 0.206001811319987, 'epoch': 3.0})

In [6]:
# Evaluate on the eval_dataset attached to the Trainer. The recorded result
# contains loss and runtime statistics only, since this Trainer was created
# without a compute_metrics callback.
trainer.evaluate()



{'eval_loss': 0.45617571473121643,
 'eval_runtime': 3301.2143,
 'eval_samples_per_second': 7.573,
 'eval_steps_per_second': 0.947,
 'epoch': 3.0}

In [None]:
# Save the model in its current in-memory state (the Trainer was not
# configured to track or restore a best checkpoint).
trainer.save_model("./model-sentimen-imdb-final")
# Save the tokenizer into the same folder as the trained model so both can be
# reloaded from a single path.
tokenizer.save_pretrained("./model-sentimen-imdb-final")


('./model-sentimen-imdb-final\\tokenizer_config.json',
 './model-sentimen-imdb-final\\special_tokens_map.json',
 './model-sentimen-imdb-final\\vocab.txt',
 './model-sentimen-imdb-final\\added_tokens.json',
 './model-sentimen-imdb-final\\tokenizer.json')

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory that the training session wrote the model and tokenizer to.
model_path = "./model-sentimen-imdb-final"

# Reload both artifacts from disk so this session does not need to retrain.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [2]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb".
imdb = load_dataset("imdb")

def preprocess_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: A batch from the dataset; its "text" entry holds the raw
            review strings.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 256 tokens.
    """
    # padding="max_length" (rather than padding=True) guarantees that every
    # row has the same length. padding=True only pads to the longest sequence
    # inside each map() batch, so rows from different batches could end up
    # with different lengths; the evaluation Trainer in this notebook is built
    # without a tokenizer/padding collator and therefore needs uniform rows.
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Apply the tokenizer to every split, processing the examples in batches.
tokenized_imdb = imdb.map(preprocess_function, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [3]:
from transformers import Trainer
import numpy as np
import evaluate

# Load the four metric objects by name; compute_metrics() calls .compute() on
# each of them to score the model's predictions.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted precision, recall and F1.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1",
        in that order.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy.compute(predictions=predicted_ids, references=labels)["accuracy"],
    }
    # The remaining metrics share one call signature and use weighted averaging.
    for metric_name, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
        outcome = metric.compute(predictions=predicted_ids, references=labels, average="weighted")
        scores[metric_name] = outcome[metric_name]
    return scores

# Evaluation-only Trainer: it receives the reloaded model, the tokenized
# "test" split and the metric callback, but no training arguments or
# training data.
trainer = Trainer(
    model=model,
    eval_dataset=tokenized_imdb["test"],
    compute_metrics=compute_metrics
)

# In the recorded output the values returned by compute_metrics appear under
# "eval_"-prefixed keys next to the loss and runtime statistics.
results = trainer.evaluate()
print(results)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]



{'eval_loss': 0.45617571473121643, 'eval_model_preparation_time': 0.0016, 'eval_accuracy': 0.91188, 'eval_precision': 0.9120256263373727, 'eval_recall': 0.91188, 'eval_f1': 0.9118722130287433, 'eval_runtime': 2733.6282, 'eval_samples_per_second': 9.145, 'eval_steps_per_second': 1.143}


In [4]:
# Import everything this cell uses so it does not depend on an earlier cell
# having brought the Auto* classes into scope.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_path = "./model-sentimen-imdb-final"

# Load the fine-tuned model and its tokenizer from the saved directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Map class ids to readable names (0 = negative, 1 = positive) before the
# pipeline is built, so predictions are reported as "NEGATIVE"/"POSITIVE"
# even when the saved config only carries generic "LABEL_n" names.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
model.config.id2label = id2label
model.config.label2id = label2id

sentiment_model = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# A few new sentences to classify.
# NOTE(review): the last two are Indonesian. The model was fine-tuned on the
# IMDB reviews only, so its predictions for them are presumably unreliable.
kalimat_uji = [
    "The story was predictable, but the visuals were stunning.",
    "I would not recommend this movie to anyone.",
    "A true masterpiece of cinema, absolutely brilliant!",
    "Filmnya biasa saja, tidak ada yang spesial.", # Try an Indonesian sentence as well
    "jelek banget"
]

# Run inference and print one prediction per input sentence, in input order.
hasil = sentiment_model(kalimat_uji)
for prediction in hasil:
    print(prediction)

Device set to use cpu


{'label': 'POSITIVE', 'score': 0.9946309328079224}
{'label': 'NEGATIVE', 'score': 0.9996973276138306}
{'label': 'POSITIVE', 'score': 0.9993736147880554}
{'label': 'NEGATIVE', 'score': 0.976233184337616}
{'label': 'POSITIVE', 'score': 0.726354718208313}
