<a href="https://colab.research.google.com/github/AE-1129/sentiment-analysis-example/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate -q

import os
os.environ["WANDB_DISABLED"] = "true"


from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from datasets import load_dataset
import evaluate
import transformers
from packaging.version import parse

print("transformers version:", transformers.__version__)


MODEL_CHECKPOINT = "distilbert-base-uncased"
MAX_LENGTH = 256
TRAIN_SUBSET = 5000
EVAL_SUBSET = 1000
BATCH_SIZE = 16
NUM_EPOCHS = 2
LEARNING_RATE = 2e-5
OUTPUT_DIR = "./distilbert-imdb-results"


raw_datasets = load_dataset("imdb")
print(raw_datasets)


tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)


def tokenize_function(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

tokenized = raw_datasets.map(tokenize_function, batched=True, remove_columns=["text"])


if TRAIN_SUBSET is not None:
    train_dataset = tokenized["train"].shuffle(seed=42).select(range(TRAIN_SUBSET))
else:
    train_dataset = tokenized["train"].shuffle(seed=42)

if EVAL_SUBSET is not None:
    eval_dataset = tokenized["test"].select(range(EVAL_SUBSET))
else:
    eval_dataset = tokenized["test"]

print("train size:", len(train_dataset), "eval size:", len(eval_dataset))


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }


tf_ver = parse(transformers.__version__)
training_kwargs = dict(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


if tf_ver >= parse("4.46"):
    training_kwargs["eval_strategy"] = "epoch"
    training_kwargs["save_strategy"] = "epoch"
else:
    training_kwargs["evaluation_strategy"] = "epoch"
    training_kwargs["save_strategy"] = "epoch"

training_args = TrainingArguments(**training_kwargs)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.select(range(5000)),
    eval_dataset=eval_dataset.select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()


metrics = trainer.evaluate(eval_dataset=eval_dataset)
print("Final evaluation:", metrics)


trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved model to", OUTPUT_DIR)


examples = [
    "This movie was a fantastic masterpiece, I loved the acting and story.",
    "I wasted two hours. The plot is boring and the acting was terrible.",
]
inputs = tokenizer(examples, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt")
outputs = model(**inputs)
preds = outputs.logits.argmax(axis=-1).tolist()
for ex, p in zip(examples, preds):
    print(p, "->", ex)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25htransformers version: 4.56.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

train size: 5000 eval size: 1000


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.675981,0.706,0.413834




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.675981,0.706,0.413834
2,0.320400,0.348201,0.87,0.465241




Final evaluation: {'eval_loss': 0.34820103645324707, 'eval_accuracy': 0.87, 'eval_f1': 0.46524064171123, 'eval_runtime': 409.0182, 'eval_samples_per_second': 2.445, 'eval_steps_per_second': 0.154, 'epoch': 2.0}
Saved model to ./distilbert-imdb-results
1 -> This movie was a fantastic masterpiece, I loved the acting and story.
0 -> I wasted two hours. The plot is boring and the acting was terrible.
