# **Dataset**

In [None]:
from datasets import load_dataset

# Download the IMDB review dataset from the Hugging Face Hub; the result is a
# DatasetDict with `train`, `test` and `unsupervised` splits.
dataset = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Fine-tune on only 50% of the IMDB training split.
# The full split has 25,000 rows and 2 columns (text: the movie review,
# label: 0 = Negative, 1 = Positive).
# Keeping 6,250 reviews per class gives a balanced 12,500-row subset, which
# shortens fine-tuning so that results are visible quickly.

from datasets import concatenate_datasets

SEED = 42                 # shared seed so the subset is reproducible
SAMPLES_PER_CLASS = 6250  # 50% of each class in the original train split

train_dataset = dataset["train"]
positive = train_dataset.filter(lambda x: x["label"] == 1)
negative = train_dataset.filter(lambda x: x["label"] == 0)

# Never request more rows than the smaller class holds: `select` raises an
# IndexError on out-of-range indices, and using one count for both classes
# keeps the subset balanced.
n_per_class = min(SAMPLES_PER_CLASS, len(positive), len(negative))

# Draw the same number of shuffled examples from each class.
subset_pos = positive.shuffle(seed=SEED).select(range(n_per_class))
subset_neg = negative.shuffle(seed=SEED).select(range(n_per_class))

# Merge both classes and shuffle again so the labels are interleaved.
balanced_subset = concatenate_datasets([subset_pos, subset_neg]).shuffle(seed=SEED)

# Replace the train split with the balanced subset.
dataset["train"] = balanced_subset

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# Display the DatasetDict to confirm the train split now holds the subset.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

# **Tokenizer**

In [None]:
from transformers import AutoTokenizer

# Hub id of the checkpoint; the same name is reused further down to load the
# model configuration and weights.
model_name = "thainq107/gpt-small-c4"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

In [None]:
# Tokenisation helper, meant to be passed to `dataset.map(..., batched=True)`.
# NOTE(review): the tokenizer appears to ship with its own pad token; if it did
# not, `tokenizer.pad_token = tokenizer.eos_token` would be needed before padding.
def tokenize(sentence, max_length=256):
    """Tokenize the ``text`` field into fixed-length sequences.

    Args:
        sentence: Mapping with a ``"text"`` entry; that entry is handed to the
            module-level ``tokenizer`` unchanged.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    return tokenizer(
        sentence["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Tokenize every split of the dataset, then drop the raw `text` column.
tokenized_ds = dataset.map(tokenize, batched=True).remove_columns(["text"])
# Have the datasets return PyTorch tensors.
tokenized_ds.set_format("torch")

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

# **Model**

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Fetch only the configuration of the pretrained checkpoint (no weights are
# loaded) and describe the two-class sentiment head on it.
id2label = {0: 'Negative', 1: 'Positive'}

config = AutoConfig.from_pretrained(model_name)
config.num_labels = 2
config.id2label = id2label
config.label2id = {label: idx for idx, label in id2label.items()}

# Build the architecture from the configuration alone.
# NOTE: this `model` is replaced by the pretrained one loaded in the next cell.
model = AutoModelForSequenceClassification.from_config(config)

# Tell the model which token id is used for padding.
model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

In [None]:
# Both label maps are defined here so that this cell does not depend on a
# variable leaked from another cell.
id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

# Load the pretrained weights; the classification head is newly initialised
# and is learned during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# The sequence-classification head pools the last non-padding token of each
# row, which it locates through `config.pad_token_id`. Set it explicitly
# instead of relying on the Trainer to realign it later.
model.config.pad_token_id = tokenizer.pad_token_id

model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at thainq107/gpt-small-c4 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Training**

In [None]:
# `%pip` installs into the environment of the running kernel; the version is
# pinned to the release this notebook was executed with, for reproducibility.
%pip install -q evaluate==0.4.6

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import evaluate

# Accuracy metric from the `evaluate` library, loaded once and reused.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn an evaluation result into an accuracy score.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred
    return accuracy.compute(
        predictions=scores.argmax(axis=-1),
        references=references,
    )

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

import torch

training_args = TrainingArguments(
    output_dir="imdb2-small-gpt2-small",
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    # After training, reload the checkpoint with the highest accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Keep disk usage low by limiting the number of stored checkpoints.
    save_total_limit=1,
    # fp16 mixed precision needs a GPU; enable it only when CUDA is available
    # so the notebook also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Wire the model, data, tokenizer and metric function into a Trainer.
# NOTE(review): the `test` split doubles as the evaluation set that drives
# best-checkpoint selection, so the reported accuracy is not a held-out
# estimate -- consider carving a validation split out of `train`.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the arguments configured above.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 1}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcybersoft-codingcamp[0m ([33mcybersoft-codingcamp-cybersoft-academy-o-t-o-chuy-n-gia-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4758,0.405465,0.81604
2,0.3011,0.464854,0.8004
3,0.1869,0.486745,0.83988
4,0.1062,0.594722,0.83488
5,0.053,0.900982,0.82124
6,0.0322,1.151021,0.8212
7,0.0176,1.161281,0.82112
8,0.0096,1.530802,0.81768
9,0.0047,1.510544,0.82028
10,0.0036,1.518435,0.82064


TrainOutput(global_step=1960, training_loss=0.1190948415471583, metrics={'train_runtime': 1686.3837, 'train_samples_per_second': 74.123, 'train_steps_per_second': 1.162, 'total_flos': 3631939584000000.0, 'train_loss': 0.1190948415471583, 'epoch': 10.0})

In [None]:
import torch


def predict_sentiment(text, tokenizer, model):
    """Classify the sentiment of a single piece of text.

    Args:
        text: The text to classify.
        tokenizer: Callable tokenizer. It is invoked with
            ``padding="max_length"``, ``truncation=True``, ``max_length=256``
            and ``return_tensors="pt"``.
        model: Sequence-classification model exposing ``device``,
            ``config.id2label`` and an output object with ``logits``.

    Returns:
        The entry of ``model.config.id2label`` whose logit is highest.

    Raises:
        ValueError: If the encoded input holds more than one sequence.
    """
    # Pad/truncate to 256 tokens and get PyTorch tensors back.
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

    # Move every input tensor to the device the model lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference must run in eval mode (dropout disabled); the previous mode is
    # restored afterwards so calling this helper does not disturb training.
    was_training = model.training
    model.eval()
    try:
        # no_grad avoids building the autograd graph and saves memory.
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Arg-max over the class dimension; a plain `argmax()` would flatten the
    # batch dimension and return a meaningless index for multi-row input.
    class_ids = logits.argmax(dim=-1).reshape(-1)
    if class_ids.numel() != 1:
        raise ValueError(
            f"predict_sentiment expects a single text, got {class_ids.numel()} sequences"
        )

    # Map the class id to its label via the model configuration.
    return model.config.id2label[int(class_ids[0])]

In [None]:
# Out-of-domain input: this sentence is not a movie review, yet the binary
# classifier still has to answer with one of its two labels.
sample_sentence = "In the next step, we consider the next possible tokens for each of the three branches we created in the previous step."
prediction = predict_sentiment(sample_sentence, tokenizer, model)
print(f"Predicted: {prediction}")

Predicted: Positive


In [None]:
# Sanity check with a clearly negative review.
sample_sentence = "This movie is absolutely awful! I hated every minute of it."
prediction = predict_sentiment(sample_sentence, tokenizer, model)
print(f"Predicted: {prediction}")

Predicted: Negative


In [None]:
from huggingface_hub import login

import os

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when it is unset, `token=None` makes `login` ask for
# the token interactively.
login(token=os.environ.get("HF_TOKEN"))

# Upload the fine-tuned model and its tokenizer to the same Hub repository.
repo_id = "cybersoft123/movie_cls_gpt"
model.push_to_hub(repo_id, commit_message="Upload model")
tokenizer.push_to_hub(repo_id, commit_message="Upload tokenizer")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...cbtwll8/model.safetensors:   0%|          |  553kB /  180MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cybersoft123/movie_cls_gpt/commit/a4b31a0f7f397869babb204c4becbc5f601b8c49', commit_message='Upload tokenizer', commit_description='', oid='a4b31a0f7f397869babb204c4becbc5f601b8c49', pr_url=None, repo_url=RepoUrl('https://huggingface.co/cybersoft123/movie_cls_gpt', endpoint='https://huggingface.co', repo_type='model', repo_id='cybersoft123/movie_cls_gpt'), pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned checkpoint from the Hub repository.
# NOTE: this rebinds `model_name`, `tokenizer` and `model` from earlier cells.
model_name = "cybersoft123/movie_cls_gpt"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/954 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

In [None]:
# Same negative review as before, now scored by the model reloaded from the
# Hub, to check that the uploaded checkpoint gives the same answer.
sample_sentence = "This movie is absolutely awful! I hated every minute of it."
prediction = predict_sentiment(sample_sentence, tokenizer, model)
print(f"Predicted: {prediction}")

Predicted: Negative
