In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    BitsAndBytesConfig, DataCollatorWithPadding, TrainingArguments, Trainer
)
from peft import get_peft_model, LoraConfig
import evaluate
from sklearn.model_selection import train_test_split

# Memuat File Dataset
df = pd.read_csv('Dataset_Main_2000.csv')

# Mapping dan mengubah kolom sentimen menjadi dalam bentuk numerik agar lebih dipahami model
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df['label'] = df['Sentiment'].map(label_mapping)

# Membagi dataset menjadi data training dan data validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

# Membagi menjadi 2 dataset yang berbeda yaitu dataset "TRAIN" dan dataset "VALIDATION"
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
print(dataset)

# Pengecekan label untuk memastikan distribusi dan menghindari overfitting
label_counts = np.array(dataset['train']['label'])
percentage_positive = (label_counts == 2).sum() / len(label_counts)
print(f"Percentage of Positive labels in training set: {percentage_positive}")


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['Sentiment', 'Text Tweet', 'label'],
        num_rows: 1599
    })
    validation: Dataset({
        features: ['Sentiment', 'Text Tweet', 'label'],
        num_rows: 400
    })
})
Percentage of Positive labels in training set: 0.3545966228893058


In [None]:
# Konfigurasi dari BnB dengan kuantitasi 4-bit
compute_dtype = getattr(torch, "bfloat16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # Mengaktifkan 4bit quantization
    bnb_4bit_quant_type="nf4", # Normalized Float 4-bit=>format kuantisasi
    bnb_4bit_compute_dtype=compute_dtype, # Tipe untuk presisi komputasi saat melakukan training, harus menggunakan bfloat16 untuk menghindari validation loss nan
    bnb_4bit_use_double_quant=True,
)

model_checkpoint = 'Yellow-AI-NLP/komodo-7b-base'

# Define label maps
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
label2id = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Load Model hasil "Pre-Trained"
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

# Membuat tokenizer model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Menambah pad token apabila tidak ada
if tokenizer.pad_token is None:
    print("token pad added")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# tokenize function
def tokenize_function(examples):
    text = examples["Text Tweet"]  # Extract text from "Text Tweet" column
    tokenizer.truncation_side = "left"
    return tokenizer(text, truncation=True, max_length=512, padding="max_length")

# Tokenize the datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove original columns
tokenized_dataset = tokenized_dataset.remove_columns(["Sentiment", "Text Tweet"])

# Memastikan panjang setiap batch dengan menambah pad token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# Define evaluation function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

# Objek konfigurasi untuk LoRA dari Library Python PEFT(Parameter-Efficient Fine-Tuning)
peft_config = LoraConfig(
    task_type="SEQ_CLS", # Fungsi tugas yaitu Sequence Classification
    r=4, # Menentukan rank dimensi matriks untuk ukuran parameter tambahan yang dilatih, semakin kecil semakin kecil memorinya
    lora_alpha=32, # Faktor skala matriks mengatur kontribusi lora terhadap model (32 = cukup tinggi)
    lora_dropout=0.01, # dibuat 1% agar mengurangi overfitting
    target_modules=['gate_proj', 'up_proj', 'down_proj', 'score.base_layer']
)

# Apply the LoRA configuration to the model
model = get_peft_model(model, peft_config)

# Print trainable parameters to verify
model.print_trainable_parameters()

# Hyperparameters
lr = 1e-5
batch_size = 4
num_epochs = 3

# Memastikan untuk menggunakan GPU (Compute Unified Device Architecture)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training arguments
output_dir = "C:/Users/Komodo/Documents/Komodo-7b/training_final_finetuned_2000_v1"
training_args = TrainingArguments(
    output_dir=output_dir, # Hasil dari model yang sudah train
    learning_rate=lr, # Kecepatan belajar dari model llm, dibuat kecil agar stabil nanti bisa mengganggu training loo dan validation loss
    per_device_train_batch_size=batch_size, # Mengatur berapa banyak sanmpel yang di train setiap kalinya agar lebih cepat, nilai 4 agar tidak banyak membebeani GPU
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 6/6 [03:48<00:00, 38.03s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at Yellow-AI-NLP/komodo-7b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


token pad added


Map: 100%|██████████| 1599/1599 [00:00<00:00, 1767.19 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 8771.07 examples/s]


trainable params: 5,812,224 || all params: 6,625,492,992 || trainable%: 0.0877


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.481794,{'accuracy': 0.8175}
2,0.791300,0.43335,{'accuracy': 0.9025}
3,0.244800,0.395311,{'accuracy': 0.91}




TrainOutput(global_step=1200, training_loss=0.4640062681833903, metrics={'train_runtime': 36904.3726, 'train_samples_per_second': 0.13, 'train_steps_per_second': 0.033, 'total_flos': 9.552265766712115e+16, 'train_loss': 0.4640062681833903, 'epoch': 3.0})

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

text_list = ["Biasa saja sih","Gausah ditonton gabener banget","Koten ga guna","Bego ni yang buat video","Konten ga lengkap"]

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)  # Ensure inputs are on the same device as the model

    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])


Trained model predictions:
--------------------------
Biasa saja sih - Neutral
Gausah ditonton gabener banget - Negative
Koten ga guna - Negative
Bego ni yang buat video - Negative
Konten ga lengkap - Negative


In [4]:
from sklearn.metrics import confusion_matrix

# Step 1: Get predictions from the model on the validation set
predictions = trainer.predict(tokenized_dataset["validation"])

# Step 2: Extract predicted labels and true labels
preds = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Step 3: Calculate the confusion matrix
conf_matrix = confusion_matrix(true_labels, preds)

# Step 4: Print or save the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[129   8   1]
 [ 11 101   3]
 [  4   9 134]]


In [5]:
from sklearn.metrics import classification_report

# Assuming `true_labels` and `preds` from the previous prediction code
print("Classification Report:")
print(classification_report(true_labels, preds, target_names=["Negative", "Neutral", "Positive"]))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.93      0.91       138
     Neutral       0.86      0.88      0.87       115
    Positive       0.97      0.91      0.94       147

    accuracy                           0.91       400
   macro avg       0.91      0.91      0.91       400
weighted avg       0.91      0.91      0.91       400

