In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import (
    classification_report,
    multilabel_confusion_matrix,
    hamming_loss,
    jaccard_score,
    accuracy_score,
    precision_recall_fscore_support
)
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer
)


In [None]:
#preprocess imdb dataset

def preprocess_imdb(
    input_csv: str,
    output_csv: str = "imdb_overview_genres_clean.csv",
    sample_size: int = 5000,
    random_state: int = 42
):
    """
    Clean the IMDb CSV down to an (overview, genres) table and save it.

    Steps:
    - Keep only the "Overview" and "Generes" columns (renamed to
      "overview" and "genres").
    - Drop rows with missing or blank values.
    - Normalize genres to the pipe-separated form "A|B|C". The raw column may
      hold either a plain comma-separated string ("Drama, Action") or a
      stringified Python list ("['Drama', 'Action']"); list brackets and
      quotes are stripped so every label is a bare genre name.
    - Drop rows whose genre list is empty after normalization.
    - Optionally draw a reproducible random sample.
    - Write the result to ``output_csv``.

    Args:
        input_csv: Path of the raw CSV file.
        output_csv: Path the cleaned CSV is written to.
        sample_size: Number of rows to sample; ``None`` (or a value not
            smaller than the number of clean rows) keeps every row.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        The cleaned (and possibly sampled) DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: If the input lacks the "Overview" or "Generes" column.
    """

    def _normalize_genres(raw) -> str:
        # Strip whitespace plus list/quote residue from every token so that
        # "['Comedy'" and "'Comedy']" both collapse to "Comedy".
        tokens = (part.strip(" \t\r\n[]'\"") for part in str(raw).split(","))
        return "|".join(token for token in tokens if token)

    # Load dataset
    raw_df = pd.read_csv(input_csv)

    # Select relevant columns (copy so the raw frame is never mutated)
    df = raw_df[["Overview", "Generes"]].copy()
    df.columns = ["overview", "genres"]

    # Drop NaN
    df = df.dropna()

    # Remove blank strings; astype(str) keeps .str usable on non-object columns
    has_overview = df["overview"].astype(str).str.strip() != ""
    has_genres = df["genres"].astype(str).str.strip() != ""
    df = df[has_overview & has_genres].copy()

    # Normalize genre format: "Drama, Action" → "Drama|Action"
    df["genres"] = df["genres"].apply(_normalize_genres)

    # A value such as "[]" normalizes to "", i.e. no usable label
    df = df[df["genres"] != ""]

    # Reset index before sampling
    df = df.reset_index(drop=True)

    # Random sampling
    if sample_size is not None and sample_size < len(df):
        df = df.sample(
            n=sample_size,
            random_state=random_state
        ).reset_index(drop=True)

    # Save cleaned dataset
    df.to_csv(output_csv, index=False)

    print("Preprocessing selesai.")
    print(f"Total data akhir: {len(df)}")
    print(f"File disimpan di: {output_csv}")

    return df
# Locations of the raw IMDb dump and of the cleaned file produced from it
INPUT_PATH = "25k IMDb movie Dataset.csv"
OUTPUT_PATH = "imdb_overview_genres_clean.csv"

# Run the cleaning step; the returned frame is displayed as the cell output
preprocess_imdb(INPUT_PATH, OUTPUT_PATH, sample_size=5000)


Preprocessing selesai.
Total data akhir: 5000
File disimpan di: imdb_overview_genres_clean.csv


Unnamed: 0,overview,genres
0,The lives of two strangers and their young chi...,['Comedy'|'Drama'|'Romance']
1,A young gangster and his uncle attempt to outw...,['Action']
2,Two best friends set out to rescue their pal a...,['Animation'|'Adventure'|'Comedy']
3,An escort-service owner uses his girls to get ...,['Biography'|'Drama']
4,An F.B.I. Agent goes undercover to catch a gan...,['Action'|'Crime'|'Thriller']
...,...,...
4995,A drug kingpin is released from prison and see...,['Crime'|'Thriller']
4996,"As her world is shrouded in darkness, a young ...",['Animation'|'Adventure'|'Fantasy']
4997,The suite of the New Adventures of Aladin.,['Comedy'|'Fantasy']
4998,Young Jim Hawkins is torn between his loyalty ...,['Adventure'|'Family']


In [None]:
# Load cleaned dataset
DATA_PATH = "imdb_overview_genres_clean.csv"

df = pd.read_csv(DATA_PATH)

# Split the pipe-separated genre string into a list of labels. Each token is
# stripped of whitespace and of list/quote residue ("['Comedy'" -> "Comedy"),
# so one genre maps to one label instead of several spurious variants such as
# "'Drama'", "'Drama']" and "['Drama'".
df["genres"] = df["genres"].apply(
    lambda cell: [
        token
        for token in (part.strip(" \t[]'\"") for part in str(cell).split("|"))
        if token
    ]
)


In [None]:
# Encode each movie's genre list as a multi-hot row (one column per genre)
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df["genres"])

# Size of the label space = number of classes learned by the binarizer
NUM_LABELS = len(mlb.classes_)
print("Jumlah genre:", NUM_LABELS)
print("Genre:", mlb.classes_)


Jumlah genre: 71
Genre: ["'Action'" "'Action']" "'Adventure'" "'Adventure']" "'Biography'"
 "'Biography']" "'Comedy'" "'Comedy']" "'Crime'" "'Crime']" "'Drama'"
 "'Drama']" "'Family'" "'Family']" "'Fantasy'" "'Fantasy']" "'Film-Noir'"
 "'Film-Noir']" "'Game-Show']" "'History'" "'History']" "'Horror'"
 "'Horror']" "'Music'" "'Music']" "'Musical'" "'Musical']" "'Mystery'"
 "'Mystery']" "'Reality-TV'" "'Romance'" "'Romance']" "'Sci-Fi'"
 "'Sci-Fi']" "'Sport'" "'Sport']" "'Thriller'" "'Thriller']" "'War'"
 "'War']" "'Western']" "['Action'" "['Action']" "['Adventure'"
 "['Adventure']" "['Animation'" "['Animation']" "['Biography'" "['Comedy'"
 "['Comedy']" "['Crime'" "['Crime']" "['Drama'" "['Drama']" "['Family'"
 "['Fantasy'" "['Film-Noir'" "['History'" "['Horror'" "['Horror']"
 "['Musical'" "['Mystery'" "['Romance'" "['Romance']" "['Sci-Fi'"
 "['Sci-Fi']" "['Thriller'" "['Thriller']" "['War'" "['War']"
 "['Western']"]


In [None]:
# Step 1: hold out 30% of the samples; the remaining 70% is the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    df["overview"],
    labels,
    test_size=0.3,
    random_state=42,
)

# Step 2: cut the held-out part in half -> validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Checkpoint shared by the tokenizer and the backbone model
MODEL_NAME = "Qwen/Qwen3-0.6B"

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Dataset preparation
def tokenize(batch):
    """Tokenize ``batch["text"]``, truncating and padding every sample to 256 tokens."""
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(batch["text"], **encode_options)

def build_dataset(texts, labels):
    """Build a tokenized ``Dataset`` from texts and their label rows.

    ``texts`` must offer ``.tolist()``; ``labels`` is stored as given under
    the "labels" column. Tokenization is applied in batches via ``tokenize``.
    """
    columns = {"text": texts.tolist(), "labels": labels}
    raw_ds = Dataset.from_dict(columns)
    return raw_ds.map(tokenize, batched=True)

# Tokenized datasets for the three splits, built in train / val / test order
train_ds, val_ds, test_ds = (
    build_dataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [None]:
# Model definition
class QwenForMultiLabelClassification(nn.Module):
    """Backbone encoder plus a linear head for multi-label classification.

    The sequence representation is the hidden state of the last *attended*
    token of each sample, which a linear layer maps to one logit per label.

    Args:
        base_model: Module called as ``base_model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state`` and whose ``config``
            exposes ``hidden_size``.
        num_labels: Number of output logits (one per label).
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.classifier = nn.Linear(
            base_model.config.hidden_size,
            num_labels
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute logits and, when ``labels`` is given, the BCE-with-logits loss.

        Args:
            input_ids: Token ids, shape (batch, seq_len).
            attention_mask: Optional mask of the same shape, non-zero on real
                tokens. When omitted, the final position of every row is pooled.
            labels: Optional multi-hot targets, shape (batch, num_labels).

        Returns:
            dict with "loss" (scalar tensor or ``None``) and "logits"
            (shape (batch, num_labels)).
        """
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_states = outputs.last_hidden_state

        if attention_mask is None:
            pooled = hidden_states[:, -1, :]
        else:
            # With inputs padded to a fixed length, position -1 is a padding
            # slot whenever padding is on the right. Pick the last position
            # whose mask is non-zero instead; weighting the mask by
            # (position + 1) makes argmax return that position for both
            # left- and right-padded batches.
            mask = attention_mask.to(hidden_states.device).long()
            positions = torch.arange(
                hidden_states.size(1), device=hidden_states.device
            )
            last_token_idx = (mask * (positions + 1)).argmax(dim=1)
            batch_idx = torch.arange(
                hidden_states.size(0), device=hidden_states.device
            )
            pooled = hidden_states[batch_idx, last_token_idx]

        # Match the head's dtype in case the backbone runs in reduced precision
        logits = self.classifier(pooled.to(self.classifier.weight.dtype))

        loss = None
        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels.float())

        return {"loss": loss, "logits": logits}


In [None]:
# LoRA adapter settings: rank-8 updates on the q_proj / v_proj modules
# NOTE(review): the backbone is loaded with AutoModel (no classification head);
# confirm that task_type="SEQ_CLS" is the intended PEFT task type for it.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Load the pretrained backbone and wrap it with the LoRA adapters
base_model = get_peft_model(
    AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True),
    lora_config,
)

# Attach the multi-label classification head on top of the adapted backbone
model = QwenForMultiLabelClassification(base_model, NUM_LABELS)


In [None]:
# Metric computation
def compute_metrics(eval_pred):
    """Compute micro- and macro-averaged F1 for multi-label predictions.

    Args:
        eval_pred: Pair ``(logits, labels)`` of array-likes with matching
            shapes; ``labels`` holds the multi-hot targets.

    Returns:
        dict with keys "f1_micro" and "f1_macro".
    """
    logits, labels = eval_pred

    # A label is predicted when its sigmoid probability exceeds 0.5
    probs = torch.sigmoid(torch.from_numpy(np.asarray(logits))).numpy()
    preds = (probs > 0.5).astype(int)
    targets = np.asarray(labels).astype(int)

    # zero_division=0 yields the same score as the default ("warn" also scores
    # an undefined F1 as 0) but without emitting an UndefinedMetricWarning on
    # every evaluation for labels that are never predicted.
    return {
        f"f1_{average}": f1_score(
            targets, preds, average=average, zero_division=0
        )
        for average in ("micro", "macro")
    }


In [None]:
# Training arguments
# A single flag drives both device choice and mixed precision. The previous
# combination of fp16=torch.cuda.is_available() with no_cuda=True requested
# fp16 on the CPU whenever a GPU was present; fp16 mixed precision needs an
# accelerator. `use_cpu` also replaces the deprecated `no_cuda` argument.
USE_CUDA = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./qwen3_genre_results",
    eval_strategy="epoch",
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    fp16=USE_CUDA,                  # mixed precision only when a GPU is used
    report_to="none",
    use_cpu=not USE_CUDA            # stay on CPU when no CUDA device exists
)




In [None]:
# Wire the model, arguments, datasets and metric callback into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

# Fine-tune; the returned TrainOutput is displayed as the cell output
trainer.train()


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro
1,No log,0.112492,0.105769,0.021086
2,0.137300,0.105084,0.185315,0.05213
3,0.100200,0.104207,0.232179,0.080696
4,0.089500,0.104996,0.244409,0.093174


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


TrainOutput(global_step=1752, training_loss=0.1056050459543864, metrics={'train_runtime': 57051.8715, 'train_samples_per_second': 0.245, 'train_steps_per_second': 0.031, 'total_flos': 0.0, 'train_loss': 0.1056050459543864, 'epoch': 4.0})

In [None]:
# Score the held-out test split with the trained model
test_results = trainer.evaluate(eval_dataset=test_ds)
print(test_results)

{'eval_loss': 0.10354158282279968, 'eval_f1_micro': 0.23468161794152984, 'eval_f1_macro': 0.0743633273050382, 'eval_runtime': 1079.4795, 'eval_samples_per_second': 0.695, 'eval_steps_per_second': 0.087, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# SAVE TRAINED MODEL AND ARTIFACTS
# Uses `os` and `pickle`, which are imported in the notebook's import cell.

# Output locations, defined once so code and status messages cannot drift apart
SAVE_DIR = "./saved_model"
CLASSIFIER_HEAD_PATH = f"{SAVE_DIR}/classifier_head.pt"
TOKENIZER_DIR = "./saved_tokenizer"
MLB_PATH = "mlb.pkl"
CONFIG_PATH = "model_config.pkl"

# Create directory for saved model
os.makedirs(SAVE_DIR, exist_ok=True)

# 1. Save the LoRA adapter (base_model)
model.base_model.save_pretrained(SAVE_DIR)
print(f"✓ LoRA adapter saved to '{SAVE_DIR}'")

# 2. Save the classifier head separately (it is not part of the adapter)
classifier_state = {
    'classifier': model.classifier.state_dict()
}
torch.save(classifier_state, CLASSIFIER_HEAD_PATH)
print(f"✓ Classifier head saved to '{CLASSIFIER_HEAD_PATH}'")

# 3. Save the tokenizer
tokenizer.save_pretrained(TOKENIZER_DIR)
print(f"✓ Tokenizer saved to '{TOKENIZER_DIR}'")

# 4. Save the MultiLabelBinarizer (needed to map logits back to genre names)
# NOTE: pickle files must only ever be loaded from trusted sources.
with open(MLB_PATH, "wb") as f:
    pickle.dump(mlb, f)
print(f"✓ MultiLabelBinarizer saved to '{MLB_PATH}'")

# 5. Save model configuration info
config_info = {
    "num_labels": NUM_LABELS,
    "model_name": MODEL_NAME,
    "genres": mlb.classes_.tolist()
}

with open(CONFIG_PATH, "wb") as f:
    pickle.dump(config_info, f)
print(f"✓ Model config saved to '{CONFIG_PATH}'")

print("\n" + "="*60)
print("ALL ARTIFACTS SAVED SUCCESSFULLY!")
print("="*60)
print("\nSaved files:")
print("  - ./saved_model/ (LoRA adapter + classifier head)")
print("  - ./saved_tokenizer/ (tokenizer)")
print("  - mlb.pkl (MultiLabelBinarizer)")
print("  - model_config.pkl (configuration)")

âœ“ LoRA adapter saved to './saved_model'
âœ“ Classifier head saved to './saved_model/classifier_head.pt'
âœ“ Tokenizer saved to './saved_tokenizer'
âœ“ MultiLabelBinarizer saved to 'mlb.pkl'
âœ“ Model config saved to 'model_config.pkl'

ALL ARTIFACTS SAVED SUCCESSFULLY!

Saved files:
  - ./saved_model/ (LoRA adapter + classifier head)
  - ./saved_tokenizer/ (tokenizer)
  - mlb.pkl (MultiLabelBinarizer)
  - model_config.pkl (configuration)
