In [None]:
!pip install transformers
!pip install pandas
!pip install scikit-learn
!pip install imbalanced-learn
!pip install emoji
!pip install google-play-scraper

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1
Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import emoji

# Load data
df = pd.read_csv("line_messenger_reviews.csv")
print("Data shape:", df.shape)
print(df.head())

# Tampilkan value counts untuk setiap topik
print("\nDistribusi Topic 1 (Pengalaman Umum Penggunaan LINE):")
print(df['Topic 1_Pengalaman_Umum_Penggunaan_LINE'].value_counts())
print("\nDistribusi Topic 2 (Fitur Tambahan):")
print(df['Topic_2_Fitur_Tambahan'].value_counts())
print("\nDistribusi Topic 3 (Login dan Registrasi Akun):")
print(df['Topic_3_Login_dan_Registrasi_Akun'].value_counts())

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # 1. lowercase
    text = text.lower()
    # 2. hapus URL
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # 3. hapus emoji
    all_chars = list(text)
    text = "".join([c for c in all_chars if not emoji.is_emoji(c)])
    # 4. hapus angka
    text = re.sub(r"\d+", "", text)
    # 5. hapus karakter non-alfabet
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # 6. hapus spasi ganda
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Terapkan cleaning ke kolom 'content'
df['content_clean'] = df['content'].apply(clean_text)

print("Sebelum drop:", df.shape)
# Drop baris yang memiliki nilai null di kolom penting
df = df.dropna(subset=['content_clean', 'Topic 1_Pengalaman_Umum_Penggunaan_LINE',
                       'Topic_2_Fitur_Tambahan', 'Topic_3_Login_dan_Registrasi_Akun'])
print("Sesudah drop:", df.shape)

# Tampilkan daftar unik untuk setiap topik
print("\nUnique values Topic 1:")
print(repr(df['Topic 1_Pengalaman_Umum_Penggunaan_LINE'].unique()))
print("\nUnique values Topic 2:")
print(repr(df['Topic_2_Fitur_Tambahan'].unique()))
print("\nUnique values Topic 3:")
print(repr(df['Topic_3_Login_dan_Registrasi_Akun'].unique()))

# Label encoding untuk setiap topik
le_topic1 = LabelEncoder()
le_topic2 = LabelEncoder()
le_topic3 = LabelEncoder()

df['label_topic1'] = le_topic1.fit_transform(df['Topic 1_Pengalaman_Umum_Penggunaan_LINE'])
df['label_topic2'] = le_topic2.fit_transform(df['Topic_2_Fitur_Tambahan'])
df['label_topic3'] = le_topic3.fit_transform(df['Topic_3_Login_dan_Registrasi_Akun'])

# Tampilkan mapping untuk setiap topik
print("\nMapping Topic 1 (Pengalaman Umum Penggunaan LINE):")
print(dict(zip(le_topic1.classes_, le_topic1.transform(le_topic1.classes_))))
print("\nMapping Topic 2 (Fitur Tambahan):")
print(dict(zip(le_topic2.classes_, le_topic2.transform(le_topic2.classes_))))
print("\nMapping Topic 3 (Login dan Registrasi Akun):")
print(dict(zip(le_topic3.classes_, le_topic3.transform(le_topic3.classes_))))


Data shape: (999, 4)
                                             content  \
0                 tidak jelas tidak bisa nambah temn   
1                        dipersulit buat login doang   
2  "Error yang tidak diketahui". Tolong lah yang ...   
3                                          gak jelas   
4  KEMBALIKAN LINE YG DULU DEVELOPER GIMANA SIH I...   

  Topic 1_Pengalaman_Umum_Penggunaan_LINE Topic_2_Fitur_Tambahan  \
0                                 Negatif                 Netral   
1                                  Netral                 Netral   
2                                 Negatif                 Netral   
3                                 Negatif                 Netral   
4                                 Negatif                 Netral   

  Topic_3_Login_dan_Registrasi_Akun  
0                            Netral  
1                           Negatif  
2                            Netral  
3                            Netral  
4                            Netral  

Dist

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import torch
import pandas as pd

# 1. Ekstrak teks dan label untuk setiap topik
print("Total data:", len(df))
print("Distribusi Topic 1:", pd.Series(df['label_topic1']).value_counts().to_dict())
print("Distribusi Topic 2:", pd.Series(df['label_topic2']).value_counts().to_dict())
print("Distribusi Topic 3:", pd.Series(df['label_topic3']).value_counts().to_dict())

# 2. Split 7:1:2 per aspek
splits = {}
for i in [1, 2, 3]:
    texts = df['content_clean'].tolist()
    labels = df[f'label_topic{i}'].tolist()

    print(f"\n=== TOPIC {i} SPLIT ===")
    print(f"Topic {i} - Original distribution: {pd.Series(labels).value_counts().to_dict()}")

    # Cek apakah bisa stratify (minimal 2 sampel per kelas)
    label_counts = pd.Series(labels).value_counts()
    can_stratify_first = all(count >= 2 for count in label_counts.values)

    # 70% train / 30% temp
    if can_stratify_first:
        X_train, X_temp, y_train, y_temp = train_test_split(
            texts, labels,
            test_size=0.30,
            random_state=42,
            stratify=labels
        )
    else:
        print(f"Topic {i}: Cannot stratify first split (some classes < 2 samples)")
        X_train, X_temp, y_train, y_temp = train_test_split(
            texts, labels,
            test_size=0.30,
            random_state=42
        )

    # Cek apakah bisa stratify untuk split kedua
    temp_label_counts = pd.Series(y_temp).value_counts()
    can_stratify_second = all(count >= 2 for count in temp_label_counts.values)

    # dari temp (30%): 1/3 → val (10%), 2/3 → test (20%)
    if can_stratify_second:
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp,
            test_size=2/3,
            random_state=42,
            stratify=y_temp
        )
    else:
        print(f"Topic {i}: Cannot stratify second split (some classes < 2 samples in temp)")
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp,
            test_size=2/3,
            random_state=42
        )

    splits[f't{i}'] = {
        'X_train': X_train, 'y_train': y_train,
        'X_val': X_val, 'y_val': y_val,
        'X_test': X_test, 'y_test': y_test,
    }

    print(f"Topic {i} - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
    print(f"Topic {i} - Train labels: {pd.Series(y_train).value_counts().to_dict()}")
    print(f"Topic {i} - Val labels: {pd.Series(y_val).value_counts().to_dict()}")
    print(f"Topic {i} - Test labels: {pd.Series(y_test).value_counts().to_dict()}")

Total data: 960
Distribusi Topic 1: {0: 432, 1: 412, 2: 116}
Distribusi Topic 2: {1: 780, 0: 177, 2: 3}
Distribusi Topic 3: {1: 605, 0: 354, 2: 1}

=== TOPIC 1 SPLIT ===
Topic 1 - Original distribution: {0: 432, 1: 412, 2: 116}
Topic 1 - Train: 672, Val: 96, Test: 192
Topic 1 - Train labels: {0: 303, 1: 288, 2: 81}
Topic 1 - Val labels: {0: 43, 1: 41, 2: 12}
Topic 1 - Test labels: {0: 86, 1: 83, 2: 23}

=== TOPIC 2 SPLIT ===
Topic 2 - Original distribution: {1: 780, 0: 177, 2: 3}
Topic 2: Cannot stratify second split (some classes < 2 samples in temp)
Topic 2 - Train: 672, Val: 96, Test: 192
Topic 2 - Train labels: {1: 546, 0: 124, 2: 2}
Topic 2 - Val labels: {1: 76, 0: 20}
Topic 2 - Test labels: {1: 158, 0: 33, 2: 1}

=== TOPIC 3 SPLIT ===
Topic 3 - Original distribution: {1: 605, 0: 354, 2: 1}
Topic 3: Cannot stratify first split (some classes < 2 samples)
Topic 3 - Train: 672, Val: 96, Test: 192
Topic 3 - Train labels: {1: 420, 0: 251, 2: 1}
Topic 3 - Val labels: {1: 62, 0: 34}
Topi

In [None]:
# 3. IndobertDataset definition
MODEL_NAME = "indolem/indobert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class IndobertDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        enc = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].flatten(),
            'attention_mask': enc['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 4. Instantiate dataset untuk tiap aspek
print("\n" + "="*50)
print("MEMBUAT DATASET UNTUK SETIAP TOPIK")
print("="*50)

# Topic 1 datasets
train_dataset_t1 = IndobertDataset(texts=splits['t1']['X_train'],
                                   labels=splits['t1']['y_train'],
                                   tokenizer=tokenizer)
val_dataset_t1 = IndobertDataset(texts=splits['t1']['X_val'],
                                 labels=splits['t1']['y_val'],
                                 tokenizer=tokenizer)
test_dataset_t1 = IndobertDataset(texts=splits['t1']['X_test'],
                                  labels=splits['t1']['y_test'],
                                  tokenizer=tokenizer)

# Topic 2 datasets
train_dataset_t2 = IndobertDataset(splits['t2']['X_train'], splits['t2']['y_train'], tokenizer)
val_dataset_t2 = IndobertDataset(splits['t2']['X_val'], splits['t2']['y_val'], tokenizer)
test_dataset_t2 = IndobertDataset(splits['t2']['X_test'], splits['t2']['y_test'], tokenizer)

# Topic 3 datasets
train_dataset_t3 = IndobertDataset(splits['t3']['X_train'], splits['t3']['y_train'], tokenizer)
val_dataset_t3 = IndobertDataset(splits['t3']['X_val'], splits['t3']['y_val'], tokenizer)
test_dataset_t3 = IndobertDataset(splits['t3']['X_test'], splits['t3']['y_test'], tokenizer)

print(f"Dataset Topic 1 - Train: {len(train_dataset_t1)}, Val: {len(val_dataset_t1)}, Test: {len(test_dataset_t1)}")
print(f"Dataset Topic 2 - Train: {len(train_dataset_t2)}, Val: {len(val_dataset_t2)}, Test: {len(test_dataset_t2)}")
print(f"Dataset Topic 3 - Train: {len(train_dataset_t3)}, Val: {len(val_dataset_t3)}, Test: {len(test_dataset_t3)}")

# 5. Ringkasan untuk referensi
datasets = {
    'topic1': {
        'train': train_dataset_t1,
        'val': val_dataset_t1,
        'test': test_dataset_t1
    },
    'topic2': {
        'train': train_dataset_t2,
        'val': val_dataset_t2,
        'test': test_dataset_t2
    },
    'topic3': {
        'train': train_dataset_t3,
        'val': val_dataset_t3,
        'test': test_dataset_t3
    }
}


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


MEMBUAT DATASET UNTUK SETIAP TOPIK
Dataset Topic 1 - Train: 672, Val: 96, Test: 192
Dataset Topic 2 - Train: 672, Val: 96, Test: 192
Dataset Topic 3 - Train: 672, Val: 96, Test: 192


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import os


In [None]:
# ===================================================================
# FUNGSI COMPUTE METRICS (TANPA ACCURACY)
# ===================================================================

def compute_metrics(p):
    """
    Fungsi untuk menghitung metrik: precision, recall, f1-score, roc-auc
    """
    preds = p.predictions
    labels = p.label_ids

    # Get predicted classes
    pred_classes = np.argmax(preds, axis=1)

    # Precision, Recall, F1-Score
    prec, rec, f1, _ = precision_recall_fscore_support(labels, pred_classes, average='weighted', zero_division=0)

    # Coherence calculation (simplified as consistency metric)
    # Coherence di sini dihitung sebagai konsistensi prediksi dengan confidence
    max_probs = np.max(preds, axis=1)
    coherence = np.mean(max_probs)  # Average confidence as coherence proxy

    return {
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'coherence': coherence
    }

In [None]:
# ===================================================================
# TRAINING TOPIC 1 - PENGALAMAN UMUM PENGGUNAAN LINE
# ===================================================================

print("="*70)
print("TRAINING MODEL TOPIC 1: PENGALAMAN UMUM PENGGUNAAN LINE")
print("="*70)

# Hitung jumlah label unik untuk Topic 1
labels_t1 = [train_dataset_t1[i]['labels'].item() for i in range(len(train_dataset_t1))]
num_labels_t1 = len(set(labels_t1))
print(f"Jumlah label Topic 1: {num_labels_t1}")

# Load model untuk Topic 1
model_topic1 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels_t1,
    problem_type="single_label_classification"
)
model_topic1.to("cuda" if torch.cuda.is_available() else "cpu")

# Training arguments untuk Topic 1
training_args_topic1 = TrainingArguments(
    output_dir="/content/indobert_topic1_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=100,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/content/logs_topic1",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

# Trainer untuk Topic 1
trainer_topic1 = Trainer(
    model=model_topic1,
    args=training_args_topic1,
    train_dataset=train_dataset_t1,
    eval_dataset=val_dataset_t1,
    compute_metrics=compute_metrics
)

# Training Topic 1
print("Memulai training Topic 1...")
trainer_topic1.train()

TRAINING MODEL TOPIC 1: PENGALAMAN UMUM PENGGUNAAN LINE
Jumlah label Topic 1: 3


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memulai training Topic 1...




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mqiyaanugrah75[0m ([33mqiyaanugrah75-universitas-amikom-yogyakarta[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Coherence
1,No log,0.934828,0.409505,0.479167,0.42754,0.684114
2,No log,0.892615,0.491694,0.5625,0.523746,0.886854
3,No log,0.741542,0.735218,0.71875,0.714876,1.190309



=== EVALUASI TOPIC 1 ===


KeyError: 'eval_roc_auc'

In [None]:
# Evaluasi Topic 1
print("\n=== EVALUASI TOPIC 1 ===")
# 1. Evaluate on training set
train_eval_t1 = trainer_topic1.evaluate(train_dataset_t1)
# 2. Evaluate on test set
test_pred_t1 = trainer_topic1.predict(test_dataset_t1)
test_eval_t1 = compute_metrics(test_pred_t1)

# 3. Hasil metrik Topic 1
df_metrics_t1 = pd.DataFrame([
    {
        'split': 'train',
        'precision': train_eval_t1['eval_precision'],
        'recall': train_eval_t1['eval_recall'],
        'f1': train_eval_t1['eval_f1'],
        'coherence': train_eval_t1['eval_coherence']
    },
    {
        'split': 'test',
        'precision': test_eval_t1['precision'],
        'recall': test_eval_t1['recall'],
        'f1': test_eval_t1['f1'],
        'coherence': test_eval_t1['coherence']
    }
])

print("Hasil Evaluasi Topic 1:")
print(df_metrics_t1.round(4))

# Simpan model Topic 1
OUTPUT_DIR_TOPIC1 = "/content/indobert_topic1_model_final"
model_topic1.save_pretrained(OUTPUT_DIR_TOPIC1)
tokenizer.save_pretrained(OUTPUT_DIR_TOPIC1)
print(f"Model Topic 1 disimpan di: {OUTPUT_DIR_TOPIC1}")


=== EVALUASI TOPIC 1 ===


Hasil Evaluasi Topic 1:
   split  precision  recall      f1  coherence
0  train     0.7805  0.7768  0.7768     1.2699
1   test     0.6976  0.6927  0.6917     1.2064
Model Topic 1 disimpan di: /content/indobert_topic1_model_final


In [None]:
print("\n" + "="*70)
print("TRAINING MODEL TOPIC 2: FITUR TAMBAHAN")
print("="*70)

# Hitung jumlah label unik untuk Topic 2
labels_t2 = [train_dataset_t2[i]['labels'].item() for i in range(len(train_dataset_t2))]
num_labels_t2 = len(set(labels_t2))
print(f"Jumlah label Topic 2: {num_labels_t2}")

# Load model untuk Topic 2
model_topic2 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels_t2,
    problem_type="single_label_classification"
)
model_topic2.to("cuda" if torch.cuda.is_available() else "cpu")

# Training arguments untuk Topic 2
training_args_topic2 = TrainingArguments(
    output_dir="/content/indobert_topic2_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/content/logs_topic2",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

# Trainer untuk Topic 2
trainer_topic2 = Trainer(
    model=model_topic2,
    args=training_args_topic2,
    train_dataset=train_dataset_t2,
    eval_dataset=val_dataset_t2,
    compute_metrics=compute_metrics
)

# Training Topic 2
print("Memulai training Topic 2...")
trainer_topic2.train()

# Evaluasi Topic 2
print("\n=== EVALUASI TOPIC 2 ===")
# 1. Evaluate on training set
train_eval_t2 = trainer_topic2.evaluate(train_dataset_t2)
# 2. Evaluate on test set
test_pred_t2 = trainer_topic2.predict(test_dataset_t2)
test_eval_t2 = compute_metrics(test_pred_t2)

# 3. Hasil metrik Topic 2
df_metrics_t2 = pd.DataFrame([
    {
        'split': 'train',
        'precision': train_eval_t2['eval_precision'],
        'recall': train_eval_t2['eval_recall'],
        'f1': train_eval_t2['eval_f1'],
        'coherence': train_eval_t2['eval_coherence']
    },
    {
        'split': 'test',
        'precision': test_eval_t2['precision'],
        'recall': test_eval_t2['recall'],
        'f1': test_eval_t2['f1'],
        'coherence': test_eval_t2['coherence']
    }
])

print("Hasil Evaluasi Topic 2:")
print(df_metrics_t2.round(4))

# Simpan model Topic 2
OUTPUT_DIR_TOPIC2 = "/content/indobert_topic2_model_final"
model_topic2.save_pretrained(OUTPUT_DIR_TOPIC2)
tokenizer.save_pretrained(OUTPUT_DIR_TOPIC2)
print(f"Model Topic 2 disimpan di: {OUTPUT_DIR_TOPIC2}")


TRAINING MODEL TOPIC 2: FITUR TAMBAHAN
Jumlah label Topic 2: 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memulai training Topic 2...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Coherence
1,No log,0.739887,0.625,0.78125,0.694444,0.62902
2,No log,0.554168,0.626736,0.791667,0.699612,1.773801
3,No log,0.48744,0.626736,0.791667,0.699612,2.124637



=== EVALUASI TOPIC 2 ===


Hasil Evaluasi Topic 2:
   split  precision  recall      f1  coherence
0  train     0.6602  0.8125  0.7284     1.7925
1   test     0.6772  0.8229  0.7430     1.8023
Model Topic 2 disimpan di: /content/indobert_topic2_model_final


In [None]:
  # ===================================================================
# TRAINING TOPIC 3 - LOGIN DAN REGISTRASI AKUN
# ===================================================================

print("\n" + "="*70)
print("TRAINING MODEL TOPIC 3: LOGIN DAN REGISTRASI AKUN")
print("="*70)

# Hitung jumlah label unik untuk Topic 3
labels_t3 = [train_dataset_t3[i]['labels'].item() for i in range(len(train_dataset_t3))]
num_labels_t3 = len(set(labels_t3))
print(f"Jumlah label Topic 3: {num_labels_t3}")

# Load model untuk Topic 3
model_topic3 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels_t3,
    problem_type="single_label_classification"
)
model_topic3.to("cuda" if torch.cuda.is_available() else "cpu")

# Training arguments untuk Topic 3
training_args_topic3 = TrainingArguments(
    output_dir="/content/indobert_topic3_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/content/logs_topic3",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=None
)

# Trainer untuk Topic 3
trainer_topic3 = Trainer(
    model=model_topic3,
    args=training_args_topic3,
    train_dataset=train_dataset_t3,
    eval_dataset=val_dataset_t3,
    compute_metrics=compute_metrics
)

# Training Topic 3
print("Memulai training Topic 3...")
trainer_topic3.train()

# Evaluasi Topic 3
print("\n=== EVALUASI TOPIC 3 ===")
# 1. Evaluate on training set
train_eval_t3 = trainer_topic3.evaluate(train_dataset_t3)
# 2. Evaluate on test set
test_pred_t3 = trainer_topic3.predict(test_dataset_t3)
test_eval_t3 = compute_metrics(test_pred_t3)

# 3. Hasil metrik Topic 3
df_metrics_t3 = pd.DataFrame([
    {
        'split': 'train',
        'precision': train_eval_t3['eval_precision'],
        'recall': train_eval_t3['eval_recall'],
        'f1': train_eval_t3['eval_f1'],
        'coherence': train_eval_t3['eval_coherence']
    },
    {
        'split': 'test',
        'precision': test_eval_t3['precision'],
        'recall': test_eval_t3['recall'],
        'f1': test_eval_t3['f1'],
        'coherence': test_eval_t3['coherence']
    }
])

print("Hasil Evaluasi Topic 3:")
print(df_metrics_t3.round(4))

# Simpan model Topic 3
OUTPUT_DIR_TOPIC3 = "/content/indobert_topic3_model_final"
model_topic3.save_pretrained(OUTPUT_DIR_TOPIC3)
tokenizer.save_pretrained(OUTPUT_DIR_TOPIC3)
print(f"Model Topic 3 disimpan di: {OUTPUT_DIR_TOPIC3}")


TRAINING MODEL TOPIC 3: LOGIN DAN REGISTRASI AKUN
Jumlah label Topic 3: 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memulai training Topic 3...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Coherence
1,No log,0.879414,0.59581,0.635417,0.535374,0.566343
2,No log,0.664555,0.757576,0.708333,0.643333,1.281727
3,No log,0.567295,0.794185,0.697917,0.614232,1.816223



=== EVALUASI TOPIC 3 ===


Hasil Evaluasi Topic 3:
   split  precision  recall      f1  coherence
0  train     0.6988  0.6711  0.5997     1.2410
1   test     0.7027  0.6823  0.6114     1.2429
Model Topic 3 disimpan di: /content/indobert_topic3_model_final


In [None]:

import re
import pickle
import os
from typing import List, Union

class TextPreprocessor:
    """
    Text Preprocessor yang bisa disimpan sebagai model untuk deployment
    Mengganti dependency emoji dengan regex pattern sederhana
    """

    def __init__(self):
        # Emoji patterns yang umum (tanpa dependency emoji library)
        self.emoji_patterns = [
            r'[\U0001F600-\U0001F64F]',  # emoticons
            r'[\U0001F300-\U0001F5FF]',  # symbols & pictographs
            r'[\U0001F680-\U0001F6FF]',  # transport & map symbols
            r'[\U0001F1E0-\U0001F1FF]',  # flags (iOS)
            r'[\U00002702-\U000027B0]',  # Dingbats
            r'[\U000024C2-\U0001F251]',  # Enclosed characters
            r'[\U0001F900-\U0001F9FF]',  # Supplemental Symbols and Pictographs
            r'[\U0001FA70-\U0001FAFF]',  # Symbols and Pictographs Extended-A
            r'[\U00002600-\U000026FF]',  # Miscellaneous Symbols
            r'[\U00002700-\U000027BF]',  # Dingbats
        ]
        self.compiled_emoji_pattern = re.compile('|'.join(self.emoji_patterns))

        # Pattern untuk preprocessing lainnya
        self.url_pattern = re.compile(r"http\S+|www\S+|https\S+")
        self.number_pattern = re.compile(r"\d+")
        self.non_alpha_pattern = re.compile(r'[^a-zA-Z0-9\s]')
        self.multiple_space_pattern = re.compile(r"\s+")

    def clean_text(self, text: Union[str, None]) -> str:
        """
        Membersihkan teks dengan langkah-langkah preprocessing
        """
        if not isinstance(text, str) or text is None:
            return ""

        # 1. lowercase
        text = text.lower()

        # 2. hapus URL
        text = self.url_pattern.sub("", text)

        # 3. hapus emoji (tanpa library emoji)
        text = self.compiled_emoji_pattern.sub("", text)

        # 4. hapus angka
        text = self.number_pattern.sub("", text)

        # 5. hapus karakter non-alfabet
        text = self.non_alpha_pattern.sub('', text)

        # 6. hapus spasi ganda
        text = self.multiple_space_pattern.sub(" ", text).strip()

        return text

    def clean_batch(self, texts: List[str]) -> List[str]:
        """
        Membersihkan batch teks sekaligus
        """
        return [self.clean_text(text) for text in texts]

    def save_model(self, filepath: str):
        """
        Simpan preprocessor sebagai pickle file
        """
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
        print(f"TextPreprocessor model disimpan di: {filepath}")

    @staticmethod
    def load_model(filepath: str):
        """
        Load preprocessor dari pickle file
        """
        with open(filepath, 'rb') as f:
            preprocessor = pickle.load(f)
        print(f"TextPreprocessor model dimuat dari: {filepath}")
        return preprocessor

    def __repr__(self):
        return "TextPreprocessor(emoji_free=True, url_free=True, number_free=True)"

# ===================================================================
# FUNGSI UNTUK MEMBUAT DAN MENYIMPAN MODEL PREPROCESSOR
# ===================================================================

def create_and_save_preprocessor(save_path: str = "/content/text_preprocessor.pkl"):
    """
    Membuat dan menyimpan TextPreprocessor model
    """
    print("Membuat TextPreprocessor model...")

    # Buat instance preprocessor
    preprocessor = TextPreprocessor()

    # Test dengan beberapa contoh
    test_texts = [
        "Halo! Ini adalah contoh teks 123 😀 dengan emoji dan angka",
        "Cek website ini: https://www.google.com untuk info lebih lanjut!!!",
        "WOW!!! Aplikasi ini BAGUS BANGET 👍👍👍 rating 5/5 ⭐⭐⭐⭐⭐",
        "gak bisa login nih 😭😭 error terus"
    ]

    print("\n=== TEST PREPROCESSING ===")
    for i, text in enumerate(test_texts, 1):
        cleaned = preprocessor.clean_text(text)
        print(f"Original {i}: {text}")
        print(f"Cleaned {i}:  {cleaned}")
        print()

    # Simpan model
    preprocessor.save_model(save_path)

    return preprocessor

# ===================================================================
# DEMO PENGGUNAAN UNTUK STREAMLIT
# ===================================================================

def demo_streamlit_usage():
    """
    Demo cara penggunaan di Streamlit
    """
    print("=== DEMO PENGGUNAAN DI STREAMLIT ===")

    # Simulasi load model di Streamlit
    preprocessor_path = "/content/text_preprocessor.pkl"

    if os.path.exists(preprocessor_path):
        # Load model
        preprocessor = TextPreprocessor.load_model(preprocessor_path)

        # Contoh input user
        user_input = "Aplikasi LINE bagus banget! 😍😍 tapi fitur call suka error 😤 rating 4/5 ⭐⭐⭐⭐"

        # Preprocessing
        cleaned_input = preprocessor.clean_text(user_input)

        print(f"Input user: {user_input}")
        print(f"Hasil preprocessing: {cleaned_input}")

        return preprocessor, cleaned_input
    else:
        print("Model preprocessor belum dibuat. Jalankan create_and_save_preprocessor() terlebih dahulu.")
        return None, None
