In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, classification_report
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

# 1. Cek ketersediaan PyTorch dan GPU
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))

# 2. Load dataset (ganti dengan path dataset Anda)
# Diasumsikan dataset memiliki kolom 'text' dan 'label'
df = pd.read_csv('train.csv')  # Ganti dengan nama file Anda

# Pastikan label di-encode menjadi numerik
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# 3. Split data: 80% berlabel, 20% tanpa label (simulasi semi-supervised)
df_labeled, df_unlabeled = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)
df_unlabeled = df_unlabeled.copy()
df_unlabeled['label'] = np.nan  # Hapus label untuk data unlabeled

print(f"Data berlabel: {len(df_labeled)} sampel")
print(f"Data tanpa label: {len(df_unlabeled)} sampel")

# 4. Preprocessing dan vektorisasi teks
# Hapus stop_words atau gunakan stop words Indonesia jika tersedia
vectorizer = TfidfVectorizer(max_features=5000)  # stop_words dihapus
X_labeled = vectorizer.fit_transform(df_labeled['text'])
y_labeled = df_labeled['label']

# Split data berlabel menjadi train dan test
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.2, stratify=y_labeled, random_state=42
)

# 5. Latih model baseline (Logistic Regression)
print("\n=== Training Baseline Model ===")
baseline_model = LogisticRegression(max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)

# Hitung metrik - PERBAIKAN DI SINI
# Gunakan probabilitas kelas yang benar untuk RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_proba[range(len(y_test)), y_test]))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Baseline Model Results:")
print(f"RMSE: {rmse:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 6. Persiapan data untuk model PyTorch
class TextClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Konversi data ke format PyTorch
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Buat DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 7. Training model PyTorch
print("\n=== Training PyTorch Model ===")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = X_train.shape[1]
num_classes = len(y_train.unique())
model = TextClassifier(input_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

# 8. Evaluasi model PyTorch
model.eval()
with torch.no_grad():
    X_test_device = X_test_tensor.to(device)
    outputs = model(X_test_device)
    _, y_pred_torch = torch.max(outputs, 1)
    y_pred_proba_torch = torch.softmax(outputs, dim=1).cpu().numpy()

# Hitung metrik - PERBAIKAN DI SINI
rmse_torch = np.sqrt(mean_squared_error(y_test, y_pred_proba_torch[range(len(y_test)), y_test]))
accuracy_torch = accuracy_score(y_test, y_pred_torch.cpu().numpy())
f1_torch = f1_score(y_test, y_pred_torch.cpu().numpy(), average='weighted')

print(f"\nPyTorch Model Results:")
print(f"RMSE: {rmse_torch:.4f}")
print(f"Accuracy: {accuracy_torch:.4f}")
print(f"F1-Score: {f1_torch:.4f}")

# 9. Self-training dengan data unlabeled (semi-supervised learning)
print("\n=== Self-Training dengan Data Unlabeled ===")
# Vektorisasi data unlabeled
X_unlabeled = vectorizer.transform(df_unlabeled['text'])
X_unlabeled_tensor = torch.tensor(X_unlabeled.toarray(), dtype=torch.float32).to(device)

# Prediksi pseudo-label
model.eval()
with torch.no_grad():
    outputs_unlabeled = model(X_unlabeled_tensor)
    _, pseudo_labels = torch.max(outputs_unlabeled, 1)

# Gabungkan data berlabel dan pseudo-label
X_combined = torch.cat([X_train_tensor, X_unlabeled_tensor.cpu()], dim=0)
y_combined = torch.cat([y_train_tensor, pseudo_labels.cpu()], dim=0)

# Latih ulang model dengan data gabungan
combined_dataset = TensorDataset(X_combined, y_combined)
combined_loader = DataLoader(combined_dataset, batch_size=32, shuffle=True)

model2 = TextClassifier(input_size, num_classes).to(device)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model2.train()
    total_loss = 0
    
    for batch_X, batch_y in combined_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer2.zero_grad()
        outputs = model2(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer2.step()
        
        total_loss += loss.item()
    
    print(f'Self-Training Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(combined_loader):.4f}')

# Evaluasi model setelah self-training
model2.eval()
with torch.no_grad():
    X_test_device = X_test_tensor.to(device)
    outputs = model2(X_test_device)
    _, y_pred_combined = torch.max(outputs, 1)
    y_pred_proba_combined = torch.softmax(outputs, dim=1).cpu().numpy()

# Hitung metrik - PERBAIKAN DI SINI
rmse_combined = np.sqrt(mean_squared_error(y_test, y_pred_proba_combined[range(len(y_test)), y_test]))
accuracy_combined = accuracy_score(y_test, y_pred_combined.cpu().numpy())
f1_combined = f1_score(y_test, y_pred_combined.cpu().numpy(), average='weighted')

print(f"\nSelf-Trained Model Results:")
print(f"RMSE: {rmse_combined:.4f}")
print(f"Accuracy: {accuracy_combined:.4f}")
print(f"F1-Score: {f1_combined:.4f}")

# 10. Ringkasan hasil
print("\n=== RINGKASAN HASIL ===")
print(f"{'Model':<20} {'RMSE':<10} {'Accuracy':<10} {'F1-Score':<10}")
print("-" * 50)
print(f"{'Baseline (LR)':<20} {rmse:<10.4f} {accuracy:<10.4f} {f1:<10.4f}")
print(f"{'PyTorch':<20} {rmse_torch:<10.4f} {accuracy_torch:<10.4f} {f1_torch:<10.4f}")
print(f"{'Self-Trained':<20} {rmse_combined:<10.4f} {accuracy_combined:<10.4f} {f1_combined:<10.4f}")

PyTorch version: 2.7.1+cpu
CUDA available: False
Data berlabel: 4000 sampel
Data tanpa label: 1000 sampel

=== Training Baseline Model ===
Baseline Model Results:
RMSE: 3.9331
Accuracy: 0.6212
F1-Score: 0.5938

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.69      0.72       106
           1       0.33      0.14      0.20        29
           2       0.58      0.83      0.68       195
           3       0.48      0.18      0.26        56
           4       0.65      0.22      0.33        49
           5       0.72      0.88      0.79       171
           6       0.51      0.47      0.49       124
           7       0.56      0.41      0.48        70

    accuracy                           0.62       800
   macro avg       0.57      0.48      0.49       800
weighted avg       0.61      0.62      0.59       800


=== Training PyTorch Model ===
Epoch [1/10], Loss: 1.9293
Epoch [2/10], Loss: 1.5231
Epoch [3/10], Loss: 1.1248
Ep

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, classification_report
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

# 1. Cek ketersediaan PyTorch dan GPU
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 2. Load dataset
df = pd.read_csv('train.csv')

# Pastikan label di-encode menjadi numerik
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# 3. Balance dataset - buat semua pilar memiliki jumlah sampel yang sama
print("\n=== Balancing Dataset ===")
min_samples = df['label'].value_counts().min()
print(f"Jumlah sampel terkecil per pilar: {min_samples}")

balanced_dfs = []
for label in df['label'].unique():
    label_df = df[df['label'] == label]
    if len(label_df) > min_samples:
        # Downsample jika lebih banyak
        label_df = resample(label_df, 
                          n_samples=min_samples, 
                          random_state=42, 
                          replace=False)
    balanced_dfs.append(label_df)

df_balanced = pd.concat(balanced_dfs)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Data sebelum balancing: {len(df)} sampel")
print(f"Data setelah balancing: {len(df_balanced)} sampel")
print("Distribusi label setelah balancing:")
print(df_balanced['label'].value_counts())

# 4. Split data balanced
df_labeled, df_unlabeled = train_test_split(
    df_balanced, test_size=0.2, stratify=df_balanced['label'], random_state=42
)
df_unlabeled = df_unlabeled.copy()
df_unlabeled['label'] = np.nan

print(f"\nData berlabel: {len(df_labeled)} sampel")
print(f"Data tanpa label: {len(df_unlabeled)} sampel")

# 5. Hitung class weights
class_counts = df_labeled['label'].value_counts().sort_index()
total_samples = len(df_labeled)
class_weights = total_samples / (len(class_counts) * class_counts)
class_weights_tensor = torch.tensor(class_weights.values, dtype=torch.float32).to(device)

print(f"\nClass weights: {dict(zip(class_counts.index, class_weights))}")

# 6. Persiapan dataset untuk IndoBERT
class IndoBERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 7. Load IndoBERT tokenizer dan model
print("\n=== Loading IndoBERT ===")
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Split data untuk IndoBERT
X_train_text = df_labeled['text']
y_train_text = df_labeled['label']
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    X_train_text, y_train_text, test_size=0.2, stratify=y_train_text, random_state=42
)

# Buat dataset
train_dataset = IndoBERTDataset(X_train_bert, y_train_bert, tokenizer)
test_dataset = IndoBERTDataset(X_test_bert, y_test_bert, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 8. Training IndoBERT dengan manual training loop
print("\n=== Training IndoBERT Model ===")

num_classes = len(df_labeled['label'].unique())
bert_model = BertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_classes
).to(device)

# Optimizer dan scheduler
optimizer = AdamW(bert_model.parameters(), lr=2e-5, weight_decay=0.01)
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training function
def train_bert_model(model, train_loader, optimizer, scheduler, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass
            loss.backward()
            optimizer.step()
            scheduler.step()
            
            if batch_idx % 10 == 0:
                print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}')
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

# Train the model
train_bert_model(bert_model, train_loader, optimizer, scheduler, num_epochs=3)

# 9. Evaluasi IndoBERT
print("\n=== Evaluating IndoBERT ===")
bert_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        predictions = torch.argmax(outputs.logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy_bert = accuracy_score(all_labels, all_predictions)
f1_bert = f1_score(all_labels, all_predictions, average='weighted')

print(f"IndoBERT Results:")
print(f"Accuracy: {accuracy_bert:.4f}")
print(f"F1-Score: {f1_bert:.4f}")
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions, target_names=label_encoder.classes_))

# 10. Self-training dengan IndoBERT untuk data unlabeled
print("\n=== Self-Training dengan IndoBERT ===")

# Function untuk predict probabilities
def predict_proba(texts, model, tokenizer, batch_size=16):
    """Predict probabilities for a list of texts"""
    model.eval()
    all_probs = []
    
    # Create dataset
    class UnlabeledDataset(Dataset):
        def __init__(self, texts, tokenizer, max_length=128):
            self.texts = texts
            self.tokenizer = tokenizer
            self.max_length = max_length
            
        def __len__(self):
            return len(self.texts)
        
        def __getitem__(self, idx):
            text = str(self.texts[idx])
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()
            }
    
    dataset = UnlabeledDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)
            all_probs.extend(probs.cpu().numpy())
    
    return np.array(all_probs)

# Prediksi data unlabeled
unlabeled_texts = df_unlabeled['text'].tolist()
if unlabeled_texts:
    pseudo_probs = predict_proba(unlabeled_texts, bert_model, tokenizer)
    pseudo_labels = np.argmax(pseudo_probs, axis=1)
    
    # Filter pseudo-labels dengan confidence tinggi
    confidence_threshold = 0.8
    high_confidence_mask = pseudo_probs.max(axis=1) >= confidence_threshold
    
    print(f"Data unlabeled: {len(unlabeled_texts)}")
    print(f"Data dengan confidence tinggi: {high_confidence_mask.sum()}")
    
    # Gabungkan dengan data labeled jika ada high confidence samples
    if high_confidence_mask.sum() > 0:
        high_conf_texts = [unlabeled_texts[i] for i in range(len(unlabeled_texts)) if high_confidence_mask[i]]
        high_conf_labels = [pseudo_labels[i] for i in range(len(pseudo_labels)) if high_confidence_mask[i]]
        
        # Buat dataset gabungan
        combined_texts = X_train_text.tolist() + high_conf_texts
        combined_labels = y_train_text.tolist() + high_conf_labels
        
        combined_train_dataset = IndoBERTDataset(
            pd.Series(combined_texts), 
            pd.Series(combined_labels), 
            tokenizer
        )
        
        combined_train_loader = DataLoader(combined_train_dataset, batch_size=16, shuffle=True)
        
        # Train ulang dengan data gabungan
        bert_model_combined = BertForSequenceClassification.from_pretrained(
            model_name, 
            num_labels=num_classes
        ).to(device)
        
        optimizer_combined = AdamW(bert_model_combined.parameters(), lr=2e-5, weight_decay=0.01)
        total_steps_combined = len(combined_train_loader) * 2  # 2 epochs
        scheduler_combined = get_linear_schedule_with_warmup(
            optimizer_combined, 
            num_warmup_steps=0,
            num_training_steps=total_steps_combined
        )
        
        # Train combined model
        train_bert_model(bert_model_combined, combined_train_loader, optimizer_combined, scheduler_combined, num_epochs=2)
        
        # Evaluasi model combined
        bert_model_combined.eval()
        all_predictions_combined = []
        all_labels_combined = []
        
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = bert_model_combined(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                
                predictions = torch.argmax(outputs.logits, dim=1)
                all_predictions_combined.extend(predictions.cpu().numpy())
                all_labels_combined.extend(labels.cpu().numpy())
        
        accuracy_combined = accuracy_score(all_labels_combined, all_predictions_combined)
        f1_combined = f1_score(all_labels_combined, all_predictions_combined, average='weighted')
        
        print(f"\nIndoBERT After Self-Training:")
        print(f"Accuracy: {accuracy_combined:.4f}")
        print(f"F1-Score: {f1_combined:.4f}")

# 11. Ringkasan hasil
print("\n=== RINGKASAN HASIL ===")
print(f"{'Model':<25} {'Accuracy':<10} {'F1-Score':<10}")
print("-" * 45)
print(f"{'IndoBERT':<25} {accuracy_bert:<10.4f} {f1_bert:<10.4f}")

if 'accuracy_combined' in locals():
    print(f"{'IndoBERT + Self-Training':<25} {accuracy_combined:<10.4f} {f1_combined:<10.4f}")

print(f"\nDataset balance information:")
print(f"Total balanced samples: {len(df_balanced)}")
print(f"Samples per class: {min_samples}")
print(f"Number of classes: {num_classes}")

PyTorch version: 2.7.1+cpu
CUDA available: False

=== Balancing Dataset ===
Jumlah sampel terkecil per pilar: 182
Data sebelum balancing: 5000 sampel
Data setelah balancing: 1456 sampel
Distribusi label setelah balancing:
label
3    182
6    182
4    182
7    182
5    182
1    182
2    182
0    182
Name: count, dtype: int64

Data berlabel: 1164 sampel
Data tanpa label: 292 sampel

Class weights: {0: 0.9965753424657534, 1: 0.9965753424657534, 2: 1.0034482758620689, 3: 1.0034482758620689, 4: 1.0034482758620689, 5: 0.9965753424657534, 6: 1.0034482758620689, 7: 0.9965753424657534}

=== Loading IndoBERT ===

=== Training IndoBERT Model ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Batch 0, Loss: 2.0477
Epoch 1, Batch 10, Loss: 2.1392
Epoch 1, Batch 20, Loss: 2.0174
Epoch 1, Batch 30, Loss: 1.8460
Epoch 1, Batch 40, Loss: 1.7604
Epoch 1, Batch 50, Loss: 1.8105
Epoch 1, Average Loss: 1.9360
Epoch 2, Batch 0, Loss: 1.5338
Epoch 2, Batch 10, Loss: 1.6116
Epoch 2, Batch 20, Loss: 1.4490
Epoch 2, Batch 30, Loss: 1.5346
Epoch 2, Batch 40, Loss: 1.3165
Epoch 2, Batch 50, Loss: 0.8454
Epoch 2, Average Loss: 1.4191
Epoch 3, Batch 0, Loss: 1.3090
Epoch 3, Batch 10, Loss: 0.9628
Epoch 3, Batch 20, Loss: 1.1697
Epoch 3, Batch 30, Loss: 0.9242
Epoch 3, Batch 40, Loss: 1.2696
Epoch 3, Batch 50, Loss: 1.2295
Epoch 3, Average Loss: 1.1425

=== Evaluating IndoBERT ===
IndoBERT Results:
Accuracy: 0.5107
F1-Score: 0.5014

Classification Report:
              precision    recall  f1-score   support

     harmoni       0.61      0.48      0.54        29
  hilirisasi       0.44      0.52      0.48        29
    ideologi       0.54      0.69      0.61        29
   pekerjaan   

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Batch 0, Loss: 2.1331
Epoch 1, Batch 10, Loss: 2.0613
Epoch 1, Batch 20, Loss: 1.9908
Epoch 1, Batch 30, Loss: 1.9802
Epoch 1, Batch 40, Loss: 1.8666
Epoch 1, Batch 50, Loss: 1.7567
Epoch 1, Batch 60, Loss: 1.5886
Epoch 1, Batch 70, Loss: 1.5384
Epoch 1, Average Loss: 1.8772
Epoch 2, Batch 0, Loss: 1.5390
Epoch 2, Batch 10, Loss: 1.4970
Epoch 2, Batch 20, Loss: 1.5992
Epoch 2, Batch 30, Loss: 1.2285
Epoch 2, Batch 40, Loss: 1.3774
Epoch 2, Batch 50, Loss: 1.3646
Epoch 2, Batch 60, Loss: 1.5065
Epoch 2, Batch 70, Loss: 1.4177
Epoch 2, Average Loss: 1.3500

IndoBERT After Self-Training:
Accuracy: 0.6094
F1-Score: 0.6066

=== RINGKASAN HASIL ===
Model                     Accuracy   F1-Score  
---------------------------------------------
IndoBERT                  0.5107     0.5014    
IndoBERT + Self-Training  0.6094     0.6066    

Dataset balance information:
Total balanced samples: 1456
Samples per class: 182
Number of classes: 8


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 2. Load and preprocess data
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    
    # Clean text data
    df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
    df['text'] = df['text'].str.replace(r'\d+', '', regex=True)
    df['text'] = df['text'].str.strip()
    df = df[df['text'].str.len() > 10]  # Remove very short texts
    
    # Encode labels
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])
    
    return df, label_encoder

df, label_encoder = load_and_preprocess_data('train.csv')

# 3. Advanced balancing with augmentation
def balance_dataset(df, target_samples_per_class=2000):
    balanced_dfs = []
    
    for label in df['label'].unique():
        label_df = df[df['label'] == label].copy()
        
        if len(label_df) < target_samples_per_class:
            # Upsample with replacement
            label_df = resample(label_df,
                              n_samples=target_samples_per_class,
                              random_state=42,
                              replace=True,
                              stratify=label_df['label'] if len(label_df) > 1 else None)
        else:
            # Downsample
            label_df = resample(label_df,
                              n_samples=target_samples_per_class,
                              random_state=42,
                              replace=False)
        
        balanced_dfs.append(label_df)
    
    balanced_df = pd.concat(balanced_dfs)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    return balanced_df

balanced_df = balance_dataset(df, target_samples_per_class=2000)
print("Dataset balanced:", balanced_df['label'].value_counts())

# 4. Enhanced IndoBERT Dataset with data augmentation
class EnhancedIndoBERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256, augment=False):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        
        # Simple text augmentation
        if self.augment and np.random.random() > 0.7:
            words = text.split()
            if len(words) > 5:
                # Random swap
                if np.random.random() > 0.5 and len(words) > 2:
                    i, j = np.random.choice(len(words), 2, replace=False)
                    words[i], words[j] = words[j], words[i]
                    text = ' '.join(words)
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 5. Improved training function with early stopping and validation
def train_bert_model_with_validation(model, train_loader, val_loader, optimizer, scheduler, num_epochs=10, patience=3):
    model.train()
    best_val_loss = float('inf')
    patience_counter = 0
    train_losses = []
    val_losses = []
    val_accuracies = []
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_train_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()
        
        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        
        # Validation phase
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                total_val_loss += outputs.loss.item()
                
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_accuracies.append(val_accuracy)
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # Save best model
            torch.save(model.state_dict(), 'best_indobert_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break
    
    return train_losses, val_losses, val_accuracies

# 6. Cross-validation training
def cross_validate_bert(df, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    texts = df['text']
    labels = df['label']
    
    all_accuracies = []
    all_f1_scores = []
    
    model_name = "indobenchmark/indobert-base-p1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
        print(f"\n=== Fold {fold + 1}/{n_splits} ===")
        
        # Split data
        X_train, X_val = texts.iloc[train_idx], texts.iloc[val_idx]
        y_train, y_val = labels.iloc[train_idx], labels.iloc[val_idx]
        
        # Create datasets
        train_dataset = EnhancedIndoBERTDataset(X_train, y_train, tokenizer, augment=True)
        val_dataset = EnhancedIndoBERTDataset(X_val, y_val, tokenizer, augment=False)
        
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
        
        # Initialize model
        num_classes = len(df['label'].unique())
        model = BertForSequenceClassification.from_pretrained(
            model_name, 
            num_labels=num_classes
        ).to(device)
        
        # Optimizer with weight decay
        optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
        
        # Scheduler
        total_steps = len(train_loader) * 10
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps
        )
        
        # Train
        train_losses, val_losses, val_accuracies = train_bert_model_with_validation(
            model, train_loader, val_loader, optimizer, scheduler, num_epochs=10
        )
        
        # Load best model for evaluation
        model.load_state_dict(torch.load('best_indobert_model.pth'))
        model.eval()
        
        # Final evaluation
        all_preds = []
        all_true = []
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                
                all_preds.extend(preds.cpu().numpy())
                all_true.extend(labels.cpu().numpy())
        
        accuracy = accuracy_score(all_true, all_preds)
        f1 = f1_score(all_true, all_preds, average='weighted')
        
        all_accuracies.append(accuracy)
        all_f1_scores.append(f1)
        
        print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")
        print(classification_report(all_true, all_preds, target_names=label_encoder.classes_))
    
    print(f"\n=== Cross-Validation Results ===")
    print(f"Average Accuracy: {np.mean(all_accuracies):.4f} (+/- {np.std(all_accuracies):.4f})")
    print(f"Average F1-Score: {np.mean(all_f1_scores):.4f} (+/- {np.std(all_f1_scores):.4f})")
    
    return np.mean(all_accuracies), np.mean(all_f1_scores)

# 7. Train final model on all data
def train_final_model(df):
    model_name = "indobenchmark/indobert-base-p1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Split final train/val
    X_train, X_val, y_train, y_val = train_test_split(
        df['text'], df['label'], test_size=0.1, stratify=df['label'], random_state=42
    )
    
    train_dataset = EnhancedIndoBERTDataset(X_train, y_train, tokenizer, augment=True)
    val_dataset = EnhancedIndoBERTDataset(X_val, y_val, tokenizer, augment=False)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
    
    num_classes = len(df['label'].unique())
    model = BertForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=num_classes
    ).to(device)
    
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    total_steps = len(train_loader) * 15
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )
    
    print("=== Training Final Model ===")
    train_losses, val_losses, val_accuracies = train_bert_model_with_validation(
        model, train_loader, val_loader, optimizer, scheduler, num_epochs=15, patience=5
    )
    
    # Load best model
    model.load_state_dict(torch.load('best_indobert_model.pth'))
    
    return model, tokenizer

# 8. Main execution
print("Starting enhanced training...")

# Cross-validation first
avg_accuracy, avg_f1 = cross_validate_bert(balanced_df, n_splits=3)

if avg_accuracy > 0.8:  # If CV results are good, train final model
    print("\nCV results are good, training final model...")
    final_model, tokenizer = train_final_model(balanced_df)
    
    # Save final model
    torch.save(final_model.state_dict(), 'final_indobert_model.pth')
    tokenizer.save_pretrained('./tokenizer')
    
    print("Final model trained and saved successfully!")
else:
    print("\nCV results are poor. Consider:")
    print("1. Adding more data")
    print("2. Trying different model architecture")
    print("3. Checking data quality")
    print("4. Adjusting hyperparameters")

Using device: cpu
Dataset balanced: label
1    2000
3    2000
7    2000
2    2000
4    2000
0    2000
5    2000
6    2000
Name: count, dtype: int64
Starting enhanced training...

=== Fold 1/3 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
