In [None]:
from google.colab import drive
drive.mount('/content/drive')

folder_path = '/content/drive/MyDrive/archive'

data_path = folder_path + '/data.csv'
kamusalay_path = folder_path + '/new_kamusalay.csv'
abusive_path = folder_path + '/abusive.csv'


Mounted at /content/drive


# **Load Dataset & Kamus**

In [None]:
import pandas as pd
import numpy as np

# Load dataset utama
df = pd.read_csv(data_path,  encoding='ISO-8859-1')  # Kolom: Tweet, HS, Abusive
df_kamus_alay = pd.read_csv(kamusalay_path, encoding='ISO-8859-1')

# Load kamus abusive dan alay
kamus_abusive = pd.read_csv(abusive_path)['ABUSIVE'].tolist()  # Ganti 'ABUSIVE' jika nama kolom berbeda
kamus_alay = dict(zip(df_kamus_alay['kata_alay'], df_kamus_alay['kata_baku']))

# Print contoh data
print("Contoh data:\n", df.head())
print("\nKamus abusive (5 kata pertama):", kamus_abusive[:5])
print("\nKamus alay (5 pasangan pertama):", list(kamus_alay.items())[:5])


Contoh data:
                                                Tweet  HS  Abusive  \
0  - disaat semua cowok berusaha melacak perhatia...   1        1   
1  RT USER: USER siapa yang telat ngasih tau elu?...   0        1   
2  41. Kadang aku berfikir, kenapa aku tetap perc...   0        0   
3  USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...   0        0   
4  USER USER Kaum cebong kapir udah keliatan dong...   1        1   

   HS_Individual  HS_Group  HS_Religion  HS_Race  HS_Physical  HS_Gender  \
0              1         0            0        0            0          0   
1              0         0            0        0            0          0   
2              0         0            0        0            0          0   
3              0         0            0        0            0          0   
4              0         1            1        0            0          0   

   HS_Other  HS_Weak  HS_Moderate  HS_Strong  
0         1        1            0          0  
1         0        0

# **Preprocessing Teks**

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

def preprocess_text(text):
    # Lowercase & hapus karakter khusus
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenisasi & hapus stopwords
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('indonesian'))  # Ganti 'indonesian' jika perlu
    tokens = [word for word in tokens if word not in stop_words]

    # Ganti kata alay dengan bentuk formal (pakai kamus)
    for i, word in enumerate(tokens):
        if word in kamus_alay:
            tokens[i] = kamus_alay[word]  # Asumsi kamus_alay = {'kata_alay': 'kata_baku'}

    return ' '.join(tokens)

df['cleaned_text'] = df['Tweet'].apply(preprocess_text)
print("\nContoh hasil preprocessing:\n", df[['Tweet', 'cleaned_text']].head())

# Simpan teks asli
texts = df['cleaned_text'].values

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Contoh hasil preprocessing:
                                                Tweet  \
0  - disaat semua cowok berusaha melacak perhatia...   
1  RT USER: USER siapa yang telat ngasih tau elu?...   
2  41. Kadang aku berfikir, kenapa aku tetap perc...   
3  USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...   
4  USER USER Kaum cebong kapir udah keliatan dong...   

                                        cleaned_text  
0  di saat cowok berusaha melacak perhatian gue k...  
1  rt pengguna pengguna telat memberi tau eluedan...  
2  41 kadang berpikir percaya tuhan jatuh berkali...  
3  pengguna pengguna akunnku tau matamu sipit dil...  
4  pengguna pengguna kaum cebong kafir sudah keli...  


# **Ekstraksi Fitur (TF-IDF + Kamus)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['cleaned_text']).toarray()

# Fitur tambahan dari kamus abusive
def count_abusive_words(text):
    return sum(1 for word in text.split() if word in kamus_abusive)

df['abusive_count'] = df['cleaned_text'].apply(count_abusive_words)
X_abusive = df['abusive_count'].values.reshape(-1, 1)

# Gabungkan fitur TF-IDF dengan fitur abusive_count
X = np.concatenate([X_tfidf, X_abusive], axis=1)

# Ambil label asli (HS, Abusive)
y_base = df[['HS', 'Abusive']].values.astype(int)

# Konversi ke single-label 4 kelas:
# 0: Normal (HS=0, Abusive=0)
# 1: HS saja (HS=1, Abusive=0)
# 2: Abusive saja (HS=0, Abusive=1)
# 3: HS + Abusive (HS=1, Abusive=1)
y_4class = y_base[:, 0] * 2 + y_base[:, 1]

# Split data sebelum SMOTE
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y_4class, np.arange(len(df)), test_size=0.2, stratify=y_4class, random_state=42
)
X_train_text = texts[idx_train]
X_test_text = texts[idx_test]

# Distribusi kelas data training dan testing
from collections import Counter
train_dist = Counter(y_train)
test_dist = Counter(y_test)

label_map = {
    0: "Normal (HS=0, Abusive=0)",
    1: "Abusive saja (HS=0, Abusive=1)",
    2: "HS saja (HS=1, Abusive=0)",
    3: "HS + Abusive (HS=1, Abusive=1)"
}

print("\nDistribusi Data Training Sebelum SMOTE:")
for label_id, label_name in label_map.items():
    print(f"{label_name}: {train_dist.get(label_id, 0)}")

print("\nDistribusi Data Testing:")
for label_id, label_name in label_map.items():
    print(f"{label_name}: {test_dist.get(label_id, 0)}")


Distribusi Data Training Sebelum SMOTE:
Normal (HS=0, Abusive=0): 4688
Abusive saja (HS=0, Abusive=1): 1398
HS saja (HS=1, Abusive=0): 1813
HS + Abusive (HS=1, Abusive=1): 2636

Distribusi Data Testing:
Normal (HS=0, Abusive=0): 1172
Abusive saja (HS=0, Abusive=1): 350
HS saja (HS=1, Abusive=0): 453
HS + Abusive (HS=1, Abusive=1): 659


# **SMOTE Oversampling**

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Hitung target_size berdasarkan kelas mayoritas
target_size = max(Counter(y_train).values())

# Terapkan SMOTE untuk 4 kelas
smote = SMOTE(
    sampling_strategy={
        0: target_size,  # Normal
        1: target_size,  # Abusive saja
        2: target_size,  # HS saja
        3: target_size   # HS+Abusive
    },
    random_state=42,
    k_neighbors=min(10, min(Counter(y_train).values()))  # Adjust untuk kelas minoritas
)

X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
_, y_train_res_idx = smote.fit_resample(np.arange(len(X_train)).reshape(-1, 1), y_train)
X_train_res_text = X_train_text[y_train_res_idx.flatten()]

# Mapping label ke deskripsi
label_map = {
    0: "Normal (HS=0, Abusive=0)",
    1: "Abusive saja (HS=0, Abusive=1)",
    2: "HS saja (HS=1, Abusive=0)",
    3: "HS + Abusive (HS=1, Abusive=1)"
}

# Verifikasi distribusi setelah SMOTE
print("\nDistribusi Label Setelah SMOTE:")
counter_res = Counter(y_train_res)

for label in sorted(counter_res):
    print(f"{label} - {label_map[label]}: {counter_res[label]} sampel")


Distribusi Label Setelah SMOTE:
0 - Normal (HS=0, Abusive=0): 4688 sampel
1 - Abusive saja (HS=0, Abusive=1): 4688 sampel
2 - HS saja (HS=1, Abusive=0): 4688 sampel
3 - HS + Abusive (HS=1, Abusive=1): 4688 sampel


# **Inisialisasi Tokenizer**

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Inisialisasi Tokenizer
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Persiapan Dataset Class**

In [None]:
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),       # Shape: [max_length]
            "attention_mask": encoding["attention_mask"].flatten(),  # Shape: [max_length]
            "labels": torch.tensor(label, dtype=torch.long),     # Shape: [1]
        }

# **Fungsi Evaluasi**

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Fungsi untuk menghitung metrik evaluasi
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'classification_report': classification_report(labels, preds, target_names=list(label_map.values()), output_dict=True)
    }

# **Persiapan Dataset**

In [None]:
# Prepare 100-sample datasets
X_train_res_text_100 = X_train_res_text[:100]
y_train_res_100 = y_train_res[:100]
X_train_text_100 = X_train_text[:100]
y_train_100 = y_train[:100]
X_test_text_100 = X_test_text[:100]
y_test_100 = y_test[:100]

train_dataset_smote_100 = TextDataset(X_train_res_text_100, y_train_res_100, tokenizer, 128)
train_dataset_original_100 = TextDataset(X_train_text_100, y_train_100, tokenizer, 128)
test_dataset_100 = TextDataset(X_test_text_100, y_test_100, tokenizer, 128)


# **Training Arguments**

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./bert_results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none"
)

# **Training IndoBERT**

In [None]:
from transformers import Trainer

# Initialize both models
model_smote = BertForSequenceClassification.from_pretrained(
    'indolem/indobert-base-uncased',
    num_labels=4,
    problem_type="single_label_classification"
)

model_original = BertForSequenceClassification.from_pretrained(
    'indolem/indobert-base-uncased',
    num_labels=4,
    problem_type="single_label_classification"
)

# Initialize Trainer for SMOTE model
trainer_smote = Trainer(
    model=model_smote,
    args=training_args,
    train_dataset=train_dataset_smote_100,
    eval_dataset=test_dataset_100,
    compute_metrics=compute_metrics
)

# Initialize Trainer for original model
trainer_original = Trainer(
    model=model_original,
    args=training_args,
    train_dataset=train_dataset_original_100,
    eval_dataset=test_dataset_100,
    compute_metrics=compute_metrics
)

# Train both models
print("Training SMOTE-augmented model...")
trainer_smote.train()

print("\nTraining original model...")
trainer_original.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training SMOTE-augmented model...


Epoch,Training Loss,Validation Loss,Accuracy,Classification Report
1,No log,1.541072,0.44,"{'Normal (HS=0, Abusive=0)': {'precision': 0.5068493150684932, 'recall': 0.7551020408163265, 'f1-score': 0.6065573770491803, 'support': 49.0}, 'Abusive saja (HS=0, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'HS saja (HS=1, Abusive=0)': {'precision': 0.16666666666666666, 'recall': 0.13333333333333333, 'f1-score': 0.14814814814814814, 'support': 15.0}, 'HS + Abusive (HS=1, Abusive=1)': {'precision': 0.5, 'recall': 0.20833333333333334, 'f1-score': 0.29411764705882354, 'support': 24.0}, 'accuracy': 0.44, 'macro avg': {'precision': 0.29337899543379, 'recall': 0.27419217687074826, 'f1-score': 0.262205793064038, 'support': 100.0}, 'weighted avg': {'precision': 0.3933561643835617, 'recall': 0.44, 'f1-score': 0.3900235722704382, 'support': 100.0}}"
2,No log,2.215096,0.4,"{'Normal (HS=0, Abusive=0)': {'precision': 0.5142857142857142, 'recall': 0.7346938775510204, 'f1-score': 0.6050420168067226, 'support': 49.0}, 'Abusive saja (HS=0, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'HS saja (HS=1, Abusive=0)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15.0}, 'HS + Abusive (HS=1, Abusive=1)': {'precision': 0.3076923076923077, 'recall': 0.16666666666666666, 'f1-score': 0.21621621621621623, 'support': 24.0}, 'accuracy': 0.4, 'macro avg': {'precision': 0.20549450549450549, 'recall': 0.22534013605442177, 'f1-score': 0.20531455825573472, 'support': 100.0}, 'weighted avg': {'precision': 0.32584615384615384, 'recall': 0.4, 'f1-score': 0.348362480127186, 'support': 100.0}}"
3,No log,2.43133,0.4,"{'Normal (HS=0, Abusive=0)': {'precision': 0.5, 'recall': 0.7346938775510204, 'f1-score': 0.5950413223140496, 'support': 49.0}, 'Abusive saja (HS=0, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'HS saja (HS=1, Abusive=0)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15.0}, 'HS + Abusive (HS=1, Abusive=1)': {'precision': 0.3076923076923077, 'recall': 0.16666666666666666, 'f1-score': 0.21621621621621623, 'support': 24.0}, 'accuracy': 0.4, 'macro avg': {'precision': 0.20192307692307693, 'recall': 0.22534013605442177, 'f1-score': 0.20281438463256646, 'support': 100.0}, 'weighted avg': {'precision': 0.3188461538461539, 'recall': 0.4, 'f1-score': 0.3434621398257762, 'support': 100.0}}"



Training original model...


Epoch,Training Loss,Validation Loss,Accuracy,Classification Report
1,No log,1.383666,0.24,"{'Normal (HS=0, Abusive=0)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 49.0}, 'Abusive saja (HS=0, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'HS saja (HS=1, Abusive=0)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15.0}, 'HS + Abusive (HS=1, Abusive=1)': {'precision': 0.24, 'recall': 1.0, 'f1-score': 0.3870967741935484, 'support': 24.0}, 'accuracy': 0.24, 'macro avg': {'precision': 0.06, 'recall': 0.25, 'f1-score': 0.0967741935483871, 'support': 100.0}, 'weighted avg': {'precision': 0.0576, 'recall': 0.24, 'f1-score': 0.0929032258064516, 'support': 100.0}}"
2,No log,1.238426,0.49,"{'Normal (HS=0, Abusive=0)': {'precision': 0.49, 'recall': 1.0, 'f1-score': 0.6577181208053692, 'support': 49.0}, 'Abusive saja (HS=0, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'HS saja (HS=1, Abusive=0)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15.0}, 'HS + Abusive (HS=1, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 24.0}, 'accuracy': 0.49, 'macro avg': {'precision': 0.1225, 'recall': 0.25, 'f1-score': 0.1644295302013423, 'support': 100.0}, 'weighted avg': {'precision': 0.24009999999999998, 'recall': 0.49, 'f1-score': 0.3222818791946309, 'support': 100.0}}"
3,No log,1.240399,0.49,"{'Normal (HS=0, Abusive=0)': {'precision': 0.49, 'recall': 1.0, 'f1-score': 0.6577181208053692, 'support': 49.0}, 'Abusive saja (HS=0, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'HS saja (HS=1, Abusive=0)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15.0}, 'HS + Abusive (HS=1, Abusive=1)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 24.0}, 'accuracy': 0.49, 'macro avg': {'precision': 0.1225, 'recall': 0.25, 'f1-score': 0.1644295302013423, 'support': 100.0}, 'weighted avg': {'precision': 0.24009999999999998, 'recall': 0.49, 'f1-score': 0.3222818791946309, 'support': 100.0}}"


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=39, training_loss=1.3130061809833233, metrics={'train_runtime': 646.8996, 'train_samples_per_second': 0.464, 'train_steps_per_second': 0.06, 'total_flos': 19733683507200.0, 'train_loss': 1.3130061809833233, 'epoch': 3.0})

##**Evaluasi Model**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import torch

def evaluate_model(trainer, dataset, name="Model"):
    # Ambil prediksi dari trainer
    preds_output = trainer.predict(dataset)
    preds = np.argmax(preds_output.predictions, axis=1)
    labels = preds_output.label_ids

    # Print classification report
    print(f"\n=== Evaluation for {name} ===")
    print(classification_report(labels, preds, digits=2))

    # Print confusion matrix
    cm = confusion_matrix(labels, preds)
    print(f"Confusion Matrix for {name}:\n{cm}")

# Evaluasi model SMOTE
evaluate_model(trainer_smote, test_dataset_100, name="SMOTE Model")

# Evaluasi model original
evaluate_model(trainer_original, test_dataset_100, name="Original Model")

Epoch,Training Loss,Validation Loss



=== Evaluation for SMOTE Model ===
              precision    recall  f1-score   support

           0       0.51      0.76      0.61        49
           1       0.00      0.00      0.00        12
           2       0.17      0.13      0.15        15
           3       0.50      0.21      0.29        24

    accuracy                           0.44       100
   macro avg       0.29      0.27      0.26       100
weighted avg       0.39      0.44      0.39       100

Confusion Matrix for SMOTE Model:
[[37  2  7  3]
 [11  0  1  0]
 [ 9  2  2  2]
 [16  1  2  5]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Evaluation for Original Model ===
              precision    recall  f1-score   support

           0       0.49      1.00      0.66        49
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00        15
           3       0.00      0.00      0.00        24

    accuracy                           0.49       100
   macro avg       0.12      0.25      0.16       100
weighted avg       0.24      0.49      0.32       100

Confusion Matrix for Original Model:
[[49  0  0  0]
 [12  0  0  0]
 [15  0  0  0]
 [24  0  0  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
