# Task 2: Slang Interpretation

## 1. Import Libraries

In [1]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=3b45c6e59ed74fc50242942019f098306526e5fe9cc6fcf5671d72997d4dffb5
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.6 rouge_score-0.1.2


In [2]:
import pandas as pd
import random
import evaluate
import torch
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model, TaskType

## 2. Load Source Dataset

In [3]:
url = "https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv"
df = pd.read_csv(url)
df_clean = df.dropna(subset=['formal', 'context', 'category1'])

In [4]:
template_prompt = [
    "Kata '{slang}' adalah bentuk tidak baku atau istilah gaul yang bermakna '{formal}'.",
    "Istilah gaul '{slang}' memiliki arti formal yaitu '{formal}'.",
    "'{slang}' merupakan kata tidak baku yang padanan formalnya adalah '{formal}'.",
    "Dalam bahasa baku, '{slang}' memiliki makna '{formal}'.",
    "'{slang}' adalah istilah informal yang merujuk pada arti '{formal}'.",
    "Arti formal dari kata gaul '{slang}' adalah '{formal}'.",
    "'{slang}' digunakan sebagai bahasa tidak baku yang berarti '{formal}'.",
    "Dalam penggunaan sehari-hari, '{slang}' bermakna '{formal}'."
]

def construct_pairs(row):
    slang = row['slang']
    formal = row['formal']

    input_text = f"definisi slang: {slang}"
    target_text = random.choice(template_prompt).format(slang=slang, formal=formal)

    return pd.Series([input_text, target_text], index=['input_text', 'target_text'])

dataset_df = df_clean.apply(construct_pairs, axis=1)
train_df, test_df = train_test_split(dataset_df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

print(f"Jumlah Data Training: {len(train_dataset)}")
print(f"Jumlah Data Test    : {len(test_dataset)}")
print("\n=== CONTOH DATA (Check Format) ===")
print(f"INPUT  : {train_dataset[0]['input_text']}")
print(f"TARGET : {train_dataset[0]['target_text']}")

Jumlah Data Training: 13505
Jumlah Data Test    : 1501

=== CONTOH DATA (Check Format) ===
INPUT  : definisi slang: biza
TARGET : 'biza' adalah istilah informal yang merujuk pada arti 'bisa'.


## 3. Construct Dataset

### 3.1. Function for Pragmatics Analysis

In [5]:
def get_morphological_nuance(category, slang_word):
    cat = str(category).lower()

    if 'elongasi' in cat:
        return "Kata ini ditulis dengan pemanjangan huruf untuk mengekspresikan penekanan, antusiasme, atau nada yang lebih santai."
    elif 'abreviasi' in cat or 'akronim' in cat:
        return "Merupakan bentuk singkatan untuk efisiensi percakapan teks."
    elif 'zeroisasi' in cat:
        return "Merupakan bentuk pendek dengan menghilangkan huruf tertentu agar lebih cepat diketik."
    elif 'modifikasi vokal' in cat:
        return "Menggunakan perubahan vokal untuk memberikan kesan lebih akrab atau 'imut'."
    return ""

def get_contextual_nuance(context_sentence, slang_word):
    ctx = str(context_sentence).lower()
    slang = str(slang_word).lower()
    tokens = set(ctx.split())
    nuances = []

    laugh_terms = ['wkwk', 'haha', 'hihi', 'lol', 'ngakak', 'xi', 'awok', 'kocak', 'gokil', 'lucu']
    if any(x in ctx for x in laugh_terms):
        nuances.append("digunakan dalam konteks bercanda atau humor")

    intensifiers = ['banget', 'bgt', 'parah', 'sumpah', 'abis', 'bet', 'beud', 'bingit']
    if any(x in ctx for x in intensifiers) or '!!' in ctx or (ctx.count('!') > 1):
        nuances.append("digunakan untuk mengekspresikan emosi atau penekanan yang kuat")

    if '?' in ctx or 'gimana' in ctx or 'kok' in ctx or 'apa' in ctx:
        nuances.append("sering muncul dalam kalimat tanya atau ungkapan kebingungan")

    vocatives = {'kak', 'ka', 'kk', 'sis', 'gan', 'bro', 'bang', 'pak', 'bu', 'cuy', 'ngab', 'bestie', 'min', 'mimin', 'om', 'tante'}
    if not tokens.isdisjoint(vocatives):
        nuances.append("biasanya disertai sapaan akrab")

    cute_terms = ['maacih', 'acuu', 'kamuuh', 'unch', 'gemes', 'emesh', 'syantik', 'ucul']
    if slang in cute_terms or any(x in ctx for x in cute_terms) or 'uu' in slang:
        nuances.append("memiliki nuansa manja, imut, atau ungkapan gemas")

    bad_words = ['anjing', 'njir', 'anjir', 'bangsat', 'bego', 'goblok', 'tolol', 'tai', 'kampungan', 'bangke']
    if slang in bad_words or any(x in ctx for x in bad_words):
        nuances.append("merupakan kata kasar yang digunakan untuk umpatan atau kekesalan")

    invites = ['kuy', 'yok', 'yuk', 'yuuk', 'cus', 'gas', 'skuy']
    if slang in invites or any(x in ctx for x in invites):
        nuances.append("digunakan sebagai kata ajakan")

    prayers = ['amin', 'aamiin', 'aminn', 'moga', 'smga', 'semoga']
    if slang in prayers or any(x in ctx for x in prayers):
        nuances.append("digunakan dalam konteks doa atau harapan")

    if nuances:
        return "Istilah ini " + ", dan ".join(nuances) + "."
    return ""

### 3.2. Construct for new rich dataset (add morphological & contextual nuance)

In [6]:
def construct_rich_dataset(row):
    slang = row['slang']
    formal = row['formal']
    context = row['context']
    category = row['category1']

    input_text = f"jelaskan makna slang: {slang} | konteks: {context}"
    core_templates = [
        f"'{slang}' adalah bentuk gaul dari kata '{formal}'.",
        f"Istilah '{slang}' memiliki arti '{formal}'.",
        f"Dalam bahasa baku, '{slang}' berarti '{formal}'."
    ]
    definition = random.choice(core_templates)

    morph_note = get_morphological_nuance(category, slang)
    if morph_note:
        definition += f" {morph_note}"

    context_note = get_contextual_nuance(context, slang)
    if context_note:
        definition += f" {context_note}"

    return pd.Series([input_text, definition], index=['input_text', 'target_text'])

rich_dataset = df_clean.apply(construct_rich_dataset, axis=1)

In [7]:
print("\n=== 5 Sampel Acak ===")
pd.set_option('display.max_colwidth', None)
print(rich_dataset.sample(5).to_string(index=False))


=== 5 Sampel Acak ===
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     input_text                                                                                                                                                                                                                                                                                                                                                        target_text
                                                                                                              

## 4. Split Dataset

In [8]:
df_final = rich_dataset[['input_text', 'target_text']].copy()

train_df, test_df = train_test_split(df_final, test_size=0.1, random_state=42)
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

print(f"Training Samples: {len(train_ds)} | Validation Samples: {len(test_ds)}")

Training Samples: 13505 | Validation Samples: 1501


## 5. Modelling & Training

### 5.1. Model and Tokenizer Configuration

In [9]:
MODEL_NAME = "google-t5/t5-base"
MAX_LEN = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 10,027,008 || all params: 232,930,560 || trainable%: 4.3047


### 5.2. Preprocessing

In [10]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=MAX_LEN, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=MAX_LEN, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

Map:   0%|          | 0/13505 [00:00<?, ? examples/s]



Map:   0%|          | 0/1501 [00:00<?, ? examples/s]

### 5.3. Training

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./indo-slang-interpret-v1",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_steps=50,
    predict_with_generate=True,
    fp16=False,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,0.082,0.070756
2,0.0969,0.057379
3,0.0518,0.044873
4,0.0436,0.036011
5,0.0326,0.033017


TrainOutput(global_step=8445, training_loss=0.07836769540469417, metrics={'train_runtime': 3529.5305, 'train_samples_per_second': 19.131, 'train_steps_per_second': 2.393, 'total_flos': 1.0717103975768064e+16, 'train_loss': 0.07836769540469417, 'epoch': 5.0})

### 5.4. Save Model

In [12]:
final_model_path = "./slang_interpret_model_v1"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Model berhasil disimpan di folder: {final_model_path}")

Model berhasil disimpan di folder: ./slang_interpret_model_v1


## 6. Inference

### 6.1. Load Model

In [13]:
model_path = "./slang_interpret_model_v1"
print("Loading Model...")

config = PeftConfig.from_pretrained(model_path)
base_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = PeftModel.from_pretrained(base_model, model_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

print("Model Siap Digunakan!")

Loading Model...
Model Siap Digunakan!


### 6.2. Inference & Test

In [14]:
def predict_slang(slang, context):
    input_text = f"jelaskan makna slang: {slang} | konteks: {context}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_length=128,
            num_beams=4,
            repetition_penalty=1.2,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\n=== UJI COBA MODEL ===")
test_cases = [
    ("mager", "duh ujan deres banget jadi mager parah mau kuliah"),
    ("baper", "dihh gtu aja lu udah baper"),
    ("bat", "wuih keren bat motor lu ngab"),
    ("ngab", "wuih keren bat motor lu ngab"),
    ("ngakak", "sumpah jokes lu receh banget bikin ngakak wkwk"),
    ("gercep", "info lomba dong kak, harus gercep nih"),
    ("utk", "ini krim bagus utk wajah"),
    ("njirr", "njirr lo mau nipu gue yak!"),
    ("gokil", "gila gokil banget performanya!")
]

for slang, ctx in test_cases:
    print(f"Slang   : {slang}")
    print(f"Konteks : {ctx}")
    print(f"Makna   : {predict_slang(slang, ctx)}")
    print("-" * 50)


=== UJI COBA MODEL ===
Slang   : mager
Konteks : duh ujan deres banget jadi mager parah mau kuliah
Makna   : 'mager' adalah bentuk gaul dari kata 'mager'. Menggunakan perubahan vokal untuk memberikan kesan lebih akrab atau 'imut'. Istilah ini digunakan untuk mengekspresikan emosi atau penekanan yang kuat.
--------------------------------------------------
Slang   : baper
Konteks : dihh gtu aja lu udah baper
Makna   : Istilah 'baper' memiliki arti 'bawa perasaan'. Merupakan bentuk singkatan untuk efisiensi percakapan teks.
--------------------------------------------------
Slang   : bat
Konteks : wuih keren bat motor lu ngab
Makna   : Dalam bahasa baku, 'bat' berarti 'banget'. Merupakan bentuk singkatan untuk efisiensi percakapan teks.
--------------------------------------------------
Slang   : ngab
Konteks : wuih keren bat motor lu ngab
Makna   : 'ngab' adalah bentuk gaul dari kata 'mengab'.
--------------------------------------------------
Slang   : ngakak
Konteks : sumpah jokes lu

## 7. Evaluation

### 7.1. Setup Evaluation

In [15]:
rouge = evaluate.load("rouge")
batch_size = 8

test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
model.eval()

Downloading builder script: 0.00B [00:00, ?B/s]

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=32, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=32, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

### 7.2. Generation Loop

In [16]:
predictions = []
references = []
inputs = []

for batch in tqdm(test_dataloader):
    input_texts = batch['input_text']
    target_texts = batch['target_text']

    model_inputs = tokenizer(
        input_texts,
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        generated_tokens = model.generate(
            input_ids=model_inputs["input_ids"],
            attention_mask=model_inputs["attention_mask"],
            max_length=128,
            num_beams=4,
            repetition_penalty=1.2,
            early_stopping=True
        )

    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = target_texts

    predictions.extend(decoded_preds)
    references.extend(decoded_labels)
    inputs.extend(input_texts)

100%|██████████| 188/188 [09:52<00:00,  3.15s/it]


### 7.3. Scoring (ROUGE) & Save Score

In [17]:
print("\nMenghitung Metrik ROUGE...")
results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-1: {results['rouge1']*100:.2f}")
print(f"ROUGE-2: {results['rouge2']*100:.2f}")
print(f"ROUGE-L: {results['rougeL']*100:.2f}")

df_results = pd.DataFrame({
    'Input': inputs,
    'Target (Gold)': references,
    'Prediction (Model)': predictions
})

filename = "evaluasi_slang_model.csv"
df_results.to_csv(filename, index=False)
print(f"\nHasil detail disimpan ke '{filename}'.")

print("\n=== SAMPEL HASIL EVALUASI ===")
pd.set_option('display.max_colwidth', None)
print(df_results.sample(5).to_string(index=False))


Menghitung Metrik ROUGE...
ROUGE-1: 29.29
ROUGE-2: 5.71
ROUGE-L: 23.55

Hasil detail disimpan ke 'evaluasi_slang_model.csv'.

=== SAMPEL HASIL EVALUASI ===
                 Input                                                                      Target (Gold)                                                                                      Prediction (Model)
definisi slang: liat'y Kata 'liat'y' adalah bentuk tidak baku atau istilah gaul yang bermakna 'lihatnya'.                                                          'liat'y' adalah bentuk gaul dari kata 'lihat'.
   definisi slang: rmh       Kata 'rmh' adalah bentuk tidak baku atau istilah gaul yang bermakna 'rumah'. 'rmh' adalah bentuk gaul dari kata 'rumah'. Merupakan bentuk singkatan untuk efisiensi percakapan teks.
   definisi slang: dri              'dri' merupakan kata tidak baku yang padanan formalnya adalah 'dari'.  'dri' adalah bentuk gaul dari kata 'dari'. Merupakan bentuk singkatan untuk efisiensi percakapan teks.
   

In [19]:
# import shutil
# import os

# output_filename = "slang_interpret_model_v1"
# directory_to_zip = "/content/slang_interpret_model_v1"

# # Create a zip archive
# shutil.make_archive(output_filename, 'zip', directory_to_zip)

# print(f"Folder '{directory_to_zip}' has been successfully zipped to '{output_filename}.zip'")