In [None]:
!pip install -q transformers tensorflow pandas scikit-learn

In [27]:
import pandas as pd
import tensorflow as tf
import re
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

# Setup keys
MODEL_NAME = "indobenchmark/indobert-base-p1"
MAX_LEN = 64  # Good length for tweets/chats
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

In [28]:
df = pd.read_csv('Twitter_Emotion_Dataset.csv')

# Load the slang dictionary
# We use sep=';' because your file uses semicolons
slang_df = pd.read_csv('kamus_singkatan.csv', sep=';', header=None, names=['slang', 'formal'])
slang_dict = dict(zip(slang_df['slang'], slang_df['formal']))

# --- 2. Define Custom Fixes & Cleaning Function ---
# Add your specific emotion overrides to the dictionary
custom_fixes = {
    "gila": "keren",    # Force 'gila' -> 'keren' (positive)
    "parah": "banget",  # Force 'parah' -> 'banget' (intensifier)
    "wkwk": "",         # Remove laughing noise
    "wkwkwk": "",
    "ga": "tidak",
    "gk": "tidak",
    "tdk": "tidak"
}
slang_dict.update(custom_fixes)

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove [USERNAME] and [URL]
    text = re.sub(r'\[username\]|\[url\]', '', text)
    # Remove special chars (keep letters and spaces)
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Slang Normalization
    words = text.split()
    mapped_words = [slang_dict.get(word, word) for word in words]
    return ' '.join(mapped_words)

# --- 3. Map Labels ---
label_map = {
    'love': 0, 'joy': 1, 'anger': 2, 'sadness': 3, 'fear': 4
}
df['label'] = df['label'].map(label_map)
df = df.dropna()

# --- 4. Add Manual Augmentation (Teaching the model "Parah" = Good) ---
slang_correction = pd.DataFrame({
    'tweet': [
        "Gila keren banget", 
        "Parah seru abis", 
        "Gokil parah nih game", 
        "Mantap jiwa",
        "Enak banget parah rasanya",
        "Gila sih ini game seru banget parah"
    ],
    'label': [1, 1, 1, 1, 1, 1] # 1 = Joy
})

# Add these new examples 5 times
for _ in range(5):
    df = pd.concat([df, slang_correction], ignore_index=True)

# --- 5. Apply Cleaning to EVERYTHING ---
# We do this last so both your original data AND your manual examples get cleaned
print("Cleaning data... (this might take a moment)")
df['text_clean'] = df['tweet'].apply(clean_text)

# --- 6. Split into Training (80%) and Validation (20%) ---
# Note: We are now using 'text_clean' instead of 'tweet'
X_train, X_val, y_train, y_val = train_test_split(
    df['text_clean'].values, 
    df['label'].values, 
    test_size=0.2, 
    random_state=42
)

print("-" * 30)
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Example Cleaned Text: {X_train[0]}")

Cleaning data... (this might take a moment)
------------------------------
Training samples: 2731
Validation samples: 683
Example Cleaned Text: siapa sih di dunia yang  tidak punya hater rasul yang  mulia saja  punya budha saja  punya nabi isa saja  punya nah apalagi eloh sama  gueh ya kaaan


## Pre-Process Data

In [29]:
# 1. Load the dataset
# Ensure the filename matches your downloaded file
df = pd.read_csv('Twitter_Emotion_Dataset.csv')


# 2. Map text labels to numbers
# We create a dictionary to swap words for numbers
label_map = {
    'love': 0,
    'joy': 1,
    'anger': 2,
    'sadness': 3,
    'fear': 4
}

# Apply the mapping
df['label'] = df['label'].map(label_map)

# 3. Drop any rows that failed to map (just in case)
df = df.dropna()

# Create specific examples to teach the model that "Parah/Gila" = GOOD
slang_correction = pd.DataFrame({
    'tweet': [
        "Gila keren banget", 
        "Parah seru abis", 
        "Gokil parah nih game", 
        "Mantap jiwa",
        "Enak banget parah rasanya",
        "Gila sih ini game seru banget parah" # Your exact sentence!
    ],
    'label': [1, 1, 1, 1, 1, 1] # 1 corresponds to 'joy' in your map
})

# Add these new examples to the dataset 5 times to make sure it remembers
for _ in range(5):
    df = pd.concat([df, slang_correction], ignore_index=True)

# 4. Split into Training (80%) and Validation (20%)
X_train, X_val, y_train, y_val = train_test_split(
    df['tweet'].values, 
    df['label'].values, 
    test_size=0.2, 
    random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

Training samples: 2731
Validation samples: 683


## Tokenization

In [30]:
# Load the tokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_data(texts, labels):
    # This function handles the tokenization
    encodings = tokenizer(
        texts.tolist(), 
        truncation=True, 
        padding=True, 
        max_length=MAX_LEN, 
        return_tensors="tf" # Return TensorFlow tensors
    )
    
    # Create a proper TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings), # Input features (input_ids, attention_mask)
        labels           # The target answers
    ))
    
    return dataset

# Convert our train and validation text into TF datasets
train_dataset = tokenize_data(X_train, y_train)
val_dataset = tokenize_data(X_val, y_val)

# Shuffle and batch the data for training
train_dataset = train_dataset.shuffle(1000).batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

In [None]:
!pip install tf-keras

## Donwload Model

In [20]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=5
)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  124441344 
                                                                 
 dropout_75 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 124445189 (474.72 MB)
Trainable params: 124445189 (474.72 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Training

In [None]:
# Define the "Referee" that stops the training if it gets worse
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',     # Watch the validation loss
    patience=1,             # If it doesn't improve for 1 epoch...
    restore_best_weights=True # ...STOP and go back to the best version
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,               
    callbacks=[early_stop]  
)

Epoch 1/5
Epoch 2/5


### Simple Test

In [40]:
import numpy as np

def predict_emotion(text):
    # 1. Tokenize the new text
    inputs = tokenizer(text, return_tensors="tf", truncation=True, max_length=MAX_LEN)
    
    # 2. Get prediction
    logits = model(inputs).logits
    
    # 3. Find the highest score
    predicted_class_id = np.argmax(logits, axis=1)[0]
    
    # 4. Map number back to word
    # Invert the map we made earlier
    id_to_label = {v: k for k, v in label_map.items()}
    return id_to_label[predicted_class_id]

# --- TEST AREA ---
my_chat = "bahlil mukanya kayak monyet"
print(f"Chat: {my_chat}")
print(f"Predicted Emotion: {predict_emotion(my_chat)}")

Chat: bahlil mukanya kayak monyet
Predicted Emotion: anger
