In [None]:
import pandas as pd
import numpy as np

In [None]:
def _read_tsv(path):
    """Read one tab-separated, UTF-8 encoded file into a DataFrame."""
    return pd.read_csv(path, sep="\t", encoding="utf-8")


# One DataFrame per emotion file of the Kaggle dataset.
anger = _read_tsv("/kaggle/input/main-dataset0-1/AngerData.csv")
fear = _read_tsv("/kaggle/input/main-dataset0-1/FearData.csv")
joy = _read_tsv("/kaggle/input/main-dataset0-1/JoyData.csv")
love = _read_tsv("/kaggle/input/main-dataset0-1/LoveData.csv")
neutral = _read_tsv("/kaggle/input/main-dataset0-1/NeutralData.csv")
sad = _read_tsv("/kaggle/input/main-dataset0-1/SadData.csv")

In [None]:
# Print a caption followed by the pandas summary of each per-emotion frame.
for caption, frame in (
    ("Anger Dataset information", anger),
    ("Fear Dataset information", fear),
    ("Joy Dataset information", joy),
    ("Love Dataset information", love),
    ("Neutral Dataset information", neutral),
    ("Sad Dataset information", sad),
):
    print(caption)
    frame.info()

In [None]:
# Tag every row of each frame with the emotion name of the file it came from.
for frame, emotion in (
    (anger, "Anger"),
    (fear, "Fear"),
    (joy, "Joy"),
    (love, "Love"),
    (neutral, "Neutral"),
    (sad, "Sad"),
):
    frame["Label"] = emotion

In [None]:
# Stack the six labelled frames into one frame with a fresh 0..n-1 index.
emotion_frames = [anger, fear, joy, love, neutral, sad]
df = pd.concat(emotion_frames, ignore_index=True)

In [None]:
# Plain-text view of the combined frame.
combined_text = str(df)
print(combined_text)

In [None]:
# Number of rows per emotion label.
label_counts = df["Label"].value_counts()
print(label_counts)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# PREPROCESS

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# nltk.download('stopwords')
# nltk.download('punkt')

In [None]:
# Indonesian stop-word list from NLTK, held in a set for membership tests.
indonesian_stopwords = stopwords.words('indonesian')
stop_words = set(indonesian_stopwords)

# NOTE(review): Porter stemming is designed around English suffixes, while the
# stop words above are Indonesian — confirm this stemmer is the intended choice.
stemmer = PorterStemmer()

In [None]:
# Abbreviation ("singkatan") table: informal short form -> expanded form.
# All keys are lowercase and contain letters only.
# The "bgt" entry used to be listed twice; the duplicate has been removed.
singkatan_dict = {
    "gpp": "gak apa apa",
    "yg": "yang",
    "abg": "abang",
    "bg": "bang",
    "bgt": "banget",
    "thn": "tahun",
    "klo": "kalo",
    "kl": "kalo",
    "tdk": "tidak",
    "krn": "karena",
    "udh": "sudah",
    "dgn": "dengan",
    "trs": "terus",
    "blm": "belum",
    "sm": "sama",
    "aja": "saja"
}

In [None]:
def expand_singkatan(text):
    """Replace known abbreviations in ``text`` with their expanded forms.

    The text is split on whitespace and each word is looked up in
    ``singkatan_dict`` by its lowercase form, so "Yg" and "YG" are expanded
    just like "yg". Words without an entry are kept exactly as written.

    Args:
        text: Input string.

    Returns:
        The words joined by single spaces, with abbreviations expanded.
    """
    # The table's keys are all lowercase, so the lookup key is lowercased while
    # the original spelling is kept as the fallback for unknown words.
    return " ".join(singkatan_dict.get(word.lower(), word) for word in text.split())

In [None]:
# Expand abbreviations in every tweet and store the result back in "Tweet".
expanded_tweets = df["Tweet"].map(expand_singkatan)
df["Tweet"] = expanded_tweets

In [None]:
def preprocess_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, replace every non-letter character with a
    space, expand abbreviations via ``expand_singkatan``, tokenize with
    ``word_tokenize``, drop tokens found in ``stop_words``, and stem the
    remaining tokens with ``stemmer``.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Lowercase and strip punctuation BEFORE expanding abbreviations: the
    # abbreviation table only has lowercase, letters-only keys, so "Yg" or
    # "bgt!" would otherwise slip through unexpanded.
    text = text.lower()
    # Substitute a space (not an empty string) so that words joined only by
    # punctuation or digits, e.g. "anak-anak", are not fused into one token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = expand_singkatan(text)
    tokens = word_tokenize(text)
    # Drop stop words first, then stem what is left.
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [None]:
# Run the preprocessing pipeline over every tweet and keep the result in a new column.
cleaned_tweets = df["Tweet"].map(preprocess_text)
df["Clean_Tweet"] = cleaned_tweets

In [None]:
# Show the first rows of the original and cleaned text side by side.
preview_columns = ["Tweet", "Clean_Tweet"]
print(df[preview_columns].head())

# LSTM/GRU

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the emotion labels as integer class ids, overwriting the "Label" column.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["Label"])
df["Label"] = encoded_labels

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Clean_Tweet"],
    df["Label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the vocabulary from the training texts only; the tokenizer is capped at
# 10000 words and maps unknown words to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert both splits to integer id sequences with the fitted vocabulary.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

In [None]:
# Pad or truncate every sequence at its end so all of them have length max_len.
max_len = 128
padding_options = {"maxlen": max_len, "padding": "post", "truncating": "post"}
train_padded = pad_sequences(train_sequences, **padding_options)
test_padded = pad_sequences(test_sequences, **padding_options)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Bidirectional, Dropout

In [None]:
# Stacked bidirectional LSTM + GRU classifier, assembled layer by layer.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=256))  # 256-dimensional embeddings over 10000 token ids
model.add(Bidirectional(LSTM(128, return_sequences=True)))  # keeps per-timestep outputs for the GRU
model.add(Dropout(0.3))  # dropout for regularisation
model.add(Bidirectional(GRU(64)))  # returns only the final output
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))  # extra hidden layer
model.add(Dense(6, activation="softmax"))  # six-way softmax output

In [None]:
# AdamW optimizer with a learning rate of 0.001.
optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001)

# Compile with sparse categorical cross-entropy and accuracy, then print the layer summary.
compile_options = {
    "loss": "sparse_categorical_crossentropy",
    "optimizer": optimizer,
    "metrics": ["accuracy"],
}
model.compile(**compile_options)
model.summary()

In [None]:
# Train for 20 epochs with batches of 64.
# Validation uses the last 10% of the training arrays (validation_split)
# rather than the test set, so the held-out test data is never looked at
# during training and the final evaluation stays an unbiased estimate.
history = model.fit(
    train_padded,
    np.asarray(train_labels),  # plain NumPy array so Keras can slice off the validation part
    epochs=20,
    batch_size=64,
    validation_split=0.1,
)

In [None]:
# Save the trained model (currently disabled; uncomment the line below to write the file)
# model.save("mood_classification_lstm_gru.h5")

In [None]:
# Score the trained model on the held-out test split and report its accuracy.
evaluation = model.evaluate(test_padded, test_labels)
test_loss, test_acc = evaluation
print(f"Test Accuracy: {test_acc:.4f}")