## Install library

In [1]:
!pip install pandas scipy gensim tensorflow scikit-learn nltk imblearn emoji PySastrawi



## impor library yang dibutuhkan

In [2]:
import pandas as pd
import re
import emoji
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from gensim.models import Word2Vec
from Sastrawi.Stemmer import StemmerFactory
from imblearn.over_sampling import SMOTE
import joblib
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Build the Sastrawi stemmer used by `bersihin_text`. The factory class is
# imported here from its own module because this is the name called below.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Slang -> formal lookup table, read from the `slang` and `formal` columns.
slang_df = pd.read_csv('/content/kamus_slang.csv')
# Rows with a missing cell are skipped: a NaN value in the dictionary would be
# a float and break the `' '.join(...)` performed during text cleaning.
slang_pairs = slang_df.dropna(subset=['slang', 'formal'])
slang_dict = dict(zip(
    # Lookups are done on lowercased, whitespace-split tokens, so keys are
    # normalised the same way; otherwise they could never match.
    slang_pairs['slang'].astype(str).str.strip().str.lower(),
    slang_pairs['formal'].astype(str).str.strip(),
))

## Import CSV


In [4]:
# Read the review CSV, then report its row count and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/content/hasil_ulasan_minecraft.csv')
print("Jumlah data:", df.shape[0])
print(df.head(5))

Jumlah data: 10000
                               reviewId         userName  \
0  36773bdc-c113-44ce-9eaa-78070dc31a5e  Pengguna Google   
1  dbddaf78-3ed0-4fa2-a957-1f58f4ed09c7  Pengguna Google   
2  7d3a251d-a7fe-4752-92a3-85cd18a37c48  Pengguna Google   
3  600e6ed6-c92d-4fc2-b91a-34569cc0fea1  Pengguna Google   
4  8bef8613-43c8-4ff1-80ee-b5ffe1df2a66  Pengguna Google   

                                           userImage  \
0  https://play-lh.googleusercontent.com/EGemoI2N...   
1  https://play-lh.googleusercontent.com/EGemoI2N...   
2  https://play-lh.googleusercontent.com/EGemoI2N...   
3  https://play-lh.googleusercontent.com/EGemoI2N...   
4  https://play-lh.googleusercontent.com/EGemoI2N...   

                                             content  score  thumbsUpCount  \
0                m malas menjawab tapi game kesukaan      5              0   
1  game nya bagus banget karna ada shader nya tan...      5              0   
2                           mojang kamu islam buk

## Labeling dan cleaning text

In [5]:
def label_sentiment(score):
    """Map a numeric review score to a sentiment label.

    Args:
        score: Numeric rating of the review.

    Returns:
        'negatif' when ``score <= 2``, 'netral' when ``score == 3``,
        and 'positif' for every other value.
    """
    if score <= 2:
        return 'negatif'
    return 'netral' if score == 3 else 'positif'

# Derive a sentiment label for every review from its score and show the
# resulting class counts.
df['sentiment'] = df['score'].map(label_sentiment)
print(
    "Distribusi awal:\n",
    df['sentiment'].value_counts().to_string(),
)

# NLTK's Indonesian stopword list, extended with a few extra words.
stop_words = set(stopwords.words('indonesian'))
stop_words.update(('dan', 'yang', 'di', 'ke', 'nya', 'ini', 'itu'))

def bersihin_text(text):
    """Normalise one raw review into a cleaned, stemmed string.

    Steps, in order: lowercase, strip emoji, turn punctuation into spaces,
    drop digits, map slang to formal words via ``slang_dict``, remove
    stopwords (except a small keep-list) and stem the result.

    Relies on the module-level ``slang_dict``, ``stop_words`` and ``stemmer``.

    Args:
        text: Raw review text. ``None`` and float NaN are treated as an
            empty review.

    Returns:
        The cleaned, stemmed text; an empty string for missing input.
    """
    # Missing values would otherwise be stringified ("none" / "nan") and
    # survive as a bogus token.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Words that are kept even if they appear in the stopword set.
    keep_words = {'oke', 'bagus', 'top'}

    text = str(text).lower()
    text = emoji.replace_emoji(text, replace='')
    # Punctuation becomes a space (not an empty string) so that words joined
    # only by punctuation, e.g. "bagus,tapi", are not glued together.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Normalise slang first, then filter stopwords on the normalised tokens;
    # filtering first would let slang spellings of stopwords slip through.
    # A formal replacement may contain several words, hence the re-split.
    normalized = ' '.join(slang_dict.get(word, word) for word in text.split())
    kept = [
        word for word in normalized.split()
        if word not in stop_words or word in keep_words
    ]
    return stemmer.stem(' '.join(kept))

# Clean every review once; all features below are derived from this column.
df['cleaned_content'] = df['content'].apply(bersihin_text)

# TF-IDF over unigrams and bigrams, converted to a dense array.
tfidf = TfidfVectorizer(max_features=10000, stop_words=list(stop_words), ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['cleaned_content']).toarray()

# One-hot targets. The dtype is requested explicitly because recent pandas
# versions return boolean dummy columns by default, while the networks are
# trained with a categorical cross-entropy loss on numeric targets.
y = pd.get_dummies(df['sentiment'], dtype='float32').values

# NOTE(review): SMOTE is applied to the whole data set here, while the
# train/test split only happens later inside the training functions. The test
# sets therefore contain synthetic samples and samples interpolated from
# training neighbours, so reported test accuracy is likely optimistic.
smote = SMOTE(random_state=42)
X_tfidf_smote, y_smote = smote.fit_resample(X_tfidf, np.argmax(y, axis=1))
y_smote = pd.get_dummies(y_smote, dtype='float32').values

# NOTE(review): the text of the balanced frame is rebuilt from the non-zero
# TF-IDF features of each (possibly synthetic) row, so it is a bag of
# vocabulary terms rather than the original sentence - confirm this is the
# intended input for the sequence model.
df_balanced = pd.DataFrame({
    'cleaned_content': [' '.join(doc) for doc in tfidf.inverse_transform(X_tfidf_smote)],
    'sentiment': np.argmax(y_smote, axis=1),
})
df_balanced['sentiment'] = df_balanced['sentiment'].map({0: 'negatif', 1: 'netral', 2: 'positif'})
print(f"Jumlah data setelah SMOTE: {len(df_balanced)}")
print("Distribusi setelah SMOTE:\n", df_balanced['sentiment'].value_counts().to_string())

def Evaluasi_Model(y_true, y_pred, set_name=""):
    """Print accuracy and a per-class report, and return the accuracy.

    Args:
        y_true: True class indices (0 = negatif, 1 = netral, 2 = positif).
        y_pred: Predicted class indices, same encoding as ``y_true``.
        set_name: Name of the evaluated split, used in the printed header.

    Returns:
        The accuracy as computed by ``accuracy_score``.
    """
    class_names = ['negatif', 'netral', 'positif']
    accuracy = accuracy_score(y_true, y_pred)
    print(f"\nAkurasi {set_name}: {accuracy * 100:.2f}%")
    # `labels` is passed explicitly so the report always has one row per
    # class. Without it, a class missing from both arrays makes the number of
    # detected labels differ from `target_names` and the call raises.
    print(classification_report(
        y_true,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    ))
    return accuracy

Distribusi awal:
 sentiment
positif    7494
negatif    1937
netral      569




Jumlah data setelah SMOTE: 22482
Distribusi setelah SMOTE:
 sentiment
positif    7494
negatif    7494
netral     7494


## Define model

In [12]:
def train_dense_tfidf(X, y, test_size=0.2, epochs=25, batch_size=64, name="Dense + TF-IDF"):
    """Train and evaluate a feed-forward classifier on dense feature vectors.

    The data is split with a fixed ``random_state`` of 42. A two-hidden-layer
    network with dropout and L2 regularisation is fitted with early stopping
    and learning-rate reduction on the validation loss. Both splits are then
    scored with ``Evaluasi_Model``.

    Args:
        X: 2-D feature array; ``X.shape[1]`` sizes the input layer.
        y: Targets aligned with ``X``. Assumed to be one-hot with 3 columns,
            since the model ends in a 3-unit softmax and labels are recovered
            with ``argmax`` - TODO confirm with the caller.
        test_size: Fraction of the data held out for testing.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.
        name: Label shown in the printed header.

    Returns:
        Tuple ``(model, train_acc, test_acc)``.
    """
    print(f"\n=== Skema: {name} (Test size={test_size}) ===")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    n_features = X.shape[1]
    layers = [
        Input(shape=(n_features,)),
        Dense(512, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(3, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Both callbacks watch the validation loss of the 10% validation split.
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5),
    ]
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=callbacks,
        verbose=1,
    )

    # Predict both splits first, then convert one-hot targets to class indices.
    pred_train, pred_test = (
        np.argmax(model.predict(split), axis=1) for split in (X_train, X_test)
    )
    true_train, true_test = (
        np.argmax(onehot, axis=1) for onehot in (y_train, y_test)
    )

    train_acc = Evaluasi_Model(true_train, pred_train, "Training")
    test_acc = Evaluasi_Model(true_test, pred_test, "Testing")
    return model, train_acc, test_acc


def train_lstm_word2vec(df, y, max_words=10000, embedding_dim=200, max_len=100, test_size=0.2, epochs=25, batch_size=64):
    """Train and evaluate a stacked LSTM on Word2Vec-initialised embeddings.

    Word2Vec vectors are trained on ``df['cleaned_content']``. The same texts
    are tokenised and padded to ``max_len``, and the embedding layer starts
    from the Word2Vec vectors (rows for unknown words stay zero). Both splits
    are scored with ``Evaluasi_Model``.

    Args:
        df: Frame with a ``cleaned_content`` column of whitespace-separated text.
        y: Targets aligned row-by-row with ``df``. Assumed to be one-hot with
            3 columns, since the model ends in a 3-unit softmax and labels are
            recovered with ``argmax`` - TODO confirm with the caller.
        max_words: Vocabulary size of the tokenizer and the embedding layer.
        embedding_dim: Size of the Word2Vec / embedding vectors.
        max_len: Length every sequence is padded or truncated to.
        test_size: Fraction of the data held out for testing.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, train_acc, test_acc, tokenizer)``. The tokenizer is
        returned so new text can be encoded the same way at inference.
    """
    print(f"\n=== Skema: LSTM + Word2Vec (Test size={test_size}) ===")

    sentences = [text.split() for text in df['cleaned_content']]
    w2v_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4, epochs=20)

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(df['cleaned_content'])
    X_seq = tokenizer.texts_to_sequences(df['cleaned_content'])
    X_pad = pad_sequences(X_seq, maxlen=max_len)

    # Row i holds the Word2Vec vector of the word with tokenizer index i.
    # Indices at or above max_words are skipped because they have no row.
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i < max_words and word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

    X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=test_size, random_state=42)

    model = Sequential([
        # The sequence length is declared with an Input layer and the
        # pretrained vectors are supplied through a Constant initializer.
        # This replaces the `input_length` / `weights` keyword arguments,
        # which current Keras releases deprecate.
        Input(shape=(max_len,)),
        Embedding(
            max_words,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=True,
        ),
        Bidirectional(LSTM(256, return_sequences=True, kernel_regularizer=l2(0.005))),
        LSTM(128),
        Dropout(0.5),
        Dense(128, activation='relu', kernel_regularizer=l2(0.005)),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Both callbacks watch the validation loss of the 10% validation split.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)

    model.fit(
        X_train, y_train,
        epochs=epochs, batch_size=batch_size,
        validation_split=0.1,
        callbacks=[early_stopping, lr_scheduler],
        verbose=1
    )

    # Convert probabilities and one-hot targets to class indices for scoring.
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)
    y_train_cat = np.argmax(y_train, axis=1)
    y_test_cat = np.argmax(y_test, axis=1)

    train_acc = Evaluasi_Model(y_train_cat, y_pred_train, "Training")
    test_acc = Evaluasi_Model(y_test_cat, y_pred_test, "Testing")

    return model, train_acc, test_acc, tokenizer

## Training model

In [13]:
# Scheme 1: dense network on TF-IDF features, 20% of the data held out.
model1, train_acc1, test_acc1 = train_dense_tfidf(
    X_tfidf_smote,
    y_smote,
    test_size=0.2,
    name="Dense + TF-IDF 80/20",
)

# Scheme 2: LSTM on Word2Vec embeddings (default test size of the function).
model2, train_acc2, test_acc2, tokenizer = train_lstm_word2vec(
    df_balanced,
    y_smote,
    max_len=100,
)

# Scheme 3: dense network on TF-IDF features, 30% of the data held out.
model3, train_acc3, test_acc3 = train_dense_tfidf(
    X_tfidf_smote,
    y_smote,
    test_size=0.3,
    name="Dense + TF-IDF 70/30",
)


=== Skema: Dense + TF-IDF 80/20 (Test size=0.2) ===
Epoch 1/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.5696 - loss: 1.3671 - val_accuracy: 0.8060 - val_loss: 0.7436 - learning_rate: 5.0000e-04
Epoch 2/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.8469 - loss: 0.6598 - val_accuracy: 0.8555 - val_loss: 0.6824 - learning_rate: 5.0000e-04
Epoch 3/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8910 - loss: 0.5706 - val_accuracy: 0.8416 - val_loss: 0.6593 - learning_rate: 5.0000e-04
Epoch 4/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9038 - loss: 0.5360 - val_accuracy: 0.8749 - val_loss: 0.6401 - learning_rate: 5.0000e-04
Epoch 5/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9162 - loss: 0.5135 - val_accuracy: 0.8788 - val_loss: 0.6243 - learning_rate: 5.0000e



[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 39ms/step - accuracy: 0.5691 - loss: 3.1051 - val_accuracy: 0.6504 - val_loss: 1.2379 - learning_rate: 5.0000e-04
Epoch 2/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.6646 - loss: 1.1431 - val_accuracy: 0.7115 - val_loss: 0.9077 - learning_rate: 5.0000e-04
Epoch 3/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.7295 - loss: 0.8652 - val_accuracy: 0.7543 - val_loss: 0.7763 - learning_rate: 5.0000e-04
Epoch 4/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 37ms/step - accuracy: 0.7943 - loss: 0.6996 - val_accuracy: 0.7966 - val_loss: 0.6699 - learning_rate: 5.0000e-04
Epoch 5/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8350 - loss: 0.5878 - val_accuracy: 0.8266 - val_loss: 0.6030 - learning_rate: 5.0000e-04
Epoch 6/25
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [17]:
def nerawang_sentiment(text, model, vectorizer, is_word2vec=False, tokenizer=None, max_len=100):
    """Predict the sentiment label of a single raw text.

    The text is cleaned with ``bersihin_text`` and then encoded either as a
    padded token sequence (``is_word2vec=True``) or as a dense vector from
    ``vectorizer`` before being passed to ``model.predict``.

    Args:
        text: Raw input text.
        model: Trained model exposing ``predict``.
        vectorizer: Fitted vectorizer; required when ``is_word2vec`` is False.
        is_word2vec: Selects the token-sequence branch when True.
        tokenizer: Fitted tokenizer; required when ``is_word2vec`` is True.
        max_len: Sequence length used for padding in the sequence branch.

    Returns:
        One of 'negatif', 'netral' or 'positif'.

    Raises:
        ValueError: If the preprocessing object needed by the selected branch
            was not provided.
    """
    # Fail early with a clear message instead of an attribute error on None.
    if is_word2vec and tokenizer is None:
        raise ValueError("tokenizer is required when is_word2vec=True")
    if not is_word2vec and vectorizer is None:
        raise ValueError("vectorizer is required when is_word2vec=False")

    cleaned_text = bersihin_text(text)
    if is_word2vec:
        seq = tokenizer.texts_to_sequences([cleaned_text])
        padded = pad_sequences(seq, maxlen=max_len)
        pred = model.predict(padded)
    else:
        tfidf_vec = vectorizer.transform([cleaned_text]).toarray()
        pred = model.predict(tfidf_vec)
    # Index of the highest-scoring class for the single input row.
    sentiment = np.argmax(pred, axis=1)[0]
    return ['negatif', 'netral', 'positif'][sentiment]

# Run one sample sentence through all three trained schemes.
sample_text = "minecraft game nya bisa membangun kreativitas"

print("\nContoh Inference:")
# Schemes 1 and 3 use the TF-IDF vectorizer; scheme 2 uses the tokenizer
# returned by the LSTM training function.
print(f"Skema 1 : {nerawang_sentiment(sample_text, model1, vectorizer=tfidf)}")
print(f"Skema 2 : {nerawang_sentiment(sample_text, model2, None, is_word2vec=True, tokenizer=tokenizer)}")
print(f"Skema 3 : {nerawang_sentiment(sample_text, model3, vectorizer=tfidf)}")


Contoh Inference:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Skema 1 : positif
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Skema 2 : positif
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 607ms/step
Skema 3 : positif
