**LIBRARY YANG DIGUNAKAN**

In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
from collections import Counter

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


# imblearn optional
from imblearn.over_sampling import RandomOverSampler

# gensim for Word2Vec
from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

# joblib
import joblib

# nltk for augmentation
import nltk
from nltk.corpus import wordnet

# Single seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42
# Seed the NumPy, Python and TensorFlow generators with that same value.
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_STATE)


**LOAD DATASET & PEMBERIAN 3 LABEL TERHADAP RATING**

In [2]:
# Read the raw review export and report its size before any filtering.
df = pd.read_csv('tokopedia_reviews.csv')
print("raw shape:", df.shape)

# Keep only rows with a review text and a usable rating: ratings that cannot be
# parsed as numbers become NaN and are dropped before the integer cast.
df = (
    df.dropna(subset=['review', 'rating'])
      .assign(rating=lambda frame: pd.to_numeric(frame['rating'], errors='coerce'))
      .dropna(subset=['rating'])
      .astype({'rating': int})
)

# Map to 3 classes
def map_three(r):
    """Map a numeric rating to one of three sentiment labels.

    Args:
        r: Rating value (compared numerically).

    Returns:
        'neutral' when r equals 3, 'positive' when r is 4 or more,
        and 'negative' for everything else.
    """
    if r == 3:
        return 'neutral'
    return 'positive' if r >= 4 else 'negative'

# Derive the 3-class sentiment label from the integer rating.
df['sentiment'] = df['rating'].map(map_three)

# Only the text and its label are used from here on; renumber rows from 0.
df = df.loc[:, ['review', 'sentiment']].reset_index(drop=True)
print("after mapping shape:", df.shape)
print(df['sentiment'].value_counts())

raw shape: (3235, 4)
after mapping shape: (3235, 2)
sentiment
positive    3170
negative      37
neutral       28
Name: count, dtype: int64


**TEXT PREPROCESSING**

In [3]:
import string
def preprocess_text(s):
    """Normalise one review into lowercase alphanumeric tokens separated by single spaces.

    The input is converted with ``str()`` first, so non-string values are accepted.

    Args:
        s: Raw review text (any object convertible to ``str``).

    Returns:
        The cleaned string: lowercased, URLs removed, every character other
        than a-z, 0-9 and whitespace replaced by a space, whitespace runs
        collapsed, and leading/trailing whitespace stripped.
    """
    # Applied in order: URLs, then non-alphanumerics, then whitespace runs.
    substitutions = (
        (r'http\S+|www\S+', ' '),
        (r'[^a-z0-9\s]', ' '),
        (r'\s+', ' '),
    )
    cleaned = str(s).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store the normalised text next to the raw review; later cells read 'clean_review'.
df['clean_review'] = df['review'].map(preprocess_text)

**Augmentation & ensure >=10k samples**

In [4]:
def synonym_replace(sentence, n_replace=1, lang='eng'):
    """Return a copy of ``sentence`` with up to ``n_replace`` words swapped for WordNet synonyms.

    Sentences with fewer than three whitespace-separated words are returned
    unchanged. Positions are drawn independently with ``random.randrange``, so
    the same position may be drawn more than once. A drawn word with no usable
    synonym is left as is, so the result can equal the input.

    Args:
        sentence: Text to augment; it is split on whitespace.
        n_replace: Number of replacement attempts.
        lang: WordNet language code passed to ``wordnet.synsets`` and
            ``Synset.lemmas``. The default ``'eng'`` keeps the original
            English-only lookup. The sample reviews in this notebook are
            Indonesian, for which ``'ind'`` is the matching code
            (presumably needs the Open Multilingual Wordnet data; verify
            for the installed NLTK version).

    Returns:
        The augmented sentence, with words joined by single spaces.
    """
    words = sentence.split()
    if len(words) < 3:
        return sentence
    new_words = words.copy()
    for _ in range(n_replace):
        idx = random.randrange(len(words))
        w = words[idx]
        # Single-word alphabetic lemmas that differ from the original word.
        # Multi-word lemmas contain a space after the '_' replacement and so
        # fail isalpha().
        syns = [
            cand
            for syn in wordnet.synsets(w, lang=lang)
            for cand in (lemma.name().replace('_', ' ') for lemma in syn.lemmas(lang=lang))
            if cand != w and cand.isalpha()
        ]
        if syns:
            new_words[idx] = random.choice(syns)
    return ' '.join(new_words)

def augment_dataset(df, target_n=10000):
    """Grow ``df`` towards ``target_n`` rows by appending synonym-replaced copies.

    Rows are visited in order. Whenever ``synonym_replace`` changes a row's
    'clean_review', a new row carrying that text (with the same 'review' and
    'sentiment') is appended to the end of the list being walked, so appended
    rows can themselves be visited and augmented again. The walk stops once
    ``target_n`` rows exist or every row has been visited.

    Args:
        df: Frame with at least 'sentiment' and 'clean_review' columns
            ('review' is copied when present, otherwise '').
        target_n: Desired number of rows.

    Returns:
        A new DataFrame built from the original records plus the appended ones.
    """
    records = df.to_dict('records')
    cursor = 0
    while len(records) < target_n and cursor < len(records):
        source = records[cursor]
        original_text = source['clean_review']
        candidate = synonym_replace(original_text, n_replace=1)
        # Skip rows where the replacement made no difference.
        if candidate != original_text:
            records.append({'review': source.get('review', ''), 'sentiment': source['sentiment'], 'clean_review': candidate})
        cursor += 1
    return pd.DataFrame(records)

# Example usage: grow the dataset to 10,000 rows when it is smaller than that.
# NOTE(review): this runs before the train/test splits in the experiment cells,
# so an augmented copy and its source review can land on opposite sides of a split.
if len(df) < 10000:
    # synonym_replace() reads the WordNet corpus, which is a separate NLTK download.
    # Fetch it only when missing so the cell also runs on a fresh environment.
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    print("Dataset < 10000, augmenting (simple) to 10000 (may be noisy). Current:", len(df))
    df = augment_dataset(df, target_n=10000)
    print("After augment:", len(df))


Dataset < 10000, augmenting (simple) to 10000 (may be noisy). Current: 3235
After augment: 10000


**Ensure class balance**

In [5]:
print("Class distribution before:", Counter(df['sentiment']))
# Randomly oversample so every sentiment class has as many rows as the largest one
# (see the printed "after" distribution).
# NOTE(review): resampling is applied to the whole dataset, before the train/test
# splits in the experiment cells. Repeated copies of one minority review can then
# fall into both train and test, which inflates the reported test metrics.
# Consider splitting first and resampling only the training portion.
ros = RandomOverSampler(random_state=RANDOM_STATE)
X_ros = df[['clean_review']].values  # 2-D array with one text column; flattened again below
y_ros = df['sentiment'].values
X_res, y_res = ros.fit_resample(X_ros, y_ros)
df_bal = pd.DataFrame({'clean_review': X_res.flatten(), 'sentiment': y_res})
print("Class distribution after:", Counter(df_bal['sentiment']))
df = df_bal  # later cells use the balanced frame; it holds only 'clean_review' and 'sentiment'

Class distribution before: Counter({'positive': 9462, 'negative': 493, 'neutral': 45})
Class distribution after: Counter({'positive': 9462, 'neutral': 9462, 'negative': 9462})


**Encode labels**

In [6]:
# Turn the string sentiment into integer class ids. `le` is kept as a global because
# the report cells and the inference helpers use it to map ids back to class names.
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])  # e.g. negative->0, neutral->1, positive->2
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


**Experiment A — LSTM with Word2Vec embeddings (80/20)**

In [None]:
# Prepare data
texts = df['clean_review'].tolist()
labels = df['label'].values

# Split 80/20, stratified so both parts keep the class proportions.
# NOTE(review): df was augmented and oversampled before this point, so the test part
# can contain copies or near-copies of training reviews.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=RANDOM_STATE, stratify=labels)

# Train Word2Vec on the training texts only (whitespace-tokenised).
sentences = [t.split() for t in X_train_texts]
w2v_size = 200
w2v_window = 5
w2v_min_count = 1  # keep every word, including those seen once
# NOTE(review): gensim's docs say `seed` alone does not give reproducible vectors
# when workers > 1 — confirm whether exact reproducibility is required here.
w2v_model = Word2Vec(sentences=sentences, vector_size=w2v_size, window=w2v_window, min_count=w2v_min_count, workers=4, seed=RANDOM_STATE)
# Save the gensim model (optional artefact; not reloaded in this notebook)
os.makedirs('models', exist_ok=True)
w2v_model.save('models/w2v.model')

# Tokenizer + sequences (the tokenizer is also fitted on the training texts only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_texts)
# +1 leaves room for index 0 — presumably the padding id, since Keras word indices start at 1.
vocab_size = len(tokenizer.word_index) + 1
print("Vocab size:", vocab_size)
max_len = 100  # fixed sequence length; inference_lstm() uses the same default

X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
X_test_seq = tokenizer.texts_to_sequences(X_test_texts)
# Pad with zeros at the end of each sequence up to max_len.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Build embedding matrix from w2v: row i holds the vector of the word with tokenizer index i.
# Row 0 is never assigned in the loop and therefore stays all zeros.
embedding_matrix = np.zeros((vocab_size, w2v_size))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        # Fallback for words the tokenizer knows but Word2Vec does not.
        embedding_matrix[i] = np.random.normal(size=(w2v_size,))  # random init for OOV

# Build LSTM model
num_classes = len(le.classes_)
embedding_dim = w2v_size

model = Sequential()
# Declare the input shape explicitly instead of using Embedding's `input_length`
# argument, which Keras 3 deprecates. This also builds the model, so summary()
# can report shapes.
model.add(tf.keras.Input(shape=(max_len,), dtype='int32'))
# Load the Word2Vec matrix through a Constant initializer rather than `weights=[...]`,
# which not every Keras 3 release accepts. The embedding stays frozen.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                    trainable=False))  # set trainable True/False as experiment
# Only the final state of each direction is passed on (return_sequences=False).
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(num_classes, activation='softmax'))

# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Callbacks: stop after 4 epochs without a val_accuracy improvement (restoring the best
# weights) and keep the best model on disk.
os.makedirs('models/lstm', exist_ok=True)
checkpoint_path = 'models/lstm/best_lstm.h5'
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=4, mode='max', restore_best_weights=True),
    ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, mode='max')
]

# Train; 10% of the training arrays are held out by Keras as the validation set.
history = model.fit(X_train_pad, y_train, validation_split=0.1, epochs=20, batch_size=128, callbacks=callbacks)

# Evaluate. The "train" figure is computed on all of X_train_pad, which includes
# the rows Keras used for validation.
train_loss, train_acc = model.evaluate(X_train_pad, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"LSTM Train acc: {train_acc:.4f} | Test acc: {test_acc:.4f}")

# Save tokenizer and model; inference_lstm() loads both files by these default paths.
joblib.dump(tokenizer, "models/tokenizer.joblib")
model.save('models/lstm_final.h5')

# Predictions & report: take the most probable class per test row.
y_test_pred_prob = model.predict(X_test_pad)
y_test_pred = np.argmax(y_test_pred_prob, axis=1)
print("Classification report (LSTM):")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))
print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))


Vocab size: 5891




Epoch 1/20


**Experiment B — SVM + TF-IDF (80/20)**

In [None]:
# Prepare TF-IDF inputs. X and y are also reused by the Random Forest cell.
X = df['clean_review'].values
y = df['label'].values
# Stratified 80/20 split.
# NOTE(review): df was augmented and oversampled before this split, so test rows can
# duplicate training rows; treat the scores below as optimistic.
X_train, X_test, y_train_svm, y_test_svm = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Unigram + bigram TF-IDF, capped at 30000 features. The vocabulary is learned
# on the training part only.
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=30000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

svm = SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=RANDOM_STATE)
svm.fit(X_train_tfidf, y_train_svm)

# Evaluate on both parts to compare train and test accuracy.
y_train_pred_svm = svm.predict(X_train_tfidf)
y_test_pred_svm = svm.predict(X_test_tfidf)
print("SVM train acc:", accuracy_score(y_train_svm, y_train_pred_svm))
print("SVM test acc :", accuracy_score(y_test_svm, y_test_pred_svm))
print(classification_report(y_test_svm, y_test_pred_svm, target_names=le.classes_))

# Save model + vectorizer; inference_svm() loads both files by these default paths.
joblib.dump(svm, "models/svm_tfidf.joblib")
joblib.dump(tfidf, "models/tfidf_vectorizer.joblib")

SVM train acc: 0.9925136515765369
SVM test acc : 0.9906657273687919
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      1892
     neutral       1.00      0.97      0.99      1893
    positive       0.97      1.00      0.99      1893

    accuracy                           0.99      5678
   macro avg       0.99      0.99      0.99      5678
weighted avg       0.99      0.99      0.99      5678



['models/tfidf_vectorizer.joblib']

**Experiment C — Random Forest + TF-IDF (70/30)**

In [None]:
# 70/30 stratified split. X and y come from the SVM cell, so that cell must run first.
# NOTE(review): the same augmented and oversampled data is split here, so test rows
# can duplicate training rows; treat the scores below as optimistic.
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y)

# Separate vectorizer for this experiment (unigrams + bigrams, up to 35000 features),
# fitted on the training part only.
tfidf2 = TfidfVectorizer(ngram_range=(1,2), max_features=35000)
X_train_rf_tfidf = tfidf2.fit_transform(X_train_rf)
X_test_rf_tfidf = tfidf2.transform(X_test_rf)

# 500 trees with balanced class weights.
rf = RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train_rf_tfidf, y_train_rf)

# Evaluate on both parts to compare train and test accuracy.
y_train_pred_rf = rf.predict(X_train_rf_tfidf)
y_test_pred_rf = rf.predict(X_test_rf_tfidf)
print("RF train acc:", accuracy_score(y_train_rf, y_train_pred_rf))
print("RF test acc :", accuracy_score(y_test_rf, y_test_pred_rf))
print(classification_report(y_test_rf, y_test_pred_rf, target_names=le.classes_))

# Save the forest and its vectorizer.
joblib.dump(rf, "models/rf_tfidf.joblib")
joblib.dump(tfidf2, "models/tfidf_vectorizer_rf.joblib")


RF train acc: 0.9990437845998994
RF test acc : 0.9914279004227337
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      2838
     neutral       0.97      1.00      0.99      2839
    positive       1.00      0.97      0.99      2839

    accuracy                           0.99      8516
   macro avg       0.99      0.99      0.99      8516
weighted avg       0.99      0.99      0.99      8516



['models/tfidf_vectorizer_rf.joblib']

**Inference example (demonstrasi & bukti)**

In [None]:
# Example inference function for LSTM (load saved tokenizer and model)


def inference_lstm(texts, model_path='models/lstm_final.h5', tokenizer_path='models/tokenizer.joblib', max_len=100):
    """Classify raw review texts with the saved LSTM model.

    The tokenizer and model are loaded from disk on every call. Class ids are
    turned back into names with the notebook-level ``le`` encoder, so the
    label-encoding cell must have run.

    Args:
        texts: Iterable of raw review strings; each is cleaned with ``preprocess_text``.
        model_path: Path of the saved Keras model.
        tokenizer_path: Path of the joblib-dumped tokenizer (joblib files are
            pickles, so only load files you trust).
        max_len: Padded sequence length passed to ``pad_sequences``.

    Returns:
        ``(labels, probs)``: the predicted class names and the model's
        per-class probabilities for each text.
    """
    fitted_tokenizer = joblib.load(tokenizer_path)
    classifier = load_model(model_path)
    cleaned = list(map(preprocess_text, texts))
    padded = pad_sequences(fitted_tokenizer.texts_to_sequences(cleaned), maxlen=max_len, padding='post')
    probs = classifier.predict(padded)
    # Most probable class per text, mapped back to its string label.
    return le.inverse_transform(np.argmax(probs, axis=1)), probs

# Example usage. Roughly: a satisfied review, a complaint about a damaged product
# with no warranty, and a mixed review (slow delivery, decent product).
sample_texts = [
    "Barang sesuai deskripsi, cepat sampai dan berkualitas",
    "Produk rusak dan tidak ada garansi",
    "Pengiriman agak lama, produknya lumayan"
]
# NOTE(review): this rebinds the global `labels`, which Experiment A uses for the label array.
# NOTE(review): the recorded output labels all three samples 'positive', including
# the complaint — worth investigating before trusting the reported accuracies.
labels, probs = inference_lstm(sample_texts)
for t, l, p in zip(sample_texts, labels, probs):
    print("TEXT:", t)
    print("PRED:", l, "PROBS:", p)
    print("---")

# Example inference for SVM
def inference_svm(texts, model_path="models/svm_tfidf.joblib", vec_path="models/tfidf_vectorizer.joblib"):
    """Classify raw review texts with the saved SVM + TF-IDF pipeline.

    The classifier and vectorizer are loaded from disk on every call. Class ids
    are turned back into names with the notebook-level ``le`` encoder.

    Args:
        texts: Iterable of raw review strings; each is cleaned with ``preprocess_text``.
        model_path: Path of the joblib-dumped classifier.
        vec_path: Path of the joblib-dumped TF-IDF vectorizer (joblib files are
            pickles, so only load files you trust).

    Returns:
        Array of predicted class names, one per input text.
    """
    classifier = joblib.load(model_path)
    vectorizer = joblib.load(vec_path)
    cleaned = list(map(preprocess_text, texts))
    class_ids = classifier.predict(vectorizer.transform(cleaned))
    return le.inverse_transform(class_ids)

# Run the same sample reviews through the saved SVM pipeline for comparison with the LSTM.
print(inference_svm(sample_texts))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step
TEXT: Barang sesuai deskripsi, cepat sampai dan berkualitas
PRED: positive PROBS: [4.2448445e-08 3.5914331e-11 1.0000000e+00]
---
TEXT: Produk rusak dan tidak ada garansi
PRED: positive PROBS: [1.4217164e-03 4.1488394e-05 9.9853683e-01]
---
TEXT: Pengiriman agak lama, produknya lumayan
PRED: positive PROBS: [1.0224595e-04 7.9046617e-07 9.9989700e-01]
---
['positive' 'positive' 'positive']
