In [None]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack, csr_matrix

In [None]:
# ------------- data (same samples as used previously) -------------
# Each training entry pairs a review with its label; the label map used for
# reporting later in the notebook is 0 -> 'Negatif', 1 -> 'Positif'.
labelled_reviews = [
    ('Kameranya bagus banget, hasilnya jernih!', 1),
    ('Baterai awet seharian, performa kencang.', 1),
    ('Suka sama desainnya, premium dan elegan.', 1),
    ('Pengiriman cepat, packing aman, produk original.', 1),
    ('Mantap, sesuai ekspektasi. Tidak mengecewakan.', 1),
    ('Baru dipakai sebentar sudah panas, kecewa.', 0),
    ('Layar sering tidak merespon, sangat mengganggu.', 0),
    ('Barangnya rusak pas sampai, pengemasannya buruk.', 0),
    ('Kualitas suara speakernya jelek sekali.', 0),
    ('Deskripsi tidak sesuai, banyak fitur yang hilang.', 0),
]
train_data = {
    'review': [review_text for review_text, _ in labelled_reviews],
    'sentiment': [label for _, label in labelled_reviews],
}
df = pd.DataFrame(train_data)

# Unlabelled reviews that the trained models are asked to classify.
unlabelled_reviews = [
    'Kameranya luar biasa, detail foto sangat tajam.',
    'Baterai cepat sekali habis, bikin kecewa.',
    'Desain ponselnya keren, ringan dan tipis.',
    'Sinyal sering hilang, sangat merepotkan.',
    'Layar AMOLED-nya jernih, nyaman dipakai nonton.',
    'Charger tidak berfungsi, produk cacat.',
    'Speaker suaranya mantap, bass terasa.',
    'Aplikasi sering force close, tidak stabil.',
    'Body kokoh, terasa premium di tangan.',
    'Harga mahal tapi kualitas buruk.',
]
eval_data = {'review': unlabelled_reviews}
eval_df = pd.DataFrame(eval_data)

In [None]:
# ------------- resources -------------
# Make sure the NLTK 'stopwords' corpus is available, downloading it if needed.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError.
    nltk.download('stopwords')

stop_words = set(stopwords.words('indonesian'))

# Do NOT drop negation words: remove them from the stopword set so they
# survive the stopword filtering done during tokenization.
negation_words = {'tidak', 'bukan', 'jangan', 'belum', 'kurang'}
stop_words = stop_words - negation_words

# Sastrawi stemmer instance shared by the tokenizer and the lexicon builder.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# ------------- preproc (keep negation) -------------
def clean_text_keep_neg(text):
    """Normalize a raw review into lowercase, space-separated words.

    Every character other than a-z, whitespace or '-' (digits, punctuation,
    symbols) is replaced by a space, then runs of whitespace are collapsed to
    a single space and the ends are trimmed. No words are removed here, so
    negation words are still present in the result.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string.
    """
    letters_only = re.sub(r'[^a-z\s\-]', ' ', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

def tokenize_and_stem(text):
    """Split cleaned text on whitespace, drop stopwords, and stem what is left.

    Negation words are not in ``stop_words`` (they were removed from that set),
    so they are kept and stemmed like any other token.

    Args:
        text: Cleaned review string.

    Returns:
        List of stems, in the order the kept tokens appear in ``text``.
    """
    return [stemmer.stem(token) for token in text.split() if token not in stop_words]

In [None]:
# Run the same cleaning + tokenization on the training and evaluation frames
for frame in (df, eval_df):
    frame['clean'] = frame['review'].apply(clean_text_keep_neg)
    frame['tokens'] = frame['clean'].apply(tokenize_and_stem)

# ------------- simple sentiment lexicon (can be extended) -------------
# Duplicate entries are harmless: the stemmed lexicons below are sets.
pos_lex = ['bagus','awet','jernih','kencang','premium','elegan','aman','mantap','tajam','nyaman','mantap','keren','mantap','baik','bagus']
# NOTE(review): tokens come from cleaned text in which '_' has already been
# replaced by a space, so the entries containing '_' presumably never match a
# single token - confirm and prune.
# NOTE(review): 'tidak' is also in negation_words, so one occurrence raises
# both neg_score and has_neg; confirm that double counting is intended.
neg_lex = ['rusak','buruk','kecewa','jelek','hilang','panas','ganggu','cacat','tidak','pengecew','gagal','tidakfungsi','tidak_berfungsi','tidak_ada']

# Stem the lexicon entries so they line up with the stemmer's token output
pos_lex_stem = {stemmer.stem(word) for word in pos_lex}
neg_lex_stem = {stemmer.stem(word) for word in neg_lex}

In [None]:
# --- FIX ---
# Replaces a single lexicon score with separate positive and negative counts,
# both of which are non-negative.
def lexicon_counts(stems):
    """Count how many stems appear in the positive and negative lexicons.

    Args:
        stems: Sequence of stemmed tokens for one review.

    Returns:
        Tuple ``(pos_count, neg_count)`` of non-negative integers.
    """
    positive_hits = [stem for stem in stems if stem in pos_lex_stem]
    negative_hits = [stem for stem in stems if stem in neg_lex_stem]
    return len(positive_hits), len(negative_hits)

def has_negation(text):
    """Return 1 if any negation word occurs in ``text`` as a space-delimited word, else 0.

    The text is padded with one space on each side so that words at the very
    start or end are matched too. This is a crude substring check: only plain
    spaces count as word boundaries.

    Args:
        text: Cleaned review string.

    Returns:
        1 when a word from ``negation_words`` is found, otherwise 0.
    """
    padded = f' {text} '
    return int(any(f' {neg} ' in padded for neg in negation_words))

In [None]:
# Attach the lexicon-based features to both the training and evaluation frames
for frame in (df, eval_df):
    pos_counts, neg_counts = zip(*frame['tokens'].apply(lexicon_counts))
    frame['pos_score'] = pos_counts
    frame['neg_score'] = neg_counts
    frame['has_neg'] = frame['clean'].apply(has_negation)

# ------------- TF-IDF vectorization with n-grams -------------
# The vectorizer is fitted on the cleaned (unstemmed) text of the training set.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.85, sublinear_tf=True)
X_tfidf = vectorizer.fit_transform(df['clean'])
y = df['sentiment'].values

# --- FIX ---
# Append the non-negative numeric features (pos_score, neg_score, has_neg) as
# three extra columns to the right of the TF-IDF matrix.
extra_feature_cols = ['pos_score', 'neg_score', 'has_neg']
X_extra = csr_matrix(np.vstack([df[col].values for col in extra_feature_cols]).T)
X = hstack([X_tfidf, X_extra])

In [None]:
# ------------- GridSearch / CV for NB and SVM -------------
# NOTE(review): the TF-IDF vectorizer was fitted on every training row before
# this cross-validation, so each validation fold has already influenced the
# vocabulary and IDF weights. Treat the CV scores below as optimistic.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# NB tuning (should no longer error now that every input feature is non-negative)
nb = MultinomialNB()
params_nb = {'alpha': [0.01, 0.1, 0.5, 1.0]}
gs_nb = GridSearchCV(nb, params_nb, cv=cv, scoring='accuracy')
gs_nb.fit(X, y)
best_nb = gs_nb.best_estimator_

# SVM tuning over C and class_weight.
# dual=True fits this data (10 samples, far more n-gram features); scikit-learn
# recommends dual=False only when n_samples > n_features. The dual solver
# shuffles the data, so random_state is pinned to keep runs reproducible.
svm = LinearSVC(max_iter=5000, dual=True, random_state=42)
params_svm = {'C': [0.01, 0.1, 1, 10, 100], 'class_weight': [None, 'balanced']}
gs_svm = GridSearchCV(svm, params_svm, cv=cv, scoring='accuracy')
gs_svm.fit(X, y)
best_svm = gs_svm.best_estimator_

print("Best NB:", gs_nb.best_params_, "cv score:", gs_nb.best_score_)
print("Best SVM:", gs_svm.best_params_, "cv score:", gs_svm.best_score_)

Best NB: {'alpha': 0.01} cv score: 0.875
Best SVM: {'C': 0.01, 'class_weight': None} cv score: 0.875


In [None]:
# ------------- Ensemble voting (hard) -------------
# best_nb and best_svm are GridSearchCV.best_estimator_ objects; with the
# default refit they are already fitted on the full training data, so they are
# not fitted again here. VotingClassifier fits its own copies in .fit().
# NOTE(review): with only two hard voters a 1-1 split is a tie; confirm the
# tie-breaking rule (it appears to favour the lower class label, i.e. 0) or add
# a third estimator.
voting = VotingClassifier(estimators=[('nb', best_nb), ('svm', best_svm)], voting='hard')
voting.fit(X, y)

# ------------- prepare eval features -------------
# Same column layout as the training matrix: TF-IDF columns followed by the
# non-negative pos_score, neg_score and has_neg columns.
X_eval_tfidf = vectorizer.transform(eval_df['clean'])
extra_feature_cols = ['pos_score', 'neg_score', 'has_neg']
X_eval_extra = csr_matrix(np.vstack([eval_df[col].values for col in extra_feature_cols]).T)
X_eval = hstack([X_eval_tfidf, X_eval_extra])

# Predict with each model and store the human-readable label directly, so the
# prediction columns never hold a mix of numeric and text values.
label_map = {0: 'Negatif', 1: 'Positif'}
models_by_column = {'NB_pred': best_nb, 'SVM_pred': best_svm, 'VOTE_pred': voting}
for col, model in models_by_column.items():
    predicted = pd.Series(model.predict(X_eval), index=eval_df.index)
    eval_df[col] = predicted.map(label_map)

print("\n=== Hasil Prediksi Sentimen pada Dataset Evaluasi ===\n")
print(eval_df[['review','NB_pred','SVM_pred','VOTE_pred']].to_string(index=False))


=== Hasil Prediksi Sentimen pada Dataset Evaluasi ===

                                         review NB_pred SVM_pred VOTE_pred
Kameranya luar biasa, detail foto sangat tajam. Positif  Positif   Positif
      Baterai cepat sekali habis, bikin kecewa. Negatif  Negatif   Negatif
      Desain ponselnya keren, ringan dan tipis. Positif  Positif   Positif
       Sinyal sering hilang, sangat merepotkan. Negatif  Negatif   Negatif
Layar AMOLED-nya jernih, nyaman dipakai nonton. Positif  Positif   Positif
         Charger tidak berfungsi, produk cacat. Negatif  Negatif   Negatif
          Speaker suaranya mantap, bass terasa. Positif  Positif   Positif
     Aplikasi sering force close, tidak stabil. Negatif  Negatif   Negatif
          Body kokoh, terasa premium di tangan. Positif  Positif   Positif
               Harga mahal tapi kualitas buruk. Negatif  Negatif   Negatif


In [None]:
# ------------- error analysis on the training data (example) -------------
# Predict the training rows with the voting classifier and list every row
# whose prediction differs from its true label.
y_pred_train = voting.predict(X)
mis_idx = np.where(y_pred_train != y)[0]

print("\n-- Misclassified in train set (index, true, pred, review) --")
if len(mis_idx) == 0:
    print("No misclassifications found in the training set.")
for i in mis_idx:
    print(i, y[i], y_pred_train[i], df.loc[i, 'review'])


-- Misclassified in train set (index, true, pred, review) --
4 1 0 Mantap, sesuai ekspektasi. Tidak mengecewakan.


In [None]:
# ------------- inspection: top SVM features -------------
# Reads the coefficient vector of the fitted linear SVM and prints the 15
# largest and 15 smallest coefficients with their feature names. Any failure
# is reported instead of raised.
try:
    # TF-IDF vocabulary first, then the three appended numeric features
    feat_names = [*vectorizer.get_feature_names_out(), 'pos_score', 'neg_score', 'has_neg']
    coef = best_svm.coef_[0]
    ascending = np.argsort(coef)
    top_pos = ascending[-15:][::-1]
    top_neg = ascending[:15]
    sections = (
        ("\nTop positive features (SVM):", top_pos),
        ("\nTop negative features (SVM):", top_neg),
    )
    for heading, indices in sections:
        print(heading)
        for idx in indices:
            print(feat_names[idx], f"({coef[idx]:.3f})")
except Exception as e:
    print("Tidak bisa mengekstrak fitur SVM:", e)


Top positive features (SVM):
pos_score (0.126)
tidak mengecewakan (0.007)
sesuai ekspektasi (0.007)
ekspektasi (0.007)
ekspektasi tidak (0.007)
mantap (0.007)
mantap sesuai (0.007)
mengecewakan (0.007)
pengiriman cepat (0.005)
packing (0.005)
pengiriman (0.005)
packing aman (0.005)
aman produk (0.005)
aman (0.005)
cepat packing (0.005)

Top negative features (SVM):
neg_score (-0.100)
has_neg (-0.010)
kualitas suara (-0.006)
kualitas (-0.006)
suara (-0.006)
speakernya jelek (-0.006)
jelek sekali (-0.006)
jelek (-0.006)
suara speakernya (-0.006)
speakernya (-0.006)
sekali (-0.006)
merespon sangat (-0.005)
mengganggu (-0.005)
merespon (-0.005)
layar sering (-0.005)
