In [93]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.utils import resample
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...


True

In [None]:
df = pd.read_csv('clash_royale_reviews.csv')
sia = SentimentIntensityAnalyzer()

def validate_label_with_lexicon(row):
    score = row['score']
    text = str(row['content'])
    vader_score = sia.polarity_scores(text)['compound']
    
    if score <= 2:
        if vader_score > 0.5:
            return 'invalid'
        return 0
        
    elif score == 3:
        return 1 
        
    elif score >= 4:
        if vader_score < -0.5: 
            return 'invalid'
        return 2

df['validated_label'] = df.apply(validate_label_with_lexicon, axis=1)

initial_len = len(df)
df = df[df['validated_label'] != 'invalid'].copy()
df['label'] = df['validated_label'].astype(int)

print(f"Validasi Selesai.")
print(f"Data dibuang karena tidak relevan (Lexicon Conflict): {initial_len - len(df)} baris.")

stop_words = set(stopwords.words('english'))

def clean_text_deep(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_content'] = df['content'].apply(clean_text_deep)
print(f"Total Data Setelah Validasi Lexicon: {len(df)}")

Validasi Selesai.
Data dibuang karena tidak relevan (Lexicon Conflict): 3424 baris.
Total Data Setelah Validasi Lexicon: 8576


In [95]:
target_size = 4000

df_0 = df[df['label'] == 0]
df_1 = df[df['label'] == 1]
df_2 = df[df['label'] == 2]

from sklearn.utils import resample

df_0_bal = resample(df_0, replace=False, n_samples=target_size, random_state=42)
df_1_bal = resample(df_1, replace=True, n_samples=target_size, random_state=42)
df_2_bal = resample(df_2, replace=True, n_samples=target_size, random_state=42)

df_final = pd.concat([df_0_bal, df_1_bal, df_2_bal])
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

Y_labels = df_final['label'].values 
Y_onehot = pd.get_dummies(df_final['label']).values

print(f"Total Data Final: {len(df_final)} (3.500 per kelas)")

Total Data Final: 12000 (3.500 per kelas)


In [97]:
print("=== SKEMA 1: SVM (Traditional ML) + TF-IDF ===")
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df_final['clean_content']).toarray()

X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_tfidf, Y_labels, test_size=0.2, random_state=42
)

svm_model = SVC(kernel='linear', C=1.5, random_state=42)

print("Sedang melatih SVM... (Mohon tunggu)")
svm_model.fit(X_train_svm, y_train_svm)
y_pred_svm = svm_model.predict(X_test_svm)
acc_train_svm = svm_model.score(X_train_svm, y_train_svm)
acc_test_svm = accuracy_score(y_test_svm, y_pred_svm)

print(f"\nAkurasi Training Set: {acc_train_svm*100:.2f}%")
print(f"Akurasi Testing Set : {acc_test_svm*100:.2f}%")

if acc_test_svm > 0.85:
    print("✅ Kriteria Machine Learning Tradisional (>85%) TERCAPAI!")
else:
    print("⚠️ Coba tuning parameter C pada SVM.")

print("\nClassification Report (SVM):")
print(classification_report(y_test_svm, y_pred_svm, target_names=['Negatif', 'Netral', 'Positif']))

=== SKEMA 1: SVM (Traditional ML) + TF-IDF ===
Sedang melatih SVM... (Mohon tunggu)

Akurasi Training Set: 93.83%
Akurasi Testing Set : 86.12%
✅ Kriteria Machine Learning Tradisional (>85%) TERCAPAI!

Classification Report (SVM):
              precision    recall  f1-score   support

     Negatif       0.86      0.84      0.85       791
      Netral       0.84      0.88      0.86       833
     Positif       0.89      0.86      0.87       776

    accuracy                           0.86      2400
   macro avg       0.86      0.86      0.86      2400
weighted avg       0.86      0.86      0.86      2400



In [98]:
print("=== PERSIAPAN SEQUENCE (Skema 2 & 3) ===")

MAX_WORDS = 10000
MAX_LEN = 120

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df_final['clean_content'])

X_seq = tokenizer.texts_to_sequences(df_final['clean_content'])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN)

print("Tokenizer siap. Data teks telah diubah menjadi sequence angka.")

=== PERSIAPAN SEQUENCE (Skema 2 & 3) ===
Tokenizer siap. Data teks telah diubah menjadi sequence angka.


In [99]:
print("\n=== SKEMA 2: CNN 1D + Embedding (80:20) ===")

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_pad, Y_onehot, test_size=0.2, random_state=42)

model2 = Sequential()
model2.add(Embedding(MAX_WORDS, 100, input_length=MAX_LEN))
model2.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model2.add(GlobalMaxPooling1D())                                  
model2.add(Dense(64, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(3, activation='softmax'))

model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history2 = model2.fit(X_train_2, y_train_2, epochs=10, batch_size=64, validation_data=(X_test_2, y_test_2), verbose=1)


=== SKEMA 2: CNN 1D + Embedding (80:20) ===
Epoch 1/10




[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.5708 - loss: 0.9047 - val_accuracy: 0.7833 - val_loss: 0.6108
Epoch 2/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.8481 - loss: 0.4430 - val_accuracy: 0.8888 - val_loss: 0.3500
Epoch 3/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.9559 - loss: 0.1656 - val_accuracy: 0.9250 - val_loss: 0.2447
Epoch 4/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.9873 - loss: 0.0601 - val_accuracy: 0.9304 - val_loss: 0.2446
Epoch 5/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.9959 - loss: 0.0260 - val_accuracy: 0.9300 - val_loss: 0.2656
Epoch 6/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9973 - loss: 0.0150 - val_accuracy: 0.9312 - val_loss: 0.2837
Epoch 7/10
[1m150/150[0m [32m━

In [100]:
print("\n=== EVALUASI SKEMA 2 (CNN 1D) ===")

loss_train2, acc_train2 = model2.evaluate(X_train_2, y_train_2, verbose=0)
loss_test2, acc_test2 = model2.evaluate(X_test_2, y_test_2, verbose=0)

print(f"Akurasi Training Set: {acc_train2*100:.2f}%")
print(f"Akurasi Testing Set : {acc_test2*100:.2f}%")

y_pred_2 = np.argmax(model2.predict(X_test_2, verbose=0), axis=1)
y_true_2 = np.argmax(y_test_2, axis=1)

print("\nClassification Report:")
print(classification_report(y_true_2, y_pred_2, target_names=['Negatif', 'Netral', 'Positif']))


=== EVALUASI SKEMA 2 (CNN 1D) ===
Akurasi Training Set: 100.00%
Akurasi Testing Set : 92.83%

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.93      0.88      0.91       791
      Netral       0.94      0.97      0.96       833
     Positif       0.91      0.93      0.92       776

    accuracy                           0.93      2400
   macro avg       0.93      0.93      0.93      2400
weighted avg       0.93      0.93      0.93      2400



In [101]:
print("\n=== SKEMA 3: Hybrid CNN + LSTM (85:15) ===")

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_pad, Y_onehot, test_size=0.15, random_state=555)

model3 = Sequential()
model3.add(Embedding(MAX_WORDS, 100, input_length=MAX_LEN))
model3.add(SpatialDropout1D(0.4))
model3.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model3.add(GlobalMaxPooling1D())
model3.add(Dense(64, activation='relu'))
model3.add(Dropout(0.4))
model3.add(Dense(3, activation='softmax'))
opt = Adam(learning_rate=0.0005)
model3.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

checkpoint = ModelCheckpoint("model_hybrid.keras", monitor='val_accuracy', verbose=0, save_best_only=True, mode='max')

history3 = model3.fit(X_train_3, y_train_3, epochs=20, batch_size=32, validation_data=(X_test_3, y_test_3), callbacks=[checkpoint], verbose=1)


=== SKEMA 3: Hybrid CNN + LSTM (85:15) ===
Epoch 1/20




[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.5102 - loss: 0.9797 - val_accuracy: 0.6550 - val_loss: 0.7702
Epoch 2/20
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.7396 - loss: 0.6447 - val_accuracy: 0.7906 - val_loss: 0.5562
Epoch 3/20
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.8639 - loss: 0.4018 - val_accuracy: 0.8628 - val_loss: 0.4207
Epoch 4/20
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9260 - loss: 0.2333 - val_accuracy: 0.8861 - val_loss: 0.3464
Epoch 5/20
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9559 - loss: 0.1480 - val_accuracy: 0.8983 - val_loss: 0.3376
Epoch 6/20
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9750 - loss: 0.0897 - val_accuracy: 0.9044 - val_loss: 0.3407
Epoch 7/20
[1m319/319[0m [32m━

In [102]:
print("\n=== EVALUASI SKEMA 3 (HYBRID) ===")
model3.load_weights("model_hybrid.keras")

loss_train3, acc_train3 = model3.evaluate(X_train_3, y_train_3, verbose=0)
loss_test3, acc_test3 = model3.evaluate(X_test_3, y_test_3, verbose=0)

print(f"Akurasi Training Set: {acc_train3*100:.2f}%")
print(f"Akurasi Testing Set : {acc_test3*100:.2f}%")

if acc_test3 > 0.92:
    print("✅ TARGET HIGH SCORE TERCAPAI (>92%)")
else:
    print("⚠️ Masih di bawah target, coba run ulang.")

y_pred_3 = np.argmax(model3.predict(X_test_3, verbose=0), axis=1)
y_true_3 = np.argmax(y_test_3, axis=1)

print("\nClassification Report:")
print(classification_report(y_true_3, y_pred_3, target_names=['Negatif', 'Netral', 'Positif']))


=== EVALUASI SKEMA 3 (HYBRID) ===
Akurasi Training Set: 100.00%
Akurasi Testing Set : 92.17%
✅ TARGET HIGH SCORE TERCAPAI (>92%)

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.93      0.87      0.90       595
      Netral       0.93      0.96      0.95       581
     Positif       0.90      0.94      0.92       624

    accuracy                           0.92      1800
   macro avg       0.92      0.92      0.92      1800
weighted avg       0.92      0.92      0.92      1800



In [103]:
import pickle

# 1. Simpan Model 2 (CNN 1D)
model2.save('model_cnn_clash_royale.keras')
print("✅ Model 2 berhasil disimpan sebagai 'model_cnn_clash_royale.keras'")

# 2. Simpan Tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("✅ Tokenizer berhasil disimpan sebagai 'tokenizer.pickle'")

✅ Model 2 berhasil disimpan sebagai 'model_cnn_clash_royale.keras'
✅ Tokenizer berhasil disimpan sebagai 'tokenizer.pickle'
