In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score


In [6]:
# Load cleaned dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Analisis_Sentimen_DBS/Analisis-Sentimen/cleaned_dataset.csv")
df.dropna(subset=['stemmed_review', 'label'], inplace=True)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])  # hasilkan angka 0,1,2
y_cat = to_categorical(y)  # untuk softmax

# Tokenisasi dan padding
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['stemmed_review'])
sequences = tokenizer.texts_to_sequences(df['stemmed_review'])

max_len = 100  # atau bisa analisis distribusi panjang review
X = pad_sequences(sequences, maxlen=max_len, padding='post')

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y_cat
)


In [7]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))  # 3 kelas: negatif, netral, positif

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train model
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)




Epoch 1/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 308ms/step - accuracy: 0.7343 - loss: 0.7485 - val_accuracy: 0.7829 - val_loss: 0.6405
Epoch 2/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 239ms/step - accuracy: 0.7653 - loss: 0.6733 - val_accuracy: 0.7829 - val_loss: 0.6381
Epoch 3/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 274ms/step - accuracy: 0.7557 - loss: 0.6890 - val_accuracy: 0.7829 - val_loss: 0.6365
Epoch 4/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 277ms/step - accuracy: 0.7646 - loss: 0.6770 - val_accuracy: 0.7829 - val_loss: 0.6490
Epoch 5/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 253ms/step - accuracy: 0.7585 - loss: 0.6823 - val_accuracy: 0.7829 - val_loss: 0.6421
Epoch 6/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 231ms/step - accuracy: 0.7594 - loss: 0.6723 - val_accuracy: 0.7829 - val_loss: 0.6413
Epoch 7/10
[1m97/97[

In [8]:
# Predict
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Decode label angka jadi nama
labels = label_encoder.classes_

# Accuracy dan Report
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred, target_names=labels))

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 89ms/step
Accuracy: 0.7683557394002068
Classification Report:
               precision    recall  f1-score   support

     negatif       0.00      0.00      0.00       360
      netral       0.00      0.00      0.00        88
     positif       0.77      1.00      0.87      1486

    accuracy                           0.77      1934
   macro avg       0.26      0.33      0.29      1934
weighted avg       0.59      0.77      0.67      1934



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
def predict_sentiment(text):
    # Preprocessing ringan
    text = text.lower()

    # Tokenisasi dan padding
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Prediksi
    pred_prob = model.predict(padded)
    pred_class = np.argmax(pred_prob, axis=1)
    label = label_encoder.inverse_transform(pred_class)

    return label[0]

# Contoh penggunaan
contoh_review = "Aplikasinya sangat berguna dan mudah digunakan"
hasil_sentimen = predict_sentiment(contoh_review)

print(f"Teks: {contoh_review}")
print(f"Prediksi Sentimen: {hasil_sentimen}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Teks: Aplikasinya sangat berguna dan mudah digunakan
Prediksi Sentimen: positif
