# Install & Import Dependencies

In [8]:
!pip install ipython-autotime -q
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 3.23 s (started: 2025-02-06 12:12:45 +00:00)


In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Concatenate
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import AdamW
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

time: 998 µs (started: 2025-02-06 12:12:49 +00:00)


# Load Dataset

In [10]:
# Pre-processed review dataset. The URL tracks the `main` branch, so the data
# can change between runs; pin a commit hash in the path for reproducibility.
url = 'https://raw.githubusercontent.com/Fake-Sentiment-Review-Detection-Spiill/spiill-data-and-utils/refs/heads/main/spiill_reviews_preprocessed2.csv'
# The first CSV column is the exported row index (it surfaces as "Unnamed: 0"
# when parsed as data), so read it as the index instead of a feature column.
df_reviews = pd.read_csv(url, index_col=0)
df_reviews.tail(3)

Unnamed: 0,username,rate,date,label,sentimen,reviews,produk,no_link_hashtag_reviews,no_emojis_reviews,no_html_reviews,no_special_char_reviews,no_number_reviews,no_punct_reviews,no_whitespace_reviews_reviews,lowercase_reviews,tokens_reviews,no_stopwords_reviews,normalized_reviews,stemmed_reviews
5897,eiswahyudi,3,2024-03-17,non,neutral,Pasar saham Indonesia menunjukkan tren yang st...,Masker Skintific,Pasar saham Indonesia menunjukkan tren yang st...,Pasar saham Indonesia menunjukkan tren yang st...,Pasar saham Indonesia menunjukkan tren yang st...,Pasar saham Indonesia menunjukkan tren yang st...,Pasar saham Indonesia menunjukkan tren yang st...,Pasar saham Indonesia menunjukkan tren yang st...,Pasar saham Indonesia menunjukkan tren yang st...,pasar saham indonesia menunjukkan tren yang st...,"['pasar', 'saham', 'indonesia', 'menunjukkan',...","['pasar', 'saham', 'indonesia', 'tren', 'stabil']","['pasar', 'saham', 'indonesia', 'tren', 'stabil']","['pasar', 'saham', 'indonesia', 'tren', 'stabil']"
5898,dkusumo,5,2023-09-18,non,neutral,Penggunaan kendaraan listrik mulai diminati di...,Masker Skintific,Penggunaan kendaraan listrik mulai diminati di...,Penggunaan kendaraan listrik mulai diminati di...,Penggunaan kendaraan listrik mulai diminati di...,Penggunaan kendaraan listrik mulai diminati di...,Penggunaan kendaraan listrik mulai diminati di...,Penggunaan kendaraan listrik mulai diminati di...,Penggunaan kendaraan listrik mulai diminati di...,penggunaan kendaraan listrik mulai diminati di...,"['penggunaan', 'kendaraan', 'listrik', 'mulai'...","['penggunaan', 'kendaraan', 'listrik', 'dimina...","['penggunaan', 'kendaraan', 'listrik', 'dimina...","['guna', 'kendara', 'listrik', 'mati', 'kota']"
5899,adisti,3,2023-09-11,non,neutral,Lumayan lancar dengan berjalan baik,Masker Skintific,Lumayan lancar dengan berjalan baik,Lumayan lancar dengan berjalan baik,Lumayan lancar dengan berjalan baik,Lumayan lancar dengan berjalan baik,Lumayan lancar dengan berjalan baik,Lumayan lancar dengan berjalan baik,Lumayan lancar dengan berjalan baik,lumayan lancar dengan berjalan baik,"['lumayan', 'lancar', 'dengan', 'berjalan', 'b...","['lumayan', 'lancar', 'berjalan']","['lumayan', 'lancar', 'berjalan']","['lumayan', 'lancar', 'jalan']"


time: 358 ms (started: 2025-02-06 12:12:49 +00:00)


In [11]:
# Number of reviews per sentiment class (checks class balance before modeling).
df_reviews['sentimen'].value_counts()

sentimen
positive    3686
neutral     1352
negative     862
Name: count, dtype: int64

time: 3.47 ms (started: 2025-02-06 12:12:49 +00:00)


In [12]:
# Number of reviews per authenticity label (checks class balance before modeling).
df_reviews['label'].value_counts()

label
trusted    3912
non        1150
fake        838
Name: count, dtype: int64

time: 11.5 ms (started: 2025-02-06 12:12:49 +00:00)


# Modeling

## Model Function Preparation

In [13]:
def preprocess_data(df, embedding_dim=100):
    """Turn the `stemmed_reviews` column into padded index sequences and embeddings.

    Steps:
      1. Parse every `stemmed_reviews` cell into a list of tokens. The column is
         overwritten IN PLACE on `df`; other cells of this notebook iterate the
         column as token lists afterwards, so this side effect is intentional.
      2. Train a Word2Vec model on all token lists.
      3. Fit a Keras `Tokenizer` on the same lists and convert them to integer
         sequences, post-padded with zeros to the longest sequence.
      4. Build an embedding matrix whose row `i` holds the Word2Vec vector of
         the token with tokenizer index `i`.

    Args:
        df: DataFrame with a `stemmed_reviews` column containing token lists or
            their string representation (e.g. "['pasar', 'saham']").
        embedding_dim: Size of the Word2Vec vectors / width of the embedding
            matrix. Defaults to 100, the value previously hard-coded.

    Returns:
        Tuple `(padded_sequences, embedding_matrix, vocab_size, max_len, tokenizer)`.
        Row 0 of `embedding_matrix` (the padding index) and rows of tokens that
        Word2Vec does not know stay all-zero.

    Raises:
        ValueError: If no row yields any token, since neither Word2Vec nor the
            padding step can work on an empty corpus.

    NOTE(review): Keras' Tokenizer lower-cases tokens by default while Word2Vec
    keeps them verbatim, so tokens containing capitals would get zero rows.
    The input column appears to be lower-cased already - confirm.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    def safe_eval(x):
        """Coerce one cell into a list of non-empty token strings."""
        if isinstance(x, list):
            # Already parsed (e.g. the function is run a second time).
            return x
        if not isinstance(x, str):
            # NaN / None / other scalars carry no tokens.
            return []
        try:
            # literal_eval only accepts Python literals, so it is safe on data
            # and, unlike a comma split, keeps tokens that contain commas intact.
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            tokens = [str(token).strip() for token in parsed]
        else:
            # Not a list literal: fall back to the lenient "strip the brackets,
            # split on commas, drop the quotes" parsing.
            tokens = [
                word.strip().strip("'").strip('"')
                for word in x.strip('[]').split(',')
            ]
        return [token for token in tokens if token]

    # Convert stemmed_reviews to list format (in place, see docstring).
    df['stemmed_reviews'] = df['stemmed_reviews'].apply(safe_eval)

    # One token list per review.
    texts = df['stemmed_reviews'].tolist()
    if not any(texts):
        raise ValueError(
            "preprocess_data: 'stemmed_reviews' contains no tokens; "
            "cannot train embeddings on an empty corpus."
        )

    # Train Word2Vec on the whole corpus.
    # min_count=1 keeps every token so each tokenizer entry can get a vector.
    word2vec_model = Word2Vec(texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)

    # Map tokens to integer ids (index 0 is reserved for padding).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    # Pad every sequence with trailing zeros up to the longest review.
    max_len = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

    # Embedding matrix aligned with the tokenizer's indices.
    vocab_size = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]

    return padded_sequences, embedding_matrix, vocab_size, max_len, tokenizer

def create_cnn_model(vocab_size, embedding_matrix, max_len,
                     n_sentiment_classes=3, n_label_classes=3, learning_rate=1e-5):
    """Build and compile the two-headed 1D-CNN text classifier.

    Architecture: frozen pre-trained embedding -> Conv1D(128, 5) -> MaxPooling1D(2)
    -> Conv1D(128, 5) -> GlobalMaxPooling1D -> two softmax heads named
    'sentiment' and 'label' that share the pooled features.

    Args:
        vocab_size: Number of rows of the embedding table (tokenizer size + 1).
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim) with the
            pre-trained vectors; the embedding width is read from its shape.
        max_len: Length of the padded input sequences.
        n_sentiment_classes: Units of the 'sentiment' head (default 3).
        n_label_classes: Units of the 'label' head (default 3).
        learning_rate: AdamW learning rate (default 1e-5, the previous constant).

    Returns:
        A compiled `Model` with categorical cross-entropy loss and accuracy
        metric on both heads.

    Raises:
        ValueError: If `embedding_matrix` does not have `vocab_size` rows, or if
            `max_len` is too short for the two un-padded convolutions.
    """
    kernel_size = 5
    pool_size = 2

    embedding_matrix = np.asarray(embedding_matrix)
    if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] != vocab_size:
        raise ValueError(
            f"embedding_matrix must have shape (vocab_size={vocab_size}, dim); "
            f"got {embedding_matrix.shape}."
        )
    # Take the width from the matrix so it can never disagree with the weights.
    embedding_dim = embedding_matrix.shape[1]

    # Both convolutions use 'valid' padding: conv1 shortens the sequence by
    # kernel_size - 1, pooling divides it by pool_size, and conv2 then needs at
    # least kernel_size steps. Fail early with a readable message.
    pooled_len = (max_len - kernel_size + 1) // pool_size
    if pooled_len < kernel_size:
        min_len = kernel_size * pool_size + kernel_size - 1
        raise ValueError(
            f"max_len={max_len} is too short for this architecture; "
            f"at least {min_len} time steps are required."
        )

    # Input layer: one integer token id per time step.
    input_layer = Input(shape=(max_len,))

    # Frozen embedding initialised from the pre-trained vectors. A Constant
    # initializer is used and the `input_length` argument is dropped because
    # Keras 3 deprecates `input_length`.
    embedding_layer = Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False
    )(input_layer)

    # First Conv1D layer followed by down-sampling.
    conv1 = Conv1D(filters=128, kernel_size=kernel_size, activation='relu')(embedding_layer)
    pool1 = MaxPooling1D(pool_size=pool_size)(conv1)

    # Second Conv1D layer, then collapse the time axis.
    conv2 = Conv1D(filters=128, kernel_size=kernel_size, activation='relu')(pool1)
    global_pool = GlobalMaxPooling1D()(conv2)

    # Output heads; the layer names are the keys used for losses, metrics and
    # the target dictionaries passed to `fit`.
    sentiment_output = Dense(n_sentiment_classes, activation='softmax', name='sentiment')(global_pool)
    label_output = Dense(n_label_classes, activation='softmax', name='label')(global_pool)

    model = Model(
        inputs=input_layer,
        outputs=[sentiment_output, label_output]
    )

    optimizer = AdamW(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss={
            'sentiment': 'categorical_crossentropy',
            'label': 'categorical_crossentropy'
        },
        metrics={
            'sentiment': 'accuracy',
            'label': 'accuracy'
        }
    )

    return model

def train_cnn_model(df, test_size=0.2, random_state=42, epochs=1000, batch_size=32, verbose=1):
    """Preprocess `df`, train the two-headed CNN and return everything needed later.

    Args:
        df: DataFrame with `stemmed_reviews`, `sentimen` and `label` columns.
            `preprocess_data` rewrites `stemmed_reviews` in place.
        test_size: Fraction of rows held out (default 0.2).
        random_state: Seed of the train/held-out split (default 42).
        epochs: Upper bound on training epochs; early stopping normally ends
            training much sooner (default 1000).
        batch_size: Mini-batch size (default 32).
        verbose: Verbosity passed to `model.fit` (default 1).

    Returns:
        Tuple `(model, history, sentiment_encoder, label_encoder, tokenizer,
        max_len, X_test, y_sentiment_test, y_label_test)`; the `y_*_test`
        arrays are one-hot encoded.

    NOTE(review): the tokenizer and Word2Vec vectors are fitted on the full
    frame before the split, and the held-out split doubles as the validation
    set that drives early stopping / LR scheduling. Metrics computed on
    `X_test` afterwards are therefore likely optimistic - consider a separate
    validation split. The split is also not stratified.
    """
    # Text -> padded index sequences + pre-trained embedding matrix.
    X, embedding_matrix, vocab_size, max_len, tokenizer = preprocess_data(df)

    # Encode both string targets as integers, then as one-hot vectors
    # (required by the categorical cross-entropy losses).
    sentiment_encoder = LabelEncoder()
    label_encoder = LabelEncoder()

    y_sentiment = sentiment_encoder.fit_transform(df['sentimen'])
    y_label = label_encoder.fit_transform(df['label'])

    y_sentiment = tf.keras.utils.to_categorical(y_sentiment)
    y_label = tf.keras.utils.to_categorical(y_label)

    # A single call splits features and both targets so the rows stay aligned.
    X_train, X_test, y_sentiment_train, y_sentiment_test, y_label_train, y_label_test = train_test_split(
        X, y_sentiment, y_label, test_size=test_size, random_state=random_state
    )

    # Stop once the summed validation loss has not improved for 10 epochs and
    # roll back to the best weights seen.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        mode='min'
    )

    # Cut the learning rate to 20% after 5 stagnant epochs, down to 1e-7.
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=1e-7,
        mode='min'
    )

    model = create_cnn_model(vocab_size, embedding_matrix, max_len)

    # Targets are passed as dicts keyed by the output-layer names.
    history = model.fit(
        X_train,
        {
            'sentiment': y_sentiment_train,
            'label': y_label_train
        },
        validation_data=(
            X_test,
            {
                'sentiment': y_sentiment_test,
                'label': y_label_test
            }
        ),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, reduce_lr],
        verbose=verbose
    )

    return (model, history, sentiment_encoder, label_encoder, tokenizer, max_len,
            X_test, y_sentiment_test, y_label_test)

def evaluate_model(model, X_test, y_sentiment_test, y_label_test, sentiment_encoder, label_encoder):
    """Report per-class metrics and confusion matrices for both output heads.

    Prints a text classification report for the sentiment and label heads,
    shows one confusion-matrix heatmap per head, and returns the same
    information in machine-readable form.

    Args:
        model: Trained model whose `predict` returns `(sentiment_probs, label_probs)`.
        X_test: Padded input sequences to evaluate on.
        y_sentiment_test: One-hot sentiment targets for `X_test`.
        y_label_test: One-hot label targets for `X_test`.
        sentiment_encoder: Fitted encoder; `classes_` provides the class names.
        label_encoder: Fitted encoder; `classes_` provides the class names.

    Returns:
        Dict with keys 'sentiment_report', 'label_report' (classification
        reports as dicts) and 'sentiment_cm', 'label_cm' (confusion matrices).
    """
    sentiment_pred, label_pred = model.predict(X_test)

    def summarize(y_true_onehot, y_pred_probs, encoder):
        """Compute the text report, dict report and confusion matrix of one head."""
        true_classes = np.argmax(y_true_onehot, axis=1)
        pred_classes = np.argmax(y_pred_probs, axis=1)
        class_names = encoder.classes_
        # Passing every encoded class id explicitly keeps the reports and the
        # matrix aligned with `class_names` even when a class is absent from
        # both the targets and the predictions; without it scikit-learn raises
        # a target_names length mismatch in that case.
        class_ids = np.arange(len(class_names))
        report_args = dict(labels=class_ids, target_names=class_names, zero_division=0)
        return {
            'names': class_names,
            'text': classification_report(true_classes, pred_classes, **report_args),
            'report': classification_report(true_classes, pred_classes,
                                            output_dict=True, **report_args),
            'cm': confusion_matrix(true_classes, pred_classes, labels=class_ids),
        }

    def plot_confusion_matrix(cm, classes, title):
        """Draw one annotated confusion-matrix heatmap."""
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=classes, yticklabels=classes)
        plt.title(title)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.show()

    sentiment = summarize(y_sentiment_test, sentiment_pred, sentiment_encoder)
    label = summarize(y_label_test, label_pred, label_encoder)

    # Text reports first, then the figures.
    print("\nSentiment Classification Report:")
    print(sentiment['text'])

    print("\nLabel Classification Report:")
    print(label['text'])

    plot_confusion_matrix(sentiment['cm'], sentiment['names'], 'Sentiment Confusion Matrix')
    plot_confusion_matrix(label['cm'], label['names'], 'Label Confusion Matrix')

    # Each confusion matrix is computed once and reused for plot and result.
    return {
        'sentiment_report': sentiment['report'],
        'label_report': label['report'],
        'sentiment_cm': sentiment['cm'],
        'label_cm': label['cm']
    }

def predict_text(text, model, tokenizer, sentiment_encoder, label_encoder, max_len):
    """Predict the sentiment and the label of a single review.

    Args:
        text: The review to classify; it is handed to `tokenizer` as a
            one-element batch.
        model: Trained model whose `predict` returns `(sentiment_probs, label_probs)`.
        tokenizer: Fitted tokenizer used to map the text to integer ids.
        sentiment_encoder: Fitted encoder that maps class ids back to sentiment names.
        label_encoder: Fitted encoder that maps class ids back to label names.
        max_len: Sequence length the model was built for.

    Returns:
        Tuple `(sentiment, label)` with the most probable class name of each head.

    NOTE(review): in this notebook the tokenizer is fitted on the stemmed,
    stop-word-free token lists, whereas raw text passed here is not stemmed -
    confirm that callers pre-process the text the same way.
    """
    # Encode the single review and zero-pad it at the end to the model's length.
    model_input = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_len,
        padding='post',
    )

    sentiment_probs, label_probs = model.predict(model_input)

    def decode(encoder, probs):
        # Arg-max over the only row of the batch, mapped back to its class name.
        return encoder.inverse_transform([np.argmax(probs[0])])[0]

    return decode(sentiment_encoder, sentiment_probs), decode(label_encoder, label_probs)

time: 3.71 ms (started: 2025-02-06 12:12:49 +00:00)


## Train Model

In [None]:
# Train on the full review frame. Besides the model and its history this keeps
# the encoders, tokenizer, sequence length and held-out arrays that the
# evaluation and prediction cells below need.
(
    model,
    history,
    sentiment_encoder,
    label_encoder,
    tokenizer,
    max_len,
    X_test,
    y_sentiment_test,
    y_label_test,
) = train_cnn_model(df_reviews)



Epoch 1/1000
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - label_accuracy: 0.6385 - label_loss: 0.9970 - loss: 1.9652 - sentiment_accuracy: 0.5124 - sentiment_loss: 0.9683 - val_label_accuracy: 0.6627 - val_label_loss: 0.8785 - val_loss: 1.5934 - val_sentiment_accuracy: 0.6229 - val_sentiment_loss: 0.7156 - learning_rate: 1.0000e-05
Epoch 2/1000
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - label_accuracy: 0.6706 - label_loss: 0.8729 - loss: 1.5650 - sentiment_accuracy: 0.6163 - sentiment_loss: 0.6921 - val_label_accuracy: 0.6627 - val_label_loss: 0.8680 - val_loss: 1.5391 - val_sentiment_accuracy: 0.6297 - val_sentiment_loss: 0.6720 - learning_rate: 1.0000e-05
Epoch 3/1000
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - label_accuracy: 0.6664 - label_loss: 0.8764 - loss: 1.5386 - sentiment_accuracy: 0.6201 - sentiment_loss: 0.6621 - val_label_accuracy: 0.6627 - val_label_loss: 0.8628 - val_loss

In [None]:
# Training parsed `stemmed_reviews` into token lists in place; peek at a few
# rows instead of rendering the whole frame.
df_reviews.head()

## Training History

In [None]:
# One panel per output head, each showing training vs. validation accuracy.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('sentiment', 'Sentiment Accuracy'), ('label', 'Label Accuracy')]
for ax, (head, title) in zip(axes, panels):
    # Keras logs each head's metric as '<head>_accuracy' / 'val_<head>_accuracy'.
    for metric in (f'{head}_accuracy', f'val_{head}_accuracy'):
        ax.plot(history.history[metric], label=metric)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
fig.tight_layout()
plt.show()

## Evaluate Model Performance

In [None]:
# Print the per-class reports, draw the confusion matrices and keep the raw
# numbers for later inspection.
evaluation_metrics = evaluate_model(
    model=model,
    X_test=X_test,
    y_sentiment_test=y_sentiment_test,
    y_label_test=y_label_test,
    sentiment_encoder=sentiment_encoder,
    label_encoder=label_encoder,
)

## Try Predictions

In [None]:
# Smoke-test the trained model on one hand-written Indonesian review.
text = "Sayangnya pengiriman lebih lambat dari jadwal. Produk tetap sampai dengan baik, tetapi semoga pengiriman bisa lebih cepat di masa depan."
sentiment, label = predict_text(
    text=text,
    model=model,
    tokenizer=tokenizer,
    sentiment_encoder=sentiment_encoder,
    label_encoder=label_encoder,
    max_len=max_len,
)
print(f"Sentiment: {sentiment}")
print(f"Label: {label}")

# WordCloud

## Positive

In [None]:
# Show the first row to confirm `stemmed_reviews` now holds token lists.
df_reviews.iloc[:1]

In [None]:
# Word cloud of the stemmed tokens found in reviews with positive sentiment.
# `stemmed_reviews` must already hold token lists, i.e. the training cell
# (which parses the column in place) has to run before this one.
selected_reviews = df_reviews.loc[df_reviews['sentimen'] == 'positive', 'stemmed_reviews']
# Flatten to one token list, skipping purely numeric tokens. A comprehension
# keeps the loop variables out of the notebook's global namespace.
word_list = [word for tokens in selected_reviews for word in tokens if not word.isnumeric()]
x = Counter(word_list)  # token -> number of occurrences
print(len(x))  # number of distinct tokens
# Order the frequency dict alphabetically by token before handing it over.
d = dict(sorted(x.items(), key=lambda item: item[0]))
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(d)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('\nPOSITIVE\n')
plt.show()

## Neutral

In [None]:
# Word cloud of the stemmed tokens found in reviews with neutral sentiment.
# `stemmed_reviews` must already hold token lists, i.e. the training cell
# (which parses the column in place) has to run before this one.
selected_reviews = df_reviews.loc[df_reviews['sentimen'] == 'neutral', 'stemmed_reviews']
# Flatten to one token list, skipping purely numeric tokens. A comprehension
# keeps the loop variables out of the notebook's global namespace.
word_list = [word for tokens in selected_reviews for word in tokens if not word.isnumeric()]
x = Counter(word_list)  # token -> number of occurrences
print(len(x))  # number of distinct tokens
# Order the frequency dict alphabetically by token before handing it over.
d = dict(sorted(x.items(), key=lambda item: item[0]))
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(d)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('\nNEUTRAL\n')
plt.show()

## Negative

In [None]:
# Word cloud of the stemmed tokens found in reviews with negative sentiment.
# `stemmed_reviews` must already hold token lists, i.e. the training cell
# (which parses the column in place) has to run before this one.
selected_reviews = df_reviews.loc[df_reviews['sentimen'] == 'negative', 'stemmed_reviews']
# Flatten to one token list, skipping purely numeric tokens. A comprehension
# keeps the loop variables out of the notebook's global namespace.
word_list = [word for tokens in selected_reviews for word in tokens if not word.isnumeric()]
x = Counter(word_list)  # token -> number of occurrences
print(len(x))  # number of distinct tokens
# Order the frequency dict alphabetically by token before handing it over.
d = dict(sorted(x.items(), key=lambda item: item[0]))
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(d)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('\nNEGATIVE\n')
plt.show()

In [None]:
# Distinct authenticity labels; one word cloud per value follows.
df_reviews['label'].unique()

## Trusted

In [None]:
# Word cloud of the stemmed tokens found in reviews labelled "trusted".
# `stemmed_reviews` must already hold token lists, i.e. the training cell
# (which parses the column in place) has to run before this one.
selected_reviews = df_reviews.loc[df_reviews['label'] == 'trusted', 'stemmed_reviews']
# Flatten to one token list, skipping purely numeric tokens. A comprehension
# keeps the loop variables out of the notebook's global namespace.
word_list = [word for tokens in selected_reviews for word in tokens if not word.isnumeric()]
x = Counter(word_list)  # token -> number of occurrences
print(len(x))  # number of distinct tokens
# Order the frequency dict alphabetically by token before handing it over.
d = dict(sorted(x.items(), key=lambda item: item[0]))
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(d)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('\nTRUSTED\n')
plt.show()

## Non

In [None]:
# Word cloud of the stemmed tokens found in reviews labelled "non".
# `stemmed_reviews` must already hold token lists, i.e. the training cell
# (which parses the column in place) has to run before this one.
selected_reviews = df_reviews.loc[df_reviews['label'] == 'non', 'stemmed_reviews']
# Flatten to one token list, skipping purely numeric tokens. A comprehension
# keeps the loop variables out of the notebook's global namespace.
word_list = [word for tokens in selected_reviews for word in tokens if not word.isnumeric()]
x = Counter(word_list)  # token -> number of occurrences
print(len(x))  # number of distinct tokens
# Order the frequency dict alphabetically by token before handing it over.
d = dict(sorted(x.items(), key=lambda item: item[0]))
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(d)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('\nNON\n')
plt.show()

## Fake

In [None]:
# Word cloud of the stemmed tokens found in reviews labelled "fake".
# `stemmed_reviews` must already hold token lists, i.e. the training cell
# (which parses the column in place) has to run before this one.
selected_reviews = df_reviews.loc[df_reviews['label'] == 'fake', 'stemmed_reviews']
# Flatten to one token list, skipping purely numeric tokens. A comprehension
# keeps the loop variables out of the notebook's global namespace.
word_list = [word for tokens in selected_reviews for word in tokens if not word.isnumeric()]
x = Counter(word_list)  # token -> number of occurrences
print(len(x))  # number of distinct tokens
# Order the frequency dict alphabetically by token before handing it over.
d = dict(sorted(x.items(), key=lambda item: item[0]))
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(d)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
# The title previously read 'NON' (copied from the cell above) although this
# figure shows the 'fake' subset.
plt.title('\nFAKE\n')
plt.show()