In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the preprocessed tweet dataset into a DataFrame.
csv_path = '/content/drive/MyDrive/MKA UGM/AI/project1/tweet_preprocessed_2.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0,sentimen,tweet,labels
0,negatif,kata indonesia tidak harga bangsa asing berita...,0
1,netral,batu langka tasbih jokowi hadiah dari habib lu...,1
2,netral,di era jokowi ekonomi indonesia makin baik pic...,1
3,positif,bagi sumatra selatan asi games dampak pada eko...,2
4,negatif,negara kita utang buat bangun infrastruktur ya...,0


## Modeling

### Split data

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on the label keeps the
# class proportions the same in every split, so validation/test scores are not
# skewed by an unlucky draw.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['tweet'], df['labels'], test_size=0.2, stratify=df['labels'], random_state=42
)
# Carve the validation set out of the remaining 80%: 0.25 * 0.8 = 0.2 of all
# rows, which gives a 60/20/20 train/validation/test split overall.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.25, stratify=train_labels, random_state=42
)

print(f"Training data: {len(train_texts)}")
print(f"Validation data: {len(val_texts)}")
print(f"Testing data: {len(test_texts)}")

Training data: 1089
Validation data: 363
Testing data: 363


In [None]:
# Stratified split of the full frame, keyed on the 'sentimen' column:
# first peel off 20% of the rows, then halve that remainder into the
# validation and test frames.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['sentimen'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['sentimen'],
)

In [None]:
# Total number of rows in each split.
train_total, val_total, test_total = len(train_df), len(val_df), len(test_df)

# Number of rows per sentiment label in each split, sorted by label.
train_counts, val_counts, test_counts = (
    subset['sentimen'].value_counts().sort_index()
    for subset in (train_df, val_df, test_df)
)

# Print each split's total followed by its label breakdown; the empty strings
# produce the blank line between consecutive splits.
print(
    f'Total data in training set: {train_total}',
    train_counts,
    '',
    f'Total data in validation set: {val_total}',
    val_counts,
    '',
    f'Total data in testing set: {test_total}',
    test_counts,
    sep='\n',
)

Total data in training set: 1452
sentimen
negatif    477
netral     486
positif    489
Name: count, dtype: int64

Total data in validation set: 181
sentimen
negatif    60
netral     60
positif    61
Name: count, dtype: int64

Total data in testing set: 182
sentimen
negatif    59
netral     61
positif    62
Name: count, dtype: int64


### Tokenisasi

In [None]:
max_words = 10000  # Number of words to keep in the vocabulary
max_len = 100  # Length every tweet sequence is padded/truncated to

# Build the vocabulary from the training texts only, so the validation and
# test texts do not influence the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

# Map every split's texts to sequences of integer word ids.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_texts, val_texts, test_texts)
)

# Bring every sequence to exactly max_len entries.
train_data, val_data, test_data = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (train_sequences, val_sequences, test_sequences)
)


In [None]:
# Inspect the padded training matrix (one row per tweet, max_len columns).
train_data

array([[   0,    0,    0, ...,  280,    7,  435],
       [   0,    0,    0, ...,   67,   42,   23],
       [   0,    0,    0, ...,   33,  658,    1],
       ...,
       [   0,    0,    0, ..., 3478,  322,  229],
       [   0,    0,    0, ...,  125,   18,  129],
       [   0,    0,    0, ...,    8,   60, 3489]], dtype=int32)

In [None]:
NUM_CLASSES = 3  # labels 0, 1, 2 (negatif / netral / positif)


def _one_hot(labels):
    """Return `labels` one-hot encoded with a fixed width of NUM_CLASSES.

    Passing `num_classes` explicitly guarantees every split gets the same
    number of columns even if one split happens to miss a class. Input that is
    already two-dimensional is returned unchanged, so re-running this cell does
    not encode the labels a second time.
    """
    labels = np.asarray(labels)
    if labels.ndim > 1:
        return labels
    return to_categorical(labels, num_classes=NUM_CLASSES)


train_labels = _one_hot(train_labels)
val_labels = _one_hot(val_labels)
test_labels = _one_hot(test_labels)

In [None]:
# Inspect the one-hot encoded training labels.
train_labels

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

### Model CNN-LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout

embedding_dim = 100  # Size of each word-embedding vector

# Baseline classifier: embedding -> 1D convolution + pooling -> two stacked
# LSTM layers -> dense head, assembled one layer at a time.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(64, return_sequences=True))  # keep the full sequence for the next LSTM
model.add(LSTM(32))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # Output layer for the 3 sentiment classes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 max_pooling1d (MaxPooling1  (None, 24, 128)           0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 24, 64)            49408     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 64)                2112      
                                                        

### Training Model

In [None]:
# Train the baseline model for 10 epochs in batches of 32, evaluating on the
# validation split after every epoch.
history = model.fit(
    x=train_data,
    y=train_labels,
    validation_data=(val_data, val_labels),
    batch_size=32,
    epochs=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluasi model

In [None]:
# Score the baseline model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(x=test_data, y=test_labels)
print(f'Test Loss: {loss:.4f}', f'Test Accuracy: {accuracy:.4f}', sep='\n')


NameError: name 'model' is not defined

### Fine tuning

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping

import tensorflow as tf

embedding_dim = 100  # Size of each word-embedding vector
# Very light L1 penalty (the L2 term is switched off), shared by every
# regularized layer below.
regularizer = l1_l2(l1=1e-6, l2=0.0)

# Same architecture as the baseline model, with kernel regularization added
# and lighter dropout (0.1) placed between the LSTMs and before the output.
model_1 = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=regularizer),
    MaxPooling1D(pool_size=4),
    LSTM(64, return_sequences=True),
    Dropout(0.1),
    LSTM(32, kernel_regularizer=regularizer),
    Dense(64, activation='relu', kernel_regularizer=regularizer),
    Dropout(0.1),
    Dense(3, activation='softmax')  # Output layer for the 3 sentiment classes
])

# The targets are one-hot vectors over three mutually exclusive classes and the
# output layer is a softmax, so the matching loss is categorical cross-entropy.
# Binary cross-entropy would score the three outputs as independent yes/no
# problems, which is not what this model predicts.
model_1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

model_1.summary()

# Early stopping: end training once val_accuracy has gone 10 epochs without
# improving, and restore the weights from the best epoch.
callback = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_22 (Embedding)    (None, 100, 100)          1000000   
                                                                 
 conv1d_22 (Conv1D)          (None, 96, 128)           64128     
                                                                 
 max_pooling1d_22 (MaxPooli  (None, 24, 128)           0         
 ng1D)                                                           
                                                                 
 lstm_45 (LSTM)              (None, 24, 64)            49408     
                                                                 
 dropout_34 (Dropout)        (None, 24, 64)            0         
                                                                 
 lstm_46 (LSTM)              (None, 32)                12416     
                                                     

In [None]:
# Train the regularized model. The epoch budget is raised to 50 because early
# stopping ends the run sooner once val_accuracy stops improving.
history = model_1.fit(
    train_data, train_labels,
    epochs=50,
    batch_size=20,
    validation_data=(val_data, val_labels),
    # `callback` is the EarlyStopping instance created alongside model_1; the
    # previous name `early_stopping` was never defined and raised NameError.
    callbacks=[callback]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 10: early stopping


In [None]:
# Score the regularized model on the held-out test split and report both metrics.
loss, accuracy = model_1.evaluate(x=test_data, y=test_labels)
print(f'Test Loss: {loss:.4f}', f'Test Accuracy: {accuracy:.4f}', sep='\n')
