In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from collections import defaultdict
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN, Activation, Flatten
from tensorflow.keras import optimizers, callbacks
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping




# Prepare Dataset

In [2]:
# import dataset
df = pd.read_csv('DATA/train_preprocess.tsv.txt', sep='\t', names=['Tweet','Label'])
df.head()

Unnamed: 0,Tweet,Label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [3]:
df.Label.value_counts()

Label
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

In [4]:
# Menghapus duplikat pada kolom "tweet"
df = df.drop_duplicates(subset='Tweet')

# Memeriksa data setelah menghapus duplikat
print("\nData setelah menghapus duplikat:")
print(df)


Data setelah menghapus duplikat:
                                                   Tweet     Label
0      warung ini dimiliki oleh pengusaha pabrik tahu...  positive
1      mohon ulama lurus dan k212 mmbri hujjah partai...   neutral
2      lokasi strategis di jalan sumatera bandung . t...  positive
3      betapa bahagia nya diri ini saat unboxing pake...  positive
4      duh . jadi mahasiswa jangan sombong dong . kas...  negative
...                                                  ...       ...
10993  f - demokrat dorong upaya kemandirian energi n...   neutral
10994                                        tidak bosan  positive
10996  enak rasa masakan nya apalagi kepiting yang me...  positive
10998  pagi pagi di tol pasteur sudah macet parah , b...  negative
10999  meskipun sering belanja ke yogya di riau junct...  positive

[10933 rows x 2 columns]


In [5]:
def cleansing(sent):
    # Mengubah kata menjadi huruf kecil semua dengan menggunakan fungsi lower()
    string = sent.lower()

    # Menghapus emoticon dan tanda baca menggunakan "RegEx" dengan script di bawah
    string = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', '', string) #menghapus https dan http
    string = re.sub('<.*?>', ' ', string) #mengganti karakter html dengan tanda petik
    string = re.sub('[^0-9a-zA-Z]+', ' ', string) #menghilangkan semua karakter yang bukan huruf atau angka dan menggantinya dengan spasi.
    string = re.sub('\n',' ',string) #mengganti line baru dengan spasi
    string = re.sub(r':', ' ', string) #menggantikan karakter : dengan spasi 
    string = re.sub('gue','saya', string) # Mengganti kata "gue" dengan kata "saya"
    string = re.sub(r'\b[a-zA-Z]\b', ' ', string) #menghapus single char
    string = ' '.join(string.split()) #memisahkan dan menggabungkan kata
    string = string.strip() #menghilangkan whitespace di awal dan di akhir teks
    string = re.sub(r'pic.twitter.com.[\w]+', '', string) #menghapus link picture
    string = re.sub(r'\buser\b',' ', string) #menghapus kata 'user'
    string = re.sub(r'\brt\b',' ', string) #menghapus awalan rt
    string = re.sub('RT',' ', string) #menghapus RT simbol
    string = re.sub(r'‚Ä¶', '', string) #menghapus simbol tidak perlu

    # Lematisasi menggunakan Sastrawi
    stemmer_factory = StemmerFactory()
    stemmer = stemmer_factory.create_stemmer()
    string = stemmer.stem(string)

    # Menghapus stop words menggunakan Sastrawi
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    string = stopword_remover.remove(string)
    
    return string

In [6]:
df['Tweet_Clean'] = df.Tweet.apply(cleansing)
df.head()

Unnamed: 0,Tweet,Label,Tweet_Clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung milik usaha pabrik puluh tahun kenal pu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,ulama lurus k212 mmbri hujjah partai diwlh sua...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatera bandung nyaman...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia unboxing paket barang bagus beli
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh mahasiswa sombong kasih kartu kuning ajar ...


In [7]:
# Group data tweet
positive_tweet = df.loc[df['Label']=='positive'].Tweet.tolist()
negative_tweet = df.loc[df['Label']=='negative'].Tweet.tolist()
neutral_tweet = df.loc[df['Label']=='neutral'].Tweet.tolist()

# Group data label
positive_label = df.loc[df['Label']=='positive'].Label.tolist()
negative_label = df.loc[df['Label']=='negative'].Label.tolist()
neutral_label = df.loc[df['Label']=='neutral'].Label.tolist()

total_data = positive_tweet + negative_tweet + neutral_tweet
labels = positive_label + neutral_label + negative_label

print("Positive: %s, Negative: %s, Neutral: %s" % (len(positive_tweet), len(negative_tweet), len(neutral_tweet)))
print("Total data: %s" % len(total_data))
print("Total labels: %s" % len(labels))

Positive: 6383, Negative: 3412, Neutral: 1138
Total data: 10933
Total labels: 10933


In [9]:
max_features = 10000
tokenizer = Tokenizer(num_words=max_features, split=' ', lower=True)
tokenizer.fit_on_texts(total_data)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tokenizer.pickle has been created!")

X = tokenizer.texts_to_sequences(total_data)

vocab_size = len(tokenizer.word_index)
maxlen = max(len(x) for x in X)

X = pad_sequences(X)
with open('x_pad_sequences.pickle', 'wb') as handle:
    pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("x_pad_sequences.pickle has been created!")
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tokenizer.pickle has been created!")

tokenizer.pickle has been created!
x_pad_sequences.pickle has been created!
tokenizer.pickle has been created!


In [10]:
Y = pd.get_dummies(labels)
Y = Y.values

with open('y_labels.pickle', 'wb') as handle:
    pickle.dump(Y, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("y_labels.pickle has created!")

y_labels.pickle has created!


In [11]:
file = open("x_pad_sequences.pickle", 'rb')
X = pickle.load(file)
file.close()

file = open("y_labels.pickle", 'rb')
Y = pickle.load(file)
file.close()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
kf = KFold(n_splits=5,random_state=42,shuffle=True)

accuracies = []

y = Y

embed_dim = 100
units = 64

for iteration, data in enumerate(kf.split(X), start=1):
    data_train   = X[data[0]]
    target_train = y[data[0]]

    data_test    = X[data[1]]
    target_test  = y[data[1]]

    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
    model.add(SimpleRNN(units, dropout=0.2))
    model.add(Dense(3, activation='softmax'))
    sgd = optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    adam = optimizers.Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test), verbose=1, callbacks=[es])

    predictions = model.predict(X_test)
    y_pred = predictions

    accuracy = accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1))

    print("Training -", iteration)
    print(classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1)))
    print("======================================================")

    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)

print()
print()
print()
print("Rata-rata Accuracy: ", round(average_accuracy,4))

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 96, 100)           1000000   
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                10560     
                                                                 
 dense_3 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1010755 (3.86 MB)
Trainable params: 1010755 (3.86 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 2: early stopping
Training - 1
              precision    recall  f1-score   support

           0       0.68      0.69      0.69       681
           1       0.29      0.10      0.15       235
           2       0.85      0.95      0.90      1271