In [44]:
import tensorflow as tf
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
# import nltk
# nltk.download('stopwords')

In [45]:
df = pd.read_csv('Dataset Twitter Fix - Indonesian Sentiment Twitter Dataset Labeled.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12647 entries, 0 to 12646
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sentimen  12646 non-null  float64
 1   Tweet     12647 non-null  object 
dtypes: float64(1), object(1)
memory usage: 197.7+ KB


In [46]:
# Checking for null values
df.isna().sum()

sentimen    1
Tweet       0
dtype: int64

In [47]:
# Import Module
from sklearn.impute import SimpleImputer

# Pembuatan Imputation dengan pengganti modus
impute_modes = SimpleImputer(strategy='most_frequent')

df['sentimen'] = impute_modes.fit_transform(df[['sentimen']])

In [48]:
# Checking for null values
df.isna().sum()

sentimen    0
Tweet       0
dtype: int64

In [49]:
df = df.loc[df['sentimen'].isin((0.0, 1.0, 2.0))]
tweets = df['Tweet']
tweets.head()

0    barusan liat tulisan di belakang truk rela inj...
1    her itu lho miss kevin sama keven rebutan gimb...
2      iya rep gatau aku masih kelas 4 sd ehh di block
3      aku mohon tepatilah janjimu penantiancintaeps19
4    bukan beria nk kahwin sbb gatal celah kangkang...
Name: Tweet, dtype: object

In [50]:
def custom_standardisation(input_data: str):
    # remove tag
    input_data = re.sub(r"(URL)|(USER)", '', input_data)
    # remove unicode and newline
    input_data = re.sub(r"[\\]x\w{2}|'|\\n", '', input_data)
    # remove non alphanumeric character
    input_data = re.sub(r'[^a-zA-Z ]+', '', input_data)
    # remove single char
    input_data = re.sub(r'\b[a-zA-Z]\b', '', input_data)
    input_data = input_data.casefold()
    input_data = input_data.strip()
    # remove multiple spaces
    input_data = re.sub(r' {2,}', ' ', input_data)
    return input_data

tweets = tweets.apply(custom_standardisation)
tweets.head()

0    barusan liat tulisan di belakang truk rela inj...
1    her itu lho miss kevin sama keven rebutan gimb...
2        iya rep gatau aku masih kelas sd ehh di block
3        aku mohon tepatilah janjimu penantiancintaeps
4    bukan beria nk kahwin sbb gatal celah kangkang...
Name: Tweet, dtype: object

In [51]:
singkatan_df = pd.read_csv('kamus_singkatan.csv', index_col='singkatan')
singkatan_dict = singkatan_df.iloc[:, 1].to_dict()

def expand_abbr(text: str):
    return ' '.join((singkatan_dict.get(word, word) for word in text.split()))

tweets = tweets.apply(expand_abbr)
tweets.head()

0    barusan liat tulisan di belakang truk rela inj...
1    her itu lho miss kevin sama keven rebutan gimb...
2       iya rep gatau saya masih kelas sd ehh di block
3       saya mohon tepatilah janjimu penantiancintaeps
4    bukan beria nk kahwin sbb gatal celah kangkang...
Name: Tweet, dtype: object

In [52]:
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
#
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
#
# def stem_wrapper(text: str):
#     return stemmer.stem(text)
#
# for i,tweet in enumerate(tweets):
#     tweets[i] = stem_wrapper(tweet)
#     print(i)
# # tweets = tweets.apply(stem_wrapper)
# tweets.head()

In [53]:
df = pd.read_csv('cleaned_twitter.csv')
df.head()

Unnamed: 0,sentimen,Tweet
0,0.0,barusan liat tulis di belakang truk rela injek...
1,0.0,her itu lho miss kevin sama keven rebut gimbot...
2,0.0,iya rep gatau saya masih kelas sd ehh di block
3,0.0,saya mohon tepat janji penantiancintaeps
4,0.0,bukan ria nk kahwin sbb gatal celah kangkang t...


In [54]:
max_len = 0
max_words = ''
unique_words = set()
for tweet in tweets.str.split():
    if not tweet:
        continue
    tweet_max = max(tweet, key=len)
    unique_words.update(tweet)
    if max_len < len(tweet_max):
        max_len = len(tweet_max)
        max_words = tweet_max

print(f"{max_len=}\n{max_words=}\n{len(unique_words)=}")

max_len=39
max_words='sejukatgasnusantaraatgasnusantarakaltim'
len(unique_words)=24061


In [55]:
df_backup = df.copy()
df_backup['Tweet'] = tweets
df_backup.to_csv('cleaned_twitter.csv', index=False)

In [56]:
stopwords_list = stopwords.words('indonesian')

# convert list to dictionary
list_stopwords = set(stopwords_list)
list_stopwords

{'ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [57]:
#remove stopword pada list token
def stopwords_removal(words):
    return ' '.join([word for word in words.split() if word not in list_stopwords])
    # word_list = []
    # for word in words.split():
    #     if word not in list_stopwords:
    #         word_list.append(list(word))
    # return word_list

tweets = tweets.apply(stopwords_removal)
tweets.head()

0    barusan liat tulisan truk rela injek kopling s...
1    her lho miss kevin keven rebutan gimbot ya wis...
2                     iya rep gatau kelas sd ehh block
3            mohon tepatilah janjimu penantiancintaeps
4    beria nk kahwin sbb gatal celah kangkang penat...
Name: Tweet, dtype: object

In [58]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

y = df["sentimen"]
#mengubah y menjadi integer
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y)
print(y[0:10])

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [59]:
df['sentimen'].value_counts()

0.0    5328
2.0    4188
1.0    2792
Name: sentimen, dtype: int64

In [60]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenisasi
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)

In [61]:
# Menampilkan indeks dari setiap kata dalam tweet
print(tokenizer.index_word)



In [62]:
word2vec=tokenizer.word_index
V=len(word2vec)
print('Dataset has %s number of independent tokens' %V)

Dataset has 23476 number of independent tokens


In [63]:
#fit_on_texts() menciptakan hubungan antara kata-kata dan bilangan-bilangan yg di-assign
#hubungan ini disimpan dalam dictionary dalam tokenizer.word_index
#kata-kata harus diganti dengan bilangan-bilangan yang di-assign
encoded_comments = tokenizer.texts_to_sequences(tweets)
print(encoded_comments[0:10])

[[1564, 44, 521, 1565, 1009, 5921, 5922, 3291], [888, 454, 682, 3292, 4239, 3293, 4240, 2, 1304, 34, 755, 3294, 75, 4241, 208, 47, 4242, 4243], [36, 394, 395, 355, 1756, 941, 1566], [179, 5923, 3295, 3296], [1185, 58, 412, 148, 1421, 3297, 9406, 240, 1, 436, 3, 4244, 278, 54, 71, 5924], [714, 5925, 942, 42, 4245, 942, 78], [39, 490, 35, 9407, 9408, 10, 9409, 45], [113, 3298, 52, 619, 2290, 1010, 279, 204, 789, 1972, 344, 111], [9410], [9411, 143, 21, 248, 1567]]


In [64]:
# # Melihat statistical description serta histogram panjang comments Non-CB
# df['Tweet'].str.split().apply(len).describe()

TypeError: object of type 'float' has no len()

In [65]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Karena setiap kalimat tidak memiliki panjang yang sama, gunakan padding
padded_sequence = pad_sequences(encoded_comments, maxlen=152, padding="post")
X = padded_sequence

In [66]:
print('Shape of X is ', X.shape)
print('Shape of y is', y.shape)

Shape of X is  (12308, 152)
Shape of y is (12308, 3)


In [67]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [68]:
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0

In [None]:
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D, Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential

#Create Model
embedding_vector_length = V
vocab_size = len(tokenizer.word_index) + 1
#untuk menentukan size untuk proses embedding, kalimat input sequence huruf akan diubah menjadi word embedding.
#definisi word embedding menyusul
model = Sequential()
#layer embedding
model.add(Embedding(19365, V))
#Yang digunakan adalah CNN 1-dimensi
model.add(Conv1D(64,3,activation="relu"))
model.add(MaxPooling1D(5))
#32 adalah filter, 8 unit convolution,
#setelah convolution, pasti ada max pooling, max pooling 1 dimension
model.add(MaxPooling1D(2,2))
#Dropout adalah bentuk konfigurasi supaya tidak overfitting
model.add(Dropout(0.2))
model.add(Dense(64, activation = "relu"))
model.add(Dropout(0.2))
model.add(MaxPooling1D(2,2))
#model LSTM
#layer utk mengubah sequence index kata menjadi vektor
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
#layer klasifikasi akhir
model.add(Dense(3, activation='softmax'))
#Compile Model
model.compile(loss='binary_crossentropy',optimizer='sgd', metrics=['accuracy'])
# model.compile(loss='binary_crossentropy',optimizer='rmsprop', metrics=['accuracy'])
# model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [70]:
hist = model.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test))

Epoch 1/20
  6/308 [..............................] - ETA: 11:06 - loss: 0.6886 - accuracy: 0.3281

KeyboardInterrupt: 