In [1]:
import numpy as np
import pandas as pd
import dataframe_image as dfi
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

<h1>Read Datasets<h1>

In [2]:
df = pd.read_excel('Datasets_Final.xlsx')

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4140 entries, 0 to 4139
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4140 non-null   int64 
 1   text_emoji  4140 non-null   object
 2   Sarcasm     4140 non-null   object
dtypes: int64(1), object(2)
memory usage: 97.2+ KB


Unnamed: 0.1,Unnamed: 0,text_emoji,Sarcasm
0,0,mencemooh putus asa marah sejenis menekankan s...,Positif
1,1,jagat jagat mengenal tuhan mengenal mengenal m 😨,Positif
2,2,mbah nun s b yudhoyono j k kubu negarawan poli...,Positif
3,3,mbah nun banci komunis sengkuni sembunyi akun ...,Positif
4,4,mbah nun berlagak sandiwara silakan baca foto ...,Negatif


In [3]:
# Remove unused column/feature
df = df.drop(columns='Unnamed: 0')

In [4]:
df.head()

Unnamed: 0,text_emoji,Sarcasm
0,mencemooh putus asa marah sejenis menekankan s...,Positif
1,jagat jagat mengenal tuhan mengenal mengenal m 😨,Positif
2,mbah nun s b yudhoyono j k kubu negarawan poli...,Positif
3,mbah nun banci komunis sengkuni sembunyi akun ...,Positif
4,mbah nun berlagak sandiwara silakan baca foto ...,Negatif


<h1>Checking Max Length In Every Record</h1>

In [6]:
df['token'] = df['text_emoji'].apply(lambda x: x.split())

In [7]:
df['max_words'] = df['token'].apply(lambda x: len(x))

In [8]:
df['max_words'].describe()

count    4140.000000
mean       11.430193
std         6.560750
min         1.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        39.000000
Name: max_words, dtype: float64

In [9]:
df = df.drop(columns=['token', 'max_words'])

In [10]:
print(f'Total Duplicated Data: {df.duplicated().sum()}')

Total Duplicated Data: 209


In [11]:
df.loc[df.duplicated(), :]

Unnamed: 0,text_emoji,Sarcasm
79,contoh firaun suka menginjak harkat pribumi 😨,Negatif
80,contoh firaun suka menginjak harkat pribumi 😨,Negatif
86,opini cak nun kafir haman qorun suroto 😨,Negatif
115,andre menilai pernyataan cak nun prabowo subia...,Negatif
118,prabowo subianto penyakit basmi prabowo memuji...,Positif
...,...,...
4134,rt pohon kuat guncangan masyaallah sehat anies...,Negatif
4135,gue suka anies baswedan pimpin pakai narasi in...,Negatif
4136,rt orang bugis mendukung anies baswedan capres...,Negatif
4137,contoh kepemimpinan patut contoh tingkat keped...,Negatif


In [12]:
df = df.drop_duplicates(ignore_index = True)

In [13]:
df.head()

Unnamed: 0,text_emoji,Sarcasm
0,mencemooh putus asa marah sejenis menekankan s...,Positif
1,jagat jagat mengenal tuhan mengenal mengenal m 😨,Positif
2,mbah nun s b yudhoyono j k kubu negarawan poli...,Positif
3,mbah nun banci komunis sengkuni sembunyi akun ...,Positif
4,mbah nun berlagak sandiwara silakan baca foto ...,Negatif


In [14]:
print(f'Total Duplicated Data After Handling: {df.duplicated().sum()}')

Total Duplicated Data After Handling: 0


In [None]:
# df_duplicated = pd.DataFrame(df.loc[df['text_emoji'].duplicated()])

In [None]:
# df_duplicated.to_excel('Duplicated_Data.xlsx')

In [15]:
print(f'Total Missing Value: \n{df.isnull().sum()}')

Total Missing Value: 
text_emoji    0
Sarcasm       0
dtype: int64


<h1>Feature Extraction Using GloVe Method</h1>

In [24]:
class GloVe:
    # constructors
    def __init__(self, corpus, embedding_size, window_size=5):
        self.corpus = corpus
        self.embedding_size = embedding_size
        self.window_size = window_size
        self.word2id = {}
        self.id2word = {}
        self.word2count = {}
        self.vocab_size = 0
        self.co_matrix = None
        self.embedding = None
        self.bias = None

    def build_vocab_information(self):
        for sentence in self.corpus:
            for word in sentence.split():
                if word not in self.word2id:
                    self.word2id[word] = self.vocab_size
                    self.id2word[self.vocab_size] = word
                    self.vocab_size += 1
                self.word2count[word] = self.word2count.get(word, 0) + 1
        print(f'Vocab Size: {self.vocab_size}')

    def build_co_matrix(self):
        self.co_matrix = np.zeros((self.vocab_size, self.vocab_size))
        for sentence in self.corpus:
            sentence = sentence.split()
            for i, center_word in enumerate(sentence):
                for j in range(max(0, i - self.window_size), i):
                    context_word = sentence[j]
                    self.co_matrix[self.word2id[center_word]][self.word2id[context_word]] += 1
                    self.co_matrix[self.word2id[context_word]][self.word2id[center_word]] += 1

    def train(self, num_epochs=300, learning_rate=0.01):
        self.embedding = (np.random.rand(self.vocab_size, self.embedding_size) - 0.5) / self.embedding_size
        self.bias = np.zeros(self.vocab_size)
        count = 0
        for epoch in range(num_epochs):
            count += 1
            loss = 0
            for i in range(self.vocab_size):
                for j in range(self.vocab_size):
                    if self.co_matrix[i][j] != 0:
                        diff = (self.embedding[i] @ self.embedding[j]) + self.bias[i] + self.bias[j] - np.log(self.co_matrix[i][j])
                        
                        loss += diff ** 2
                        
                        grad_emb_i = diff * self.embedding[j]
                        grad_emb_j = diff * self.embedding[i]
                        
                        grad_bias_i = diff
                        grad_bias_j = diff

                        self.embedding[i] -= learning_rate * grad_emb_i
                        self.embedding[j] -= learning_rate * grad_emb_j
                        self.bias[i] -= learning_rate * grad_bias_i
                        self.bias[j] -= learning_rate * grad_bias_j
                        
            print(f'Difference Iterasi {count}: {diff}\n')
            print(f'Loss Iterasi {count}: {loss}\n')
            print(f'Gradient Embedding "i" Iterasi {count}: {grad_emb_i}\n')
            print(f'Gradient Embedding "j" Iterasi {count}: {grad_emb_j}\n')
            print(f'Embedding Iterasi {count}: {self.embedding}')

            print(f'Epoch {epoch + 1}/{num_epochs} - loss {round(loss, 3)}')
            
            if loss <= 0.001:
                print('Training Stopped, loss less than equal 0.001')
                break

    def get_all_embeddings(self):
        all_embeddings = {}
        for word, word_id in self.word2id.items():
            all_embeddings[word] = self.embedding[word_id]
        return all_embeddings
    
    def get_embeddings_by_id(self, id):
        return self.embedding[id]
    
    def get_embeddings(self, kata):
        word_embeddings = {}
        for word, word_id in self.word2id.items():
            if word == kata:
                word_embeddings[word] = self.embedding[word_id]
        return word_embeddings

    def embedding2word(self):
        embedding_to_word = {}
        for word, word_id in self.word2id.items():
            embedding_to_word[tuple(self.embedding[word_id])] = word
        return embedding_to_word
    
    def get_glove_embedding(self, sentence):
        sentence = sentence.split()
        embedding_vectors = []
        for word in sentence:
            if word in self.word2id:
                word_id = self.word2id[word]
                word_embedding = self.embedding[word_id]
                embedding_vectors.append(word_embedding)
        return embedding_vectors
    
    def show_comatrix(self):
        # df_co_matrix = pd.DataFrame(self.co_matrix, index=self.word2id.keys(), columns=self.word2id.keys())
        
        return self.co_matrix
    
    def generate_coomatrix_image(self):
        df_co_matrix = pd.DataFrame(self.co_matrix, index=self.word2id.keys(), columns=self.word2id.keys())

        print('Image is being processed...')
        dfi.export(df_co_matrix.head(10), "coocurrenceMatrixBab3.png", max_cols=10)
        
        return print('Coocurrence matrix has been successfully generated into an image')
    
    def show_word2id(self):
        return self.word2id
    
    def show_id2word(self):
        return self.id2word
    
    def show_word2count(self):
        return self.word2count

In [25]:
# Membangun model GloVe
embedding_size = 150

glove_model = GloVe(df['text_emoji'], embedding_size)

# Membangun vocabulary
glove_model.build_vocab_information()

# Membangun co-occurrence matrix
glove_model.build_co_matrix()

# Melatih model GloVe
glove_model.train()

Vocab Size: 6769
Difference Iterasi 1: 0.03420737052534675

Loss Iterasi 1: 47313.632457430314

Gradient Embedding "i" Iterasi 1: [-6.13837338e-05 -3.44128678e-05 -1.08514235e-04  9.91227062e-05
  1.02488833e-04  3.29198235e-05 -8.24988264e-05  8.72708589e-05
  4.05245792e-05 -9.96382817e-05  5.76318221e-05  6.72849640e-05
 -2.10001660e-05 -1.11122000e-04 -9.97018211e-05  9.98236538e-05
  5.05640845e-05  1.10834417e-04 -4.18988774e-05  8.83702572e-05
 -1.09601313e-04  1.02393634e-04  6.57099909e-05 -6.57659160e-05
 -7.61511493e-05 -5.63306468e-05  8.35719143e-05 -5.59238925e-05
 -7.04504899e-05 -2.19199439e-05 -1.19638570e-04 -7.22541855e-05
  7.58632520e-05  6.62741921e-05  7.21644489e-05 -8.86431311e-05
 -3.48778836e-06  3.65122310e-05 -1.17151553e-04 -7.58356756e-05
  6.01609495e-06 -6.30326224e-05  6.82988042e-05 -3.08753352e-05
 -5.00185553e-05  3.55862043e-05 -3.58599682e-06  2.87117166e-05
 -4.16207218e-05 -4.63253820e-05 -5.47994705e-05 -9.00408239e-05
  4.27822160e-05 -7.68466

In [26]:
# Menampilkan Dict word2id
word_2_id = glove_model.show_word2id()
word_2_id

{'kawan': 0,
 'sunda': 1,
 'bersatu': 2,
 'jemput': 3,
 'arteria': 4,
 'dahlan': 5,
 '😁': 6,
 'sidang': 7,
 'adat': 8,
 'daerah': 9,
 'menuntut': 10,
 'hukum': 11,
 'ga': 12,
 'tafsir': 13,
 'membanding': 14,
 'membandingkan': 15,
 'edy': 16,
 'menegakkan': 17,
 'divisi': 18,
 'humas': 19,
 'polri': 20,
 'bilang': 21,
 'palsu': 22,
 'mengeluarkan': 23,
 'membenarkan': 24,
 'plat': 25,
 'no': 26,
 'asli': 27,
 'urut': 28,
 'dada': 29,
 'deh': 30,
 'kuasa': 31,
 'mengerti': 32,
 'tsb': 33,
 'dengar': 34,
 'pendapat': 35,
 'gedung': 36,
 'dpr': 37,
 'memiliki': 38,
 'kekebalan': 39,
 'atur': 40,
 'konstitusi': 41,
 'n': 42,
 'uu': 43,
 'maaf': 44,
 'memaafkan': 45,
 'kubu': 46,
 'saran': 47,
 'dorong': 48,
 'masyarakat': 49,
 'jabar': 50,
 'menangkap': 51,
 'semoga': 52,
 'berpegang': 53,
 'komitmen': 54,
 'menenggelamkan': 55,
 'partai': 56,
 'demokrasi': 57,
 'indonesia': 58,
 'perjuangan': 59,
 'spt': 60,
 'sumbar': 61,
 'beruntung': 62,
 'kabar': 63,
 'e': 64,
 'hina': 65,
 'suku': 66

In [27]:
# Menampilkan Dict id2word
id_2_word = glove_model.show_id2word()
id_2_word

{0: 'kawan',
 1: 'sunda',
 2: 'bersatu',
 3: 'jemput',
 4: 'arteria',
 5: 'dahlan',
 6: '😁',
 7: 'sidang',
 8: 'adat',
 9: 'daerah',
 10: 'menuntut',
 11: 'hukum',
 12: 'ga',
 13: 'tafsir',
 14: 'membanding',
 15: 'membandingkan',
 16: 'edy',
 17: 'menegakkan',
 18: 'divisi',
 19: 'humas',
 20: 'polri',
 21: 'bilang',
 22: 'palsu',
 23: 'mengeluarkan',
 24: 'membenarkan',
 25: 'plat',
 26: 'no',
 27: 'asli',
 28: 'urut',
 29: 'dada',
 30: 'deh',
 31: 'kuasa',
 32: 'mengerti',
 33: 'tsb',
 34: 'dengar',
 35: 'pendapat',
 36: 'gedung',
 37: 'dpr',
 38: 'memiliki',
 39: 'kekebalan',
 40: 'atur',
 41: 'konstitusi',
 42: 'n',
 43: 'uu',
 44: 'maaf',
 45: 'memaafkan',
 46: 'kubu',
 47: 'saran',
 48: 'dorong',
 49: 'masyarakat',
 50: 'jabar',
 51: 'menangkap',
 52: 'semoga',
 53: 'berpegang',
 54: 'komitmen',
 55: 'menenggelamkan',
 56: 'partai',
 57: 'demokrasi',
 58: 'indonesia',
 59: 'perjuangan',
 60: 'spt',
 61: 'sumbar',
 62: 'beruntung',
 63: 'kabar',
 64: 'e',
 65: 'hina',
 66: 'suku'

In [28]:
# Menampilkan Dict word2count
word_2_count = glove_model.show_word2count()
word_2_count

{'kawan': 40,
 'sunda': 19,
 'bersatu': 14,
 'jemput': 7,
 'arteria': 35,
 'dahlan': 31,
 '😁': 492,
 'sidang': 4,
 'adat': 18,
 'daerah': 45,
 'menuntut': 6,
 'hukum': 55,
 'ga': 187,
 'tafsir': 2,
 'membanding': 1,
 'membandingkan': 12,
 'edy': 29,
 'menegakkan': 8,
 'divisi': 1,
 'humas': 1,
 'polri': 9,
 'bilang': 63,
 'palsu': 10,
 'mengeluarkan': 4,
 'membenarkan': 4,
 'plat': 4,
 'no': 16,
 'asli': 38,
 'urut': 1,
 'dada': 2,
 'deh': 32,
 'kuasa': 9,
 'mengerti': 10,
 'tsb': 19,
 'dengar': 12,
 'pendapat': 9,
 'gedung': 5,
 'dpr': 64,
 'memiliki': 53,
 'kekebalan': 1,
 'atur': 15,
 'konstitusi': 11,
 'n': 47,
 'uu': 16,
 'maaf': 67,
 'memaafkan': 5,
 'kubu': 38,
 'saran': 7,
 'dorong': 2,
 'masyarakat': 171,
 'jabar': 6,
 'menangkap': 3,
 'semoga': 207,
 'berpegang': 2,
 'komitmen': 22,
 'menenggelamkan': 3,
 'partai': 321,
 'demokrasi': 130,
 'indonesia': 475,
 'perjuangan': 123,
 'spt': 24,
 'sumbar': 1,
 'beruntung': 14,
 'kabar': 17,
 'e': 23,
 'hina': 13,
 'suku': 10,
 'peja

In [None]:
# Menampilkan hasil dari Cooccurence Matrix
co_matrix = glove_model.show_comatrix()

co_matrix

In [None]:
# # Generate image Matrix Coocurrence
# generateCooMatrix = glove_model.generate_coomatrix_image()

# generateCooMatrix

In [29]:
word_embeddings = glove_model.get_embeddings('😁')

print(word_embeddings)

{'😁': array([ 1.72645141e-01,  1.71961818e-01, -3.73728948e-01, -9.87215801e-02,
       -1.65630300e-01,  3.53403565e-02, -1.13989931e-01, -4.13518879e-04,
        2.03586811e-02,  2.08648730e-01, -2.28418170e-01,  2.21924810e-01,
        1.27003214e-01, -2.24139276e-01,  2.12033890e-01,  2.50670914e-01,
       -2.74668844e-01,  3.65106566e-02,  4.63040180e-01, -7.15745470e-02,
       -9.38696518e-02,  6.13933602e-02,  1.21242208e-02, -3.39973347e-01,
       -1.77595772e-01,  1.72527504e-01, -1.83191871e-01,  3.81987463e-01,
        8.89362484e-02, -7.24733172e-02, -1.83640962e-01, -1.94120072e-01,
       -9.94743823e-02,  2.23597988e-01, -9.49098061e-02, -1.98927905e-01,
       -1.79419053e-01,  1.35371998e-01, -6.73529118e-02, -8.57824847e-02,
        1.35596207e-01, -8.98471000e-04, -2.80628501e-01,  3.38253533e-02,
       -9.55979106e-02,  9.48067895e-02,  3.42090082e-02, -1.14943440e-01,
        8.82328337e-02, -1.61705073e-01, -5.45124060e-02, -3.18995974e-02,
        2.97581927e

In [30]:
embedding_by_id = glove_model.get_embeddings_by_id(6)

embedding_by_id

array([ 1.72645141e-01,  1.71961818e-01, -3.73728948e-01, -9.87215801e-02,
       -1.65630300e-01,  3.53403565e-02, -1.13989931e-01, -4.13518879e-04,
        2.03586811e-02,  2.08648730e-01, -2.28418170e-01,  2.21924810e-01,
        1.27003214e-01, -2.24139276e-01,  2.12033890e-01,  2.50670914e-01,
       -2.74668844e-01,  3.65106566e-02,  4.63040180e-01, -7.15745470e-02,
       -9.38696518e-02,  6.13933602e-02,  1.21242208e-02, -3.39973347e-01,
       -1.77595772e-01,  1.72527504e-01, -1.83191871e-01,  3.81987463e-01,
        8.89362484e-02, -7.24733172e-02, -1.83640962e-01, -1.94120072e-01,
       -9.94743823e-02,  2.23597988e-01, -9.49098061e-02, -1.98927905e-01,
       -1.79419053e-01,  1.35371998e-01, -6.73529118e-02, -8.57824847e-02,
        1.35596207e-01, -8.98471000e-04, -2.80628501e-01,  3.38253533e-02,
       -9.55979106e-02,  9.48067895e-02,  3.42090082e-02, -1.14943440e-01,
        8.82328337e-02, -1.61705073e-01, -5.45124060e-02, -3.18995974e-02,
        2.97581927e-02, -

In [31]:
all_embedding = glove_model.get_all_embeddings()

all_embedding['kawan']

array([ 0.13866269,  0.21890234,  0.09218679,  0.23154498,  0.05260381,
       -0.18254332,  0.0148109 , -0.24716445, -0.09289539, -0.05341877,
       -0.03588736, -0.11396067, -0.14686196, -0.11092585,  0.17777542,
       -0.10708508,  0.07685054,  0.08196429, -0.06430258, -0.04463621,
       -0.10190072, -0.09209554,  0.05473403,  0.03048467,  0.13933763,
        0.00675469,  0.17380573, -0.01266973,  0.06970052, -0.18607045,
       -0.16362255, -0.27978825,  0.15206938,  0.23410425, -0.15367573,
        0.18008264,  0.1425858 ,  0.14058182,  0.08179359,  0.02329865,
       -0.07711526,  0.22278682, -0.03484566,  0.31960768,  0.1649335 ,
        0.12605916,  0.14018289, -0.15323181, -0.14750196, -0.05776078,
       -0.00943602, -0.20468867,  0.11759507,  0.0376124 ,  0.20652019,
       -0.07633969, -0.17153993,  0.06926615,  0.02995884,  0.09424945,
        0.1868265 ,  0.09858819, -0.10814667,  0.10034625,  0.06492498,
        0.00934756,  0.18294062, -0.2825197 ,  0.0352203 , -0.06

In [32]:
# # Menyimpan embedding ke file teks

# embeddings = glove_model.get_all_embeddings()

# output_file = "Misc/Embeddings/embeddings150D_300E.txt"
# with open(output_file, "w", encoding="utf-8") as file:
#     for word, embedding in embeddings.items():
#         embedding_line = "{} {}\n".format(word, " ".join(str(value) for value in embedding))
#         file.write(embedding_line)

# print("Embedding telah berhasil disimpan ke: {}".format(output_file))

Embedding telah berhasil disimpan ke: Misc/Embeddings/embeddings150D_300E.txt


<h1>Create New Feature for Word Embeddings</h1>

In [None]:
# df_ready['embedding'] = df_ready['text_emoji'].apply(lambda x: glove_model.get_glove_embedding(x))

In [None]:
# df_ready.head()

<h2>Fungsi untuk merubah token menjadi representasi word embedding</h2>

In [None]:
def replace_token_with_word_embedding(text):
    embeddings = []
    for word in text.split():
        if word in all_embedding:
            embeddings.append(all_embedding[word])
    
    return np.array(embeddings)

In [None]:
kalimat = df['text_emoji'][0]
output = []

for sentence in kalimat.split():
    if sentence in all_embedding:
        output.append(all_embedding[sentence])
    
print(len(np.array(output)))

In [None]:
df['vector'] = df['text_emoji'].apply(replace_token_with_word_embedding)

In [None]:
df.head()

In [None]:
# df_ready['text_emoji'].duplicated().sum()

In [None]:
# df_ready.head()

In [None]:
# df_ready['text_emoji'][0]

In [None]:
# embedding2word = glove_model.embedding2word()

# embedding2word[tuple(df_ready['text_emoji'][0][0])]

<h1>Implement SMOTE</h1>

In [16]:
df['Sarcasm'] = df['Sarcasm'].replace('Positif', 1)
df['Sarcasm'] = df['Sarcasm'].replace('Negatif', 0)

In [17]:
X = np.array(df['text_emoji'])
y = np.array(df['Sarcasm'])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Cek class minoritas
print(f'Total Sample for each class:\n{y_train.value_counts()}')

<p>Terlihat Bahwa class minoritas adalah class dengan nilai 1(Positif)</p>

In [None]:
y_train

In [None]:
np.where(y_train == 1)[0]

In [None]:
X_train[np.where(y_train == 1)[0]]

In [19]:
max_len = 39
len_voc = 40000

In [None]:
df_train_min = X_train[np.where(y_train == 1)[0]]
df_train_maj = X_train[np.where(y_train == 0)[0]]

df_test = X_test

In [None]:
df_train_min

In [None]:
new_arrays_train_min = []

for i in df_train_min:
    new_array = np.delete(i, -1)
    new_arrays_train_min.append(new_array)

df_train_min_text = np.concatenate(new_arrays_train_min)

In [None]:
df_train_min_text

In [None]:
new_arrays_train_maj = []

for i in df_train_maj:
    new_array = np.delete(i, -1)
    new_arrays_train_maj.append(new_array)

df_train_maj_text = np.concatenate(new_arrays_train_maj)

In [None]:
new_arrays_test = []

for i in df_test:
    new_array = np.delete(i, -1)
    new_arrays_test.append(new_array)

df_test_text = np.concatenate(new_arrays_test)

In [None]:
len(df_train_min_text)

In [None]:
len(df_train_maj_text)

In [None]:
len(df_test_text)

In [None]:
# df_ready_min = pd.concat([df_train_min, df_test_min])
# df_ready_maj = pd.concat([df_train_maj, df_test_maj])

In [20]:
# Tokenizing
def make_tokenizer(texts, len_voc):
    from keras.preprocessing.text import Tokenizer
    t = Tokenizer(num_words=len_voc)
    t.fit_on_texts(texts)
    
    return t

<h2>Tokenizer</h2>

In [21]:
tokenizer = make_tokenizer(X_train, len_voc)

In [None]:
# tokenizer_test = make_tokenizer(X_test, len_voc)

In [None]:
tokenizer_train_min = make_tokenizer(df_train_min_text, len_voc)
tokenizer_train_maj = make_tokenizer(df_train_maj_text, len_voc)
tokenizer_test = make_tokenizer(df_test_text, len_voc)

In [22]:
word_index = tokenizer.word_index

<h2>Sequence</h2>

In [23]:
X_train_Sequence = tokenizer.texts_to_sequences(X_train)

In [24]:
X_test_Sequence = tokenizer.texts_to_sequences(X_test)

In [None]:
X_train_min = tokenizer_train_min.texts_to_sequences(df_train_min_text)
X_train_major = tokenizer_train_maj.texts_to_sequences(df_train_maj_text)
X_test = tokenizer_test.texts_to_sequences(df_test_text)

# X_train_Sentiment = df_train['Sentimen_Text']
# X_test_Sentiment = df_test['Sentimen_Text']

In [None]:
X_train_min

In [None]:
X_train_major

<h2>Padding</h2>

In [25]:
from keras.utils import pad_sequences

X_train_Padded = pad_sequences(X_train_Sequence, maxlen=max_len, padding='post', truncating='post')

In [26]:
X_test_Padded = pad_sequences(X_test_Sequence, maxlen=max_len, padding='post', truncating='post')

In [None]:
from keras.utils import pad_sequences

X_train_min = pad_sequences(X_train_min, maxlen=max_len, padding='post', truncating='post')
X_train_major = pad_sequences(X_train_major, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

In [None]:
X_train_min

<h2>Stacking Label Minority</h2>

In [None]:
y_train_min = y_train[np.where(y_train == 1)[0]]

In [None]:
y_train_min.shape

<h2>Stacking Label Majority</h2>

In [None]:
y_train_maj = y_train[np.where(y_train == 0)[0]]

In [None]:
y_train_maj.shape

<h2>Stacking Sentiment Minority</h2>

In [None]:
X_train_sentiment_min = []
for i in range(len(df_train_min)):
    X_train_sentiment_min.append(df_train_min[i][1])

X_train_sentiment_min = np.array(X_train_sentiment_min)

In [None]:
X_train_sentiment_min = X_train_sentiment_min.reshape(X_train_sentiment_min.shape[0], 1)

In [None]:
X_train_sentiment_min

In [None]:
X_train_sentiment_min.shape

<h2>Stacking Sentiment Majority</h2>

In [None]:
X_train_sentiment_maj = []
for i in range(len(df_train_maj)):
    X_train_sentiment_maj.append(df_train_maj[i][1])

X_train_sentiment_maj = np.array(X_train_sentiment_maj)

In [None]:
X_train_sentiment_maj = X_train_sentiment_maj.reshape(X_train_sentiment_maj.shape[0], 1)

In [None]:
X_train_sentiment_maj

In [None]:
X_train_sentiment_maj.shape

<h2>Stacking Sentiment Testing</h2>

In [None]:
X_test_sentiment = []
for i in range(len(df_test)):
    X_test_sentiment.append(df_test[i][1])

X_test_sentiment = np.array(X_test_sentiment)

In [None]:
X_test_sentiment = X_test_sentiment.reshape(X_test_sentiment.shape[0], 1)

In [None]:
X_test_sentiment

In [None]:
X_test_sentiment.shape

In [27]:
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')

def load_embedding(file):
    embeddings_index = dict(get_coefs(*i.split(" ")) for i in open(file, encoding='utf-8'))
    
    return embeddings_index

In [28]:
def make_embedding_matrix(embedding, tokenizer, len_voc):
    all_embs = np.stack(embedding.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    word_index = tokenizer.word_index
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len_voc, embed_size))
    # embedding_matrix = np.zeros((len_voc, embed_size))
    
    for word, i in word_index.items():
        if i >= len_voc:
            continue
        embedding_vector = embedding.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

In [29]:
glove_50D = load_embedding('Misc/Embeddings/embeddings50D_300E.txt')
glove_100D = load_embedding('Misc/Embeddings/embeddings100D_300E.txt')
glove_150D = load_embedding('Misc/Embeddings/embeddings150D_300E.txt')

In [30]:
embed_matrix_50D = make_embedding_matrix(glove_50D, tokenizer, len_voc)
embed_matrix_100D = make_embedding_matrix(glove_100D, tokenizer, len_voc)
embed_matrix_150D = make_embedding_matrix(glove_150D, tokenizer, len_voc)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [31]:
X_train_embedding = embed_matrix_100D[X_train_Padded]

In [32]:
# embed_matrix_test = make_embedding_matrix(glove, tokenizer_test, len_voc)

X_test_embedding = embed_matrix_100D[X_test_Padded]

In [33]:
train_size, max_len, embed_size = X_train_embedding.shape
X_train_embedding_r = X_train_embedding.reshape(train_size, max_len*embed_size)

test_size, max_len, embed_size = X_test_embedding.shape
X_test_embedding_r = X_test_embedding.reshape(test_size, max_len*embed_size)

In [33]:
smote = SMOTE()

X_oversampled, y_oversampled = smote.fit_resample(X_train_embedding_r, y_train)

In [34]:
unique_values, value_counts = np.unique(y_oversampled, return_counts=True)

for value, counts in zip(unique_values, value_counts):
    print(f'{value}: {counts}')

0: 1804
1: 1804


In [None]:
embed_mat_train_min = make_embedding_matrix(glove, tokenizer_train_min, len_voc)
embed_mat_train_maj = make_embedding_matrix(glove, tokenizer_train_maj, len_voc)

embed_mat_test = make_embedding_matrix(glove, tokenizer_test, len_voc)

In [None]:
embed_mat_train_min.shape

In [None]:
X_train_emb_minority = embed_mat_train_min[X_train_min]
X_train_emb_majority = embed_mat_train_maj[X_train_major]

X_test_emb = embed_mat_test[X_test]

In [None]:
X_train_emb_minority

In [None]:
train_size_min, max_len_min, embed_size_min = X_train_emb_minority.shape
X_train_emb_r_min = X_train_emb_minority.reshape(train_size_min, max_len*embed_size_min)

train_size_maj, max_len_maj, embed_size_maj = X_train_emb_majority.shape
X_train_emb_r_maj = X_train_emb_majority.reshape(train_size_maj, max_len*embed_size_maj)

In [None]:
X_train_emb_r_min.shape

In [None]:
X_train_emb_r_maj.shape

In [None]:
test_size, max_len, embed_size = X_test_emb.shape
X_test_emb_r = X_test_emb.reshape(test_size, max_len*embed_size)

In [None]:
X_test_emb_r.shape

<h2>Stacking All Data Minority & Majority</h2>

In [None]:
stack_train_minority = np.hstack((X_train_emb_r_min, X_train_sentiment_min))
stack_train_majority = np.hstack((X_train_emb_r_maj, X_train_sentiment_maj))

stack_testing = np.hstack((X_test_emb_r, X_test_sentiment))

In [None]:
print(f'Shape stack minority: {stack_train_minority.shape}')
print(f'Shape stack majority: {stack_train_majority.shape}')

In [None]:
# test_size, max_len, embed_size = X_test_emb.shape
# X_test_emb_r = X_test_emb.reshape(test_size, max_len*embed_size)

In [None]:
# X = df_ready['embedding']
# X_sentiment = np.array(df_ready['Sentimen_Text']).reshape(-1, 1)
# y = np.array(df_ready['Sarcasm'])

In [None]:
# from sklearn.model_selection import train_test_split
# # Bagi dataset menjadi train set dan test set
# X_train, X_test, X_sentiment_train, X_sentiment_test, y_train, y_test = train_test_split(X, X_sentiment, y, test_size=0.4, random_state=42)

In [None]:
# X_minority = X_train[y_train == 1]
# X_sentiment_minority = X_sentiment_train[y_train == 1]
# y_minority = y_train[y_train == 1]

In [None]:
# len(X_minority)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

def euclidean_distance(X1, X2):
    return euclidean_distances(X1, X2)

def get_neighbors(distances, k):
    return np.argsort(distances)[:, 1:k+1]

def generate_synthetic_samples(X_minority, neighbors, k_neighbors):
    synthetic_samples = []

    for i in range(len(X_minority)):
        minority_neighbors = neighbors[i][neighbors[i] < len(X_minority)]
        nn = np.random.choice(minority_neighbors, size=min(k_neighbors, len(minority_neighbors)), replace=False)
        alpha = np.random.uniform(0, 1)
        synthetic_sample = X_minority[i] + alpha * (X_minority[nn] - X_minority[i])
        
        if synthetic_sample.size > 1:
            synthetic_sample = synthetic_sample[:1]

        print(synthetic_sample)
        
        synthetic_samples.append(synthetic_sample)

    data_synthetic = np.vstack(synthetic_samples)
    for i in range(len(synthetic_samples)):
        print(synthetic_samples[i].shape)
    return data_synthetic

def smote(X_minority, X_majority, y_minority, k):
    minority_size = X_minority.shape[0]
    majority_size = X_majority.shape[0]  # Asumsi jumlah sampel mayoritas sama dengan jumlah sampel minoritas
    
    # Menghitung jumlah tetangga terdekat berdasarkan perbandingan antara jumlah sampel minoritas dan mayoritas
    k_neighbors = int((majority_size / minority_size) * k)
    
    if k_neighbors > k:
        k_neighbors = k

    # Menghitung jarak antara sampel minoritas dan mayoritas
    distances = euclidean_distance(X_minority, X_majority)
    print('Distance\n')
    print(f'{distances}\n')
    
    # Mendapatkan tetangga terdekat untuk setiap sampel minoritas
    neighbors = get_neighbors(distances, k_neighbors)
    print('Neighbors\n')
    print(f'{neighbors}\n')
    print(f'Length : {neighbors.shape[0]}\n')
    
    # Menghasilkan sampel sintetis
    print('Synthetic Data\n')
    synthetic_samples = generate_synthetic_samples(X_minority, neighbors, k_neighbors)
    synthetic_labels = np.ones(len(synthetic_samples))
    
    # Menggabungkan sampel minoritas asli dengan sampel sintetis
    X_oversampled = np.vstack((X_minority, synthetic_samples))
    y_oversampled = np.hstack((y_minority, synthetic_labels))
    
    return X_oversampled, y_oversampled

In [None]:
total_size = train_size_maj + train_size_min + test_size

In [None]:
# Menjalankan SMOTE pada data stack
X_minority = stack_train_minority
y_minority = y_train_min
X_majority = stack_train_minority
y_majority = y_train_maj

X_oversampled, y_oversampled = smote(X_minority, X_majority, y_minority, k=5)

# Memisahkan kembali data menjadi train dan test set
X_train_oversampled = X_oversampled[:total_size]
X_test_oversampled = X_oversampled[total_size:]

y_train_oversampled = y_oversampled[:total_size]
y_test_oversampled = y_oversampled[total_size:]

In [None]:
final_stack = np.vstack((stack_train_majority, X_oversampled))

In [None]:
len(final_stack)

In [None]:
final_stack = np.delete(final_stack, np.s_[4024::], axis=0)

In [None]:
final_stack.shape

In [None]:
final_stack_label = np.hstack((y_train_maj, y_oversampled))

In [None]:
final_stack_label = np.delete(final_stack_label, np.s_[4024::], axis=0)

In [None]:
print(f'Final Shape Data: {final_stack.shape}\n')

In [None]:
print(f'Final Shape Label: {final_stack_label.shape}\n')

In [None]:
# Menghitung Jumlah Akhir Sample pada Class 0 dan 1
unique, counts = np.unique(final_stack_label, return_counts=True)
result = np.asarray((unique, counts)).T.astype(int)

print(result[0][1])

In [None]:
import matplotlib.pyplot as plt

x = ['Sebelum', 'Sesudah']
y = [len(X_train_emb_r_min) + len(X_train_emb_r_maj), len(final_stack)]


plt.bar(x, y, color=[(0.38, 0.62, 0.79), (0.98, 0.64, 0.33)], edgecolor='black', linewidth=1.5)

for i in range(len(x)):
    plt.text(x[i], y[i], str(y[i]), ha='center', va='bottom')

plt.ylabel('Jumlah Sample')
plt.title('Perbandingan Data Training Setelah Diimplementasi SMOTE')

plt.show()

In [None]:
x = ['0', '1']
y = [1794, 1718]


plt.bar(x, y, color=[(0.38, 0.62, 0.79), (0.98, 0.64, 0.33)], edgecolor='black', linewidth=1.5)

for i in range(len(x)):
    plt.text(x[i], y[i], str(y[i]), ha='center', va='bottom')

plt.ylabel('Jumlah Sample')
plt.title('Jumlah Sample Masing-Masing Kelas Pada Data Training Sebelum Di Implementasi SMOTE')

plt.show()

In [None]:
x = ['0', '1']
y = [result[0][1], result[1][1]]


plt.bar(x, y, color=[(0.38, 0.62, 0.79), (0.98, 0.64, 0.33)], edgecolor='black', linewidth=1.5)

for i in range(len(x)):
    plt.text(x[i], y[i], str(y[i]), ha='center', va='bottom')

plt.ylabel('Jumlah Sample')
plt.title('Jumlah Sample Masing-Masing Kelas Setelah Di Implementasi SMOTE')

plt.show()

In [None]:
# X_embedding = np.array(df_ready['embedding'][:2]).reshape(-1, 1)
# sentiment_values = np.array(df_ready['Sentimen_Text'][:2]).reshape(-1, 1)

# num_rows = X_embedding.shape[0]

# X_sentiment = np.empty((num_rows, 1), dtype=np.str_)

# for i in range(num_rows):
#     X_sentiment[i] = sentiment_values[i % sentiment_values.shape[0]]

# X = np.hstack((X_embedding, X_sentiment))

# X

In [None]:
# y = df_ready['Sarcasm']

# y

In [None]:
# from sklearn.neighbors import NearestNeighbors

# def get_k_nearest_neighbors(X, sample, k):
#     # Mencari tetangga terdekat menggunakan k-NN
#     knn = NearestNeighbors(n_neighbors=k+1)
#     knn.fit(X)
    
#     # Mengembalikan indeks tetangga terdekat untuk sampel
#     _, indices = knn.kneighbors([sample])
#     indices_1d = indices.flatten()  # Mengubah menjadi 1D array
    
#     return indices_1d[1:]  # Menghilangkan indeks diri sendiri

In [None]:
# import pandas as pd
# import numpy as np
# import math

# def SMOTE(df, k, oversampling_ratio):
#     # Mendapatkan array numpy dari fitur embedding, sentiment, dan label
#     embeddings = np.vstack(df['embedding'].to_numpy())
#     labels = df['Sarcasm'].to_numpy()
    
#     # Menginisialisasi array untuk menyimpan sampel sintetis yang dihasilkan
#     synthetic_samples = []
    
#     # Mencari indeks sampel minoritas dan mayoritas
#     minority_indices = np.where(labels == 1)[0]
#     majority_indices = np.where(labels == 0)[0]
    
#     # Menghitung jumlah sampel sintetis yang akan dibuat untuk kelas minoritas
#     num_minority_samples = len(minority_indices)
#     num_majority_samples = len(majority_indices)
    
#     num_synthetic_samples = min(num_majority_samples - num_minority_samples, math.ceil(num_minority_samples * oversampling_ratio))
    
#     if num_synthetic_samples <= 0:
#         return pd.DataFrame()  # Return empty DataFrame jika tidak ada sampel sintetis yang perlu dibuat
    
#     # Looping melalui setiap sampel minoritas
#     for i in minority_indices:
#         sample = embeddings[i]
        
#         # Mencari tetangga terdekat untuk sampel minoritas
#         neighbors = get_k_nearest_neighbors(embeddings, sample, k)
        
#         # Menghasilkan sampel sintetis
#         for j in range(num_synthetic_samples):
#             # Memilih satu tetangga acak
#             neighbor_index = np.random.choice(neighbors)
            
#             if neighbor_index >= len(embeddings):
#                 continue  # Skip jika indeks tetangga melebihi ukuran data yang valid
            
#             neighbor = embeddings[neighbor_index]
            
#             # Menghitung selisih antara sampel dan tetangga
#             diff = neighbor - sample
            
#             # Menghasilkan sampel sintetis dengan proporsi acak
#             proportion = np.random.uniform(0, 1, size=1)
#             synthetic_sample = sample + proportion * diff
            
#             # Menambahkan sampel sintetis ke array
#             synthetic_samples.append(synthetic_sample)
    
#     # Mengubah array numpy ke DataFrame
#     embedding_array = np.vstack(synthetic_samples)
#     embedding_df = pd.DataFrame(embedding_array, columns=['embedding'] * embedding_array.shape[1])
#     sarcasm_df = pd.DataFrame(np.zeros((embedding_array.shape[0], 1)), columns=['Sarcasm'])
#     synthetic_df = pd.concat([embedding_df, sarcasm_df], axis=1)
    
#     # Menambahkan label kelas minoritas pada data sintetis
#     synthetic_df['Sarcasm'] = 1

#     return synthetic_df

In [None]:
# oversampled_df = SMOTE(df_ready, k=5, oversampling_ratio=1.0)

<h1>Training Bi-LSTM<h1>

In [44]:
# Bi-LSTM Algorithm

import numpy as np
import pickle as pkl

# activation function sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Turunan activation function sigmoid
def sigmoid_derivative(x):
    return x * (1 - x)

# activation function tanh
def tanh(x):
    return np.tanh(x)

# Turunan activation function tanh
def tanh_derivative(x):
    return 1 - x**2

class BiLSTM:
    # Constructor
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Parameter untuk Input Gate Menggunakan Metode Xavier
        self.W_i = np.random.uniform(-1, 1, (hidden_size, input_size)) * np.sqrt(1 / input_size)
        self.U_i = np.random.uniform(-1, 1, (hidden_size, hidden_size)) * np.sqrt(1 / hidden_size)
        self.b_i = np.zeros((hidden_size, 1))
        
        # Parameter untuk Forget Gate Menggunakan Metode Xavier
        self.W_f = np.random.uniform(-1, 1, (hidden_size, input_size)) * np.sqrt(1 / input_size)
        self.U_f = np.random.uniform(-1, 1, (hidden_size, hidden_size)) * np.sqrt(1 / hidden_size)
        self.b_f = np.zeros((hidden_size, 1))
        
        # Parameter untuk Output Gate Menggunakan Metode Xavier
        self.W_o = np.random.uniform(-1, 1, (hidden_size, input_size)) * np.sqrt(1 / input_size)
        self.U_o = np.random.uniform(-1, 1, (hidden_size, hidden_size)) * np.sqrt(1 / hidden_size)
        self.b_o = np.zeros((hidden_size, 1))
        
        # Parameter untuk Cell State Menggunakan Metode Xavier
        self.W_c = np.random.uniform(-1, 1, (hidden_size, input_size)) * np.sqrt(1 / input_size)
        self.U_c = np.random.uniform(-1, 1, (hidden_size, hidden_size)) * np.sqrt(1 / hidden_size)
        self.b_c = np.zeros((hidden_size, 1))
        
        # Parameter untuk output layer Menggunakan Metode Xavier
        self.W_y = np.random.uniform(-1, 1, (output_size, hidden_size*2)) * np.sqrt(1 / (hidden_size*2))
        self.b_y = np.zeros((output_size, 1))

        # Parameter m dan v pada Adam Optimizer untuk setiap parameter
        self.mW_i, self.vW_i = np.zeros_like(self.W_i), np.zeros_like(self.W_i)
        self.mU_i, self.vU_i = np.zeros_like(self.U_i), np.zeros_like(self.U_i)
        self.mb_i, self.vb_i = np.zeros_like(self.b_i), np.zeros_like(self.b_i)
        
        self.mW_f, self.vW_f = np.zeros_like(self.W_f), np.zeros_like(self.W_f)
        self.mU_f, self.vU_f = np.zeros_like(self.U_f), np.zeros_like(self.U_f)
        self.mb_f, self.vb_f = np.zeros_like(self.b_f), np.zeros_like(self.b_f)

        self.mW_o, self.vW_o = np.zeros_like(self.W_o), np.zeros_like(self.W_o)
        self.mU_o, self.vU_o = np.zeros_like(self.U_o), np.zeros_like(self.U_o)
        self.mb_o, self.vb_o = np.zeros_like(self.b_o), np.zeros_like(self.b_o)

        self.mW_c, self.vW_c = np.zeros_like(self.W_c), np.zeros_like(self.W_c)
        self.mU_c, self.vU_c = np.zeros_like(self.U_c), np.zeros_like(self.U_c)
        self.mb_c, self.vb_c = np.zeros_like(self.b_c), np.zeros_like(self.b_c)

        self.mW_y, self.vW_y = np.zeros_like(self.W_y), np.zeros_like(self.W_y)
        self.mb_y, self.vb_y = np.zeros_like(self.b_y), np.zeros_like(self.b_y)

        self.count = 0
        
    def forward_backward(self, x):
        T = len(x)
        # print(T)
        self.count += 1

        self.h_forward = np.zeros((T + 1, self.hidden_size))
        self.c_forward = np.zeros((T + 1, self.hidden_size))
        self.h_backward = np.zeros((T + 1, self.hidden_size))
        self.c_backward = np.zeros((T + 1, self.hidden_size))
        self.gates_forward = np.zeros((T, self.hidden_size*4, self.hidden_size))
        self.gates_backward = np.zeros((T, self.hidden_size*4, self.hidden_size))
        self.outputs = np.zeros((T, self.output_size))
        
        for t in range(T):
            # Forward
            self.gates_forward[t] = np.concatenate([
                sigmoid(self.W_i @ x[t] + self.U_i @ self.h_forward[t] + self.b_i),
                sigmoid(self.W_f @ x[t] + self.U_f @ self.h_forward[t] + self.b_f),
                sigmoid(self.W_o @ x[t] + self.U_o @ self.h_forward[t] + self.b_o),
                tanh(self.W_c @ x[t] + self.U_c @ self.h_forward[t] + self.b_c)
            ])
        
            # print(f'Forget Gate Forward Iterasi {self.count}: {sigmoid(self.W_f @ x[t] + self.U_f @ self.h_forward[t] + self.b_f)}\n')
            # print(f'Input Gate Forward Iterasi {self.count}: {sigmoid(self.W_i @ x[t] + self.U_i @ self.h_forward[t] + self.b_i)}\n')
            # print(f'Output Gate Forward Iterasi {self.count}: {sigmoid(self.W_o @ x[t] + self.U_o @ self.h_forward[t] + self.b_o)}\n')
            self.c_forward[t + 1] = self.gates_forward[t, 1] * self.c_forward[t] + self.gates_forward[t, 0] * self.gates_forward[t, 3]
            self.h_forward[t + 1] = self.gates_forward[t, 2] * tanh(self.c_forward[t + 1])

            # print(f'Cell State Forward Iterasi {self.count}: {self.gates_forward[t, 1] * self.c_forward[t] + self.gates_forward[t, 0] * self.gates_forward[t, 3]}\n')
            # print(f'Hidden State Forward Iterasi {self.count}: {self.gates_forward[t, 2] * tanh(self.c_forward[t + 1])}\n')

            # backward
            t_backward = T - t - 1
            self.gates_backward[t_backward] = np.concatenate([
                sigmoid(self.W_i @ x[t_backward] + self.U_i @ self.h_backward[t_backward] + self.b_i),
                sigmoid(self.W_f @ x[t_backward] + self.U_f @ self.h_backward[t_backward] + self.b_f),
                sigmoid(self.W_o @ x[t_backward] + self.U_o @ self.h_backward[t_backward] + self.b_o),
                tanh(self.W_c @ x[t_backward] + self.U_c @ self.h_backward[t_backward] + self.b_c)
            ])
            
            # print(f'Forget Gate Backward Iterasi {self.count}: {sigmoid(self.W_f @ x[t_backward] + self.U_f @ self.h_backward[t_backward] + self.b_f)}\n')
            # print(f'Input Gate Backward Iterasi {self.count}: {sigmoid(self.W_i @ x[t_backward] + self.U_i @ self.h_backward[t_backward] + self.b_i)}\n')
            # print(f'Output Gate Backward Iterasi {self.count}: {sigmoid(self.W_o @ x[t_backward] + self.U_o @ self.h_backward[t_backward] + self.b_o)}\n')
            self.c_backward[t_backward + 1] = self.gates_backward[t_backward, 1] * self.c_backward[t_backward] + self.gates_backward[t_backward, 0] * self.gates_backward[t_backward, 3]
            self.h_backward[t_backward + 1] = self.gates_backward[t_backward, 2] * tanh(self.c_backward[t_backward + 1])

            # print(f'Cell State Backward Iterasi {self.count}: {self.gates_backward[t_backward, 1] * self.c_backward[t_backward] + self.gates_backward[t_backward, 0] * self.gates_backward[t_backward, 3]}\n')
            # print(f'Hidden State Backward Iterasi {self.count}: {self.gates_backward[t_backward, 2] * tanh(self.c_backward[t_backward + 1])}\n')

            # Merged Forward and Backward LSTM
            merged_output = np.concatenate([self.h_forward[t + 1], self.h_backward[t_backward + 1]])
            # h_reshaped = np.reshape(merged_output, (self.hidden_size*2,))
            # print(f'Final Output Reshape: {h_reshaped}\n')
            sigmoids = sigmoid(self.W_y @ merged_output + self.b_y)
            # print(f'Sigmoids: {sigmoids.shape}\n')
            self.outputs[t] = sigmoids[0]
        # print(f'Merged Output: {merged_output}\n')
        # print(f'final Outputs{self.outputs}\n')
        
        return self.outputs
    
    def backPropagation(self, x, y, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        T = len(x)
        dW_i, dU_i, db_i = np.zeros_like(self.W_i), np.zeros_like(self.U_i), np.zeros_like(self.b_i)
        dW_f, dU_f, db_f = np.zeros_like(self.W_f), np.zeros_like(self.U_f), np.zeros_like(self.b_f)
        dW_o, dU_o, db_o = np.zeros_like(self.W_o), np.zeros_like(self.U_o), np.zeros_like(self.b_o)
        dW_c, dU_c, db_c = np.zeros_like(self.W_c), np.zeros_like(self.U_c), np.zeros_like(self.b_c)
        dW_y, db_y = np.zeros_like(self.W_y), np.zeros_like(self.b_y)
        dc_next = np.zeros((1, self.hidden_size))
        dh_next = np.zeros((1, self.hidden_size))
        
        for t in reversed(range(T)):
            dy = self.outputs[t]
            # print(f'Outputs: {dy}\n')
            # print(f'{y[t]}\n')
            dy[y[t]] -= 1
            # print(f'Outputs minus 1{dy}\n')
            dy_transpose = dy.reshape(1, -1).T
            # print(f'Shape dy_transpose: {dy_transpose.shape}\n')
            # print(f'Shape self.W_y.T : {self.W_y.shape}\n')
            dh_forward = self.W_y.T @ dy_transpose + dh_next
            # print(f'Shape dh_forward: {dh_forward.shape}\n')
            dc_forward = dh_forward * self.gates_forward[t, 2] * tanh_derivative(tanh(self.c_forward[t + 1])) + dc_next
            dh_backward = self.W_y.T @ dy_transpose + dh_next
            dc_backward = dh_backward * self.gates_backward[t, 2] * tanh_derivative(tanh(self.c_backward[t + 1])) + dc_next
            
            # print(f'Shape dy_transpose: {dy_transpose.shape}\n')
            # print(f'Shape dW_y: {dW_y.shape}\n')
            # print(f'Shape self.h_forward: {self.h_forward[t + 1].shape}\n')
            # dW_y += dy_transpose @ np.expand_dims(self.h_forward[t + 1], axis=0) + dy_transpose @ np.expand_dims(self.h_backward[t + 1], axis=0) # Salah
            dW_y += dy_transpose @ np.expand_dims(np.concatenate([self.h_forward[t + 1], self.h_backward[t + 1][:self.hidden_size]]), axis=0) # Benar
            db_y += dy_transpose
            
            # print(f'dc_forward: {dc_forward.shape}, self.gates_forward[t, 3]: {self.gates_forward[t, 3].shape}\n')
            dg_forward = dc_forward * self.gates_forward[t, 2] * tanh_derivative(self.gates_forward[t, 3])
            dg_backward = dc_backward * self.gates_backward[t, 3]

            dgates_forward = self.gates_forward[t]
            # print(f'Shape dgates_forward[3]: {dgates_forward.shape}\n')
            # dg_forward_expanded = np.expand_dims(dg_forward, axis=0)
            dgated = dg_forward * tanh_derivative(dgates_forward[3])
            # print(f'tanh_derivative Shape: {dgated.shape}\n')
            # print(f'Shape dg_forward : {dg_forward.shape}')
            dgates_forward[3][:self.hidden_size] = np.sum(dg_forward[:self.hidden_size] * tanh_derivative(dgates_forward[3])[:self.hidden_size], axis=1) # Error disini
            dgates_forward[0][:self.hidden_size] = np.sum(dc_forward[:self.hidden_size] * self.gates_forward[t, 0], axis=1)
            dgates_forward[1][:self.hidden_size] = np.sum(dc_forward[:self.hidden_size] * self.c_forward[t], axis=1)
            dgates_forward[2][:self.hidden_size] = np.sum(dh_forward[:self.hidden_size] * tanh(self.c_forward[t + 1]) * sigmoid_derivative(dgates_forward[2]), axis=1)

            dgates_backward = self.gates_backward[t]
            dgates_backward[3][:self.hidden_size] = np.sum(dg_backward[:self.hidden_size] * tanh_derivative(dgates_backward[3]))
            dgates_backward[0][:self.hidden_size] = np.sum(dc_backward[:self.hidden_size] * self.gates_backward[t, 0])
            dgates_backward[1][:self.hidden_size] = np.sum(dc_backward[:self.hidden_size] * self.c_backward[t])
            dgates_backward[2][:self.hidden_size] = np.sum(dh_backward[:self.hidden_size] * tanh(self.c_backward[t + 1]) * sigmoid_derivative(dgates_backward[2]))
            
            x_transpose = np.array([x[t]])
            dgates_transpose_forward_0 = np.array([dgates_forward[0]])
            dgates_transpose_forward_1 = np.array([dgates_forward[1]])
            dgates_transpose_forward_2 = np.array([dgates_forward[2]])
            dgates_transpose_forward_3 = np.array([dgates_forward[3]])

            dgates_transpose_backward_0 = np.array([dgates_backward[0]])
            dgates_transpose_backward_1 = np.array([dgates_backward[1]])
            dgates_transpose_backward_2 = np.array([dgates_backward[2]])
            dgates_transpose_backward_3 = np.array([dgates_backward[3]])
            
            dW_i += dgates_transpose_forward_0.T @ x_transpose + dgates_transpose_backward_0.T @ x_transpose
            dU_i += dgates_forward[0] @ self.h_forward[t].T + dgates_backward[0] @ self.h_backward[t].T
            db_i += dgates_transpose_forward_0.T + dgates_transpose_backward_0.T
            
            dW_f += dgates_transpose_forward_1.T @ x_transpose + dgates_transpose_backward_1.T @ x_transpose # Error disini
            dU_f += dgates_forward[1] @ self.h_forward[t].T + dgates_backward[1] @ self.h_backward[t].T
            db_f += dgates_transpose_forward_1.T + dgates_transpose_backward_1.T
            
            dW_o += dgates_transpose_forward_2.T @ x_transpose + dgates_transpose_backward_2.T @ x_transpose
            dU_o += dgates_forward[2] @ self.h_forward[t].T + dgates_backward[2] @ self.h_backward[t].T
            db_o += dgates_transpose_forward_2.T + dgates_transpose_backward_2.T
            
            dW_c += dgates_transpose_forward_3.T @ x_transpose + dgates_transpose_backward_3.T @ x_transpose
            dU_c += dgates_forward[3] @ self.h_forward[t].T + dgates_backward[3] @ self.h_backward[t].T
            db_c += dgates_transpose_forward_3.T + dgates_transpose_backward_3.T
            
            dh_next = self.U_i.T @ dgates_forward[0] + self.U_f.T @ dgates_forward[1] + self.U_o.T @ dgates_forward[2] + self.U_c.T @ dgates_forward[3]
            dc_next = dc_forward * self.gates_forward[t, 1]
        
        # Adam Optimizer
        for param, dparam, m, v in zip([self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, self.W_o, self.U_o, self.b_o, self.W_c, self.U_c, self.b_c, self.W_y, self.b_y],
                               [dW_i, dU_i, db_i, dW_f, dU_f, db_f, dW_o, dU_o, db_o, dW_c, dU_c, db_c, dW_y, db_y],
                               [self.mW_i, self.mU_i, self.mb_i, self.mW_f, self.mU_f, self.mb_f, self.mW_o, self.mU_o, self.mb_o, self.mW_c, self.mU_c, self.mb_c, self.mW_y, self.mb_y],
                               [self.vW_i, self.vU_i, self.vb_i, self.vW_f, self.vU_f, self.vb_f, self.vW_o, self.vU_o, self.vb_o, self.vW_c, self.vU_c, self.vb_c, self.vW_y, self.vb_y]):
            # print(f'Nilai Gradien Awal:\n{dparam}\n')
            m = beta1 * m + (1 - beta1) * dparam
            # print(f'Memperbaharui Estimasi momen pertama yang bias:\n {m}\n')
            v = beta2 * v + (1 - beta2) * (dparam ** 2)
            # print(f'Memperbaharui perkiraan momen kedua yang bias:\n {v}\n')
            m_hat = m / (1 - beta1)
            # print(f'Hasil perhitungan estimasi momen pertama yang dikoreksi bias:\n {m_hat}\n')
            v_hat = v / (1 - beta2)
            # print(f'Hasil perhitungan estimasi momen mentah kedua yang dikoreksi bias:\n {v_hat}\n')
            param -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
            # print(f'Nilai Update Parameter yang akan digunakan untuk epochs berikutnya: \n{param}')
        
        # for param in [dW_i, dU_i, db_i, dW_f, dU_f, db_f, dW_o, dU_o, db_o, dW_c, dU_c, db_c, dW_y, db_y]:
        #     np.clip(param, -1, 1, out=param)
        
        # self.W_i -= learning_rate * dW_i
        # self.U_i -= learning_rate * dU_i
        # self.b_i -= learning_rate * db_i
        
        # self.W_f -= learning_rate * dW_f
        # self.U_f -= learning_rate * dU_f
        # self.b_f -= learning_rate * db_f
        
        # self.W_o -= learning_rate * dW_o
        # self.U_o -= learning_rate * dU_o
        # self.b_o -= learning_rate * db_o
        
        # self.W_c -= learning_rate * dW_c
        # self.U_c -= learning_rate * dU_c
        # self.b_c -= learning_rate * db_c
        
        # self.W_y -= learning_rate * dW_y
        # self.b_y -= learning_rate * db_y
    
    def train(self, x, y, epochs, learning_rate):
        for epoch in range(epochs):
            outputs = self.forward_backward(x)
            self.backPropagation(x, y, learning_rate)
            # print(f'Hasil Prediksi {outputs}')
            outputs = np.clip(outputs, 1e-15, None)
            loss = -np.mean(np.log(outputs[np.arange(len(y)), y]))
            
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}')
    
    def predict(self, x):
        T = len(x)
        h_forward = np.zeros((T + 1, self.hidden_size))
        c_forward = np.zeros((T + 1, self.hidden_size))
        h_backward = np.zeros((T + 1, self.hidden_size))
        c_backward = np.zeros((T + 1, self.hidden_size))
        outputs = np.zeros((T, self.output_size))
        
        for t in range(T):
            # Forward
            gates_forward = np.concatenate([
                sigmoid(self.W_i @ x[t] + self.U_i @ h_forward[t] + self.b_i),
                sigmoid(self.W_f @ x[t] + self.U_f @ h_forward[t] + self.b_f),
                sigmoid(self.W_o @ x[t] + self.U_o @ h_forward[t] + self.b_o),
                tanh(self.W_c @ x[t] + self.U_c @ h_forward[t] + self.b_c)
            ])
                
            c_forward[t + 1] = gates_forward[1] * c_forward[t] + gates_forward[0] * gates_forward[3]
            h_forward[t + 1] = gates_forward[2] * tanh(c_forward[t + 1])
                
            # Backward
            t_backward = T - t - 1
            gates_backward = np.concatenate([
                sigmoid(self.W_i @ x[t_backward] + self.U_i @ h_backward[t_backward] + self.b_i),
                sigmoid(self.W_f @ x[t_backward] + self.U_f @ h_backward[t_backward] + self.b_f),
                sigmoid(self.W_o @ x[t_backward] + self.U_o @ h_backward[t_backward] + self.b_o),
                tanh(self.W_c @ x[t_backward] + self.U_c @ h_backward[t_backward] + self.b_c)
            ])
                
            c_backward[t_backward + 1] = gates_backward[1] * c_backward[t_backward] + gates_backward[0] * gates_backward[3]
            h_backward[t_backward + 1] = gates_backward[2] * tanh(c_backward[t_backward + 1])

            merged_output = np.concatenate([h_forward[t + 1], h_backward[t_backward + 1]])
            sigmoids = sigmoid(self.W_y @ merged_output + self.b_y)
            outputs[t] = sigmoids[0]
        
        return np.argmax(outputs, axis=1)
    
    def save_model(self, model, filename):
        with open(filename, 'wb') as f:
            pkl.dump(model, f)

<h2>Training & Testing Model</h2>

<h3>Train Model Bi-LSTM (Parameter embedding_size = 50)</h3>

<p>Bi-LSTM</p>

In [45]:
x_train = X_train_embedding_r
x_train = np.array(x_train)
# label = final_stack_label.astype('int')
y_trains = y_train

# Inisialisasi model Bi-LSTM
input_size = x_train.shape[1]
hidden_size = 6
output_size = 2
model_bilstm_embedsize_50 = BiLSTM(input_size, hidden_size, output_size)

In [36]:
x_train.shape

(3537, 3900)

In [37]:
y_trains.shape

(3537,)

In [38]:
X_test_embedding_r.shape

(394, 3900)

In [46]:
# Train ML Model
epochs = 300
learning_rate = 0.001
model_bilstm_embedsize_50.train(x_train, y_trains, epochs, learning_rate)

Epoch 1/300, Loss: 34.5388
Epoch 2/300, Loss: 34.5388
Epoch 3/300, Loss: 34.5388
Epoch 4/300, Loss: 34.5388
Epoch 5/300, Loss: 34.5388
Epoch 6/300, Loss: 34.5388
Epoch 7/300, Loss: 34.5388
Epoch 8/300, Loss: 34.5388
Epoch 9/300, Loss: 34.5388
Epoch 10/300, Loss: 34.5388
Epoch 11/300, Loss: 34.5388
Epoch 12/300, Loss: 34.5388
Epoch 13/300, Loss: 34.5388
Epoch 14/300, Loss: 34.5388
Epoch 15/300, Loss: 34.5388
Epoch 16/300, Loss: 34.5388
Epoch 17/300, Loss: 34.5388
Epoch 18/300, Loss: 34.5388
Epoch 19/300, Loss: 34.5388
Epoch 20/300, Loss: 34.5388
Epoch 21/300, Loss: 34.5388
Epoch 22/300, Loss: 34.5388
Epoch 23/300, Loss: 34.5388
Epoch 24/300, Loss: 34.5388
Epoch 25/300, Loss: 34.5388
Epoch 26/300, Loss: 34.5388
Epoch 27/300, Loss: 34.5388
Epoch 28/300, Loss: 34.5388
Epoch 29/300, Loss: 34.5388
Epoch 30/300, Loss: 34.5388
Epoch 31/300, Loss: 34.5388
Epoch 32/300, Loss: 34.5388
Epoch 33/300, Loss: 34.5388
Epoch 34/300, Loss: 34.5388
Epoch 35/300, Loss: 34.5388
Epoch 36/300, Loss: 34.5388
E

In [47]:
# Predict
output_predict = []
for i in range(len(X_test_embedding_r)):
    output_predict.append(model_bilstm_embedsize_50.predict(np.array([X_test_embedding_r[i]])))

In [48]:
output_predict

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], 

<h2>Pengujian Performansi</h2>

<h3>Bi-LSTM Without SMOTE</h3>

In [49]:
df_result_WS = pd.DataFrame.from_records(output_predict, columns=['Predict'])

In [50]:
df_result_WS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Predict  394 non-null    int64
dtypes: int64(1)
memory usage: 3.2 KB


In [51]:
output_actual_WS = []

for i in y_test:
    output_actual_WS.append(i)

In [52]:
df_actual_WS = pd.DataFrame(output_actual_WS, columns=['Actual'])

In [53]:
df_final_result_WS = df_result_WS.join(df_actual_WS)

In [54]:
df_final_result_WS.head()

Unnamed: 0,Predict,Actual
0,0,0
1,0,1
2,0,0
3,0,1
4,0,1


In [55]:
for i in range(len(df_final_result_WS)):
  if (df_final_result_WS.iloc[i]['Predict'] == 1 and df_final_result_WS.iloc[i]['Actual'] == 1):
    df_final_result_WS.loc[i, 'Category'] = 'TP'
  elif (df_final_result_WS.iloc[i]['Predict'] == 0 and df_final_result_WS.iloc[i]['Actual'] == 0):
    df_final_result_WS.loc[i, 'Category'] = 'TN'
  elif (df_final_result_WS.iloc[i]['Predict'] == 1 and df_final_result_WS.iloc[i]['Actual'] == 0):
    df_final_result_WS.loc[i, 'Category'] = 'FP'
  elif (df_final_result_WS.iloc[i]['Predict'] == 0 and df_final_result_WS.iloc[i]['Actual'] == 1):
    df_final_result_WS.loc[i, 'Category'] = 'FN'

In [59]:
TP_WS = len(df_final_result_WS[df_final_result_WS['Category']=='TP'])
TN_WS = len(df_final_result_WS[df_final_result_WS['Category']=='TN'])
FP_WS = len(df_final_result_WS[df_final_result_WS['Category']=='FP'])
FN_WS = len(df_final_result_WS[df_final_result_WS['Category']=='FN'])

accuracy_WS = round((TP_WS + TN_WS) / (TP_WS + TN_WS + FP_WS + FN_WS), 2) * 100
precision_WS = 0 if TP_WS == 0 or FP_WS == 0 else round(TP_WS / (TP_WS + FP_WS), 2) * 100
recall_WS = round(TP_WS / (TP_WS + FN_WS), 2) * 100
f_measure_WS = 0 if precision_WS == 0 else round(2 * precision_WS * recall_WS / (precision_WS + recall_WS), 2)

In [60]:
print(f'TP Bi-LSTM Scratch: {TP_WS}\n')
print(f'TN Bi-LSTM Scratch: {TN_WS}\n')
print(f'FP Bi-LSTM Scratch: {FP_WS}\n')
print(f'FN Bi-LSTM Scratch: {FN_WS}\n')

TP Bi-LSTM Scratch: 0

TN Bi-LSTM Scratch: 209

FP Bi-LSTM Scratch: 0

FN Bi-LSTM Scratch: 185



In [61]:
print(f'Accuracy: {accuracy_WS}%')
print(f'Precision: {precision_WS}%')
print(f'Recall: {recall_WS}%')
print(f'F1-Score: {f_measure_WS}%')

Accuracy: 53.0%
Precision: 0%
Recall: 0.0%
F1-Score: 0%


<h3>Bi-LSTM + SMOTE</h3>

In [68]:
df_result = pd.DataFrame.from_records(output_predict, columns=['Predict'])

In [69]:
df_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Predict  394 non-null    int64
dtypes: int64(1)
memory usage: 3.2 KB


In [70]:
output_actual = []

for i in y_test:
    output_actual.append(i)

In [71]:
df_actual = pd.DataFrame(output_actual, columns=['Actual'])

In [72]:
df_final_result = df_result.join(df_actual)

In [73]:
df_final_result.head(11)

Unnamed: 0,Predict,Actual
0,0,0
1,0,1
2,0,0
3,0,1
4,0,1
5,0,1
6,0,0
7,1,1
8,1,0
9,1,0


In [74]:
for i in range(len(df_final_result)):
  if (df_final_result.iloc[i]['Predict'] == 1 and df_final_result.iloc[i]['Actual'] == 1):
    df_final_result.loc[i, 'Category'] = 'TP'
  elif (df_final_result.iloc[i]['Predict'] == 0 and df_final_result.iloc[i]['Actual'] == 0):
    df_final_result.loc[i, 'Category'] = 'TN'
  elif (df_final_result.iloc[i]['Predict'] == 1 and df_final_result.iloc[i]['Actual'] == 0):
    df_final_result.loc[i, 'Category'] = 'FP'
  elif (df_final_result.iloc[i]['Predict'] == 0 and df_final_result.iloc[i]['Actual'] == 1):
    df_final_result.loc[i, 'Category'] = 'FN'

In [79]:
TP = len(df_final_result[df_final_result['Category']=='TP'])
TN = len(df_final_result[df_final_result['Category']=='TN'])
FP = len(df_final_result[df_final_result['Category']=='FP'])
FN = len(df_final_result[df_final_result['Category']=='FN'])

accuracy = round((TP + TN) / (TP + TN + FP + FN), 2) * 100
precision = round(TP / (TP + FP), 2) * 100
recall = round(TP / (TP + FN), 2) * 100
f_measure = round(2 * precision * recall / (precision + recall), 2)

In [80]:
print(f'TP Bi-LSTM Scratch: {TP}\n')
print(f'TN Bi-LSTM Scratch: {TN}\n')
print(f'FP Bi-LSTM Scratch: {FP}\n')
print(f'FN Bi-LSTM Scratch: {FN}\n')

TP Bi-LSTM Scratch: 135

TN Bi-LSTM Scratch: 118

FP Bi-LSTM Scratch: 91

FN Bi-LSTM Scratch: 50



In [81]:
print(f'Accuracy: {accuracy}%')
print(f'Precision: {precision}%')
print(f'Recall: {recall}%')
print(f'F1-Score: {f_measure}%')

Accuracy: 64.0%
Precision: 60.0%
Recall: 73.0%
F1-Score: 65.86%


<h1>Saving Model</h1>

In [None]:
# import h5py

# # Nama file .h5 untuk menyimpan model
# filename = 'modelBiLSTM_Scratch.h5'

# # Membuka file .h5 untuk penulisan
# with h5py.File(filename, 'w') as hf:
#     # Simpan parameter-parameter objek Bi-LSTM ke dalam grup 'bilstm'
#     bilstm_group = hf.create_group('bilstm')
    
#     bilstm_group.create_dataset('W_i', data=model_bilstm_embedsize_50.W_i)
#     bilstm_group.create_dataset('U_i', data=model_bilstm_embedsize_50.U_i)
#     bilstm_group.create_dataset('b_i', data=model_bilstm_embedsize_50.b_i)

#     bilstm_group.create_dataset('W_i', data=model_bilstm_embedsize_50.W_i)
#     bilstm_group.create_dataset('U_i', data=model_bilstm_embedsize_50.U_i)
#     bilstm_group.create_dataset('b_i', data=model_bilstm_embedsize_50.b_i)
    
#     # Simpan parameter-parameter lainnya ...
    
# print(f'Model disimpan dalam file {filename}')

<h3>Testing Result</h3>
<p>July 20 : 50%</p>
<p>July 21 : 53%</p>
<p>July 22 : 54%</p>
<p>July 25 : 61%</p>
<p>August 2 : 66%</p>

<h1>Sample Datasets BAB 3</h1>

In [None]:
df_bab3 = pd.read_excel('Misc/Sample_Datasets_Bab3_Ready.xlsx')
df_bab3.info()
df_bab3.head()

In [None]:
# Remove unused column/feature
df_bab3 = df_bab3.drop(columns='Unnamed: 0')
df_bab3 = df_bab3.drop(columns='Text')
df_bab3 = df_bab3.drop(columns='emoji')
df_bab3 = df_bab3.drop(columns='Tokenized')
df_bab3 = df_bab3.drop(columns='final_text')
df_bab3 = df_bab3.drop(columns='Pos_Word')
df_bab3 = df_bab3.drop(columns='Neg_Word')
df_bab3 = df_bab3.drop(columns='Total_Word')
df_bab3 = df_bab3.drop(columns='Pos_Ratio')
df_bab3 = df_bab3.drop(columns='Neg_Ratio')
df_bab3 = df_bab3.drop(columns='Tokenize_Emoji')
df_bab3 = df_bab3.drop(columns='Pos_Emoji')
df_bab3 = df_bab3.drop(columns='Neg_Emoji')
df_bab3 = df_bab3.drop(columns='Sentimen_Emoji')

In [None]:
df_bab3['Sarcasm'].value_counts()

In [None]:
df_bab3['text_emoji'][0]

<h2>Word Embedding</h2>

In [None]:
# Membangun model GloVe
glove_model = GloVe(df_bab3['text_emoji'])

# Membangun vocabulary
glove_model.build_vocab_information()

# Membangun co-occurrence matrix
glove_model.build_co_matrix()

# Melatih model GloVe
glove_model.train()

In [None]:
for word, id in glove_model.word2count.items():
    print(f'{word}: {id}')

In [None]:
# glove_model.generate_coomatrix_image()

In [None]:
glove_model.get_all_embeddings()

<h2>SMOTE</h2>

In [None]:
df_bab3['Sentimen_Text'] = df_bab3['Sentimen_Text'].replace('Positif', 1)
df_bab3['Sentimen_Text'] = df_bab3['Sentimen_Text'].replace('Negatif', 0)

df_bab3['Sarcasm'] = df_bab3['Sarcasm'].replace('Positif', 1)
df_bab3['Sarcasm'] = df_bab3['Sarcasm'].replace('Negatif', 0)

In [None]:
# Cek class minoritas
df_bab3['Sarcasm'].value_counts()

In [None]:
max_len = 50
len_voc = 40000

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_bab3, test_size=0.3, random_state=42, shuffle=False)

df_train_min = df_train[df_train['Sarcasm'] == 0]
df_train_maj = df_train[df_train['Sarcasm'] == 1]

df_test_min = df_test[df_test['Sarcasm'] == 0]
df_test_maj = df_test[df_test['Sarcasm'] == 1]

df_ready_min = pd.concat([df_train_min, df_test_min])
df_ready_maj = pd.concat([df_train_maj, df_test_maj])

In [None]:
# Tokenizer
tokenizer_min = make_tokenizer(df_ready_min['text_emoji'], len_voc)
tokenizer_maj = make_tokenizer(df_ready_maj['text_emoji'], len_voc)

In [None]:
# Sequence
X_train_min = tokenizer_min.texts_to_sequences(df_train_min['text_emoji'])
X_test_min = tokenizer_min.texts_to_sequences(df_test_min['text_emoji'])
X_train_major = tokenizer_maj.texts_to_sequences(df_train_maj['text_emoji'])
X_test_major = tokenizer_maj.texts_to_sequences(df_test_maj['text_emoji'])

In [None]:
df_train_min

In [None]:
X_train_min

In [None]:
# Padding
from keras.utils import pad_sequences

X_train_min = pad_sequences(X_train_min, maxlen=max_len, padding='post', truncating='post')
X_test_min = pad_sequences(X_test_min, maxlen=max_len, padding='post', truncating='post')
X_train_major = pad_sequences(X_train_major, maxlen=max_len, padding='post', truncating='post')
X_test_major = pad_sequences(X_test_major, maxlen=max_len, padding='post', truncating='post')

In [None]:
# Stacking Label Minority
y_train_min = df_train_min['Sarcasm'].values
y_test_min = df_test_min['Sarcasm'].values

stack_label_min = np.hstack((y_train_min, y_test_min))

stack_label_min.shape

In [None]:
# Stacking Label Majority
y_train_maj = df_train_maj['Sarcasm'].values
y_test_maj = df_test_maj['Sarcasm'].values

stack_label_maj = np.hstack((y_train_maj, y_test_maj))

stack_label_maj.shape

In [None]:
# Stacking All Data Minority & Majority
embed_mat_min = make_embedding_matrix(glove, tokenizer_min, len_voc)
embed_mat_maj = make_embedding_matrix(glove, tokenizer_maj, len_voc)

X_train_emb_minority = embed_mat_min[X_train_min]
X_test_emb_minority = embed_mat_min[X_test_min]

X_train_emb_majority = embed_mat_maj[X_train_major]
X_test_emb_majority = embed_mat_maj[X_test_major]

train_size_min, max_len_min, embed_size_min = X_train_emb_minority.shape
X_train_emb_r_min = X_train_emb_minority.reshape(train_size_min, max_len*embed_size_min)

test_size_min, max_len_min, embed_size_min = X_test_emb_minority.shape
X_test_emb_r_min = X_test_emb_minority.reshape(test_size_min, max_len*embed_size_min)

train_size_maj, max_len_maj, embed_size_maj = X_train_emb_majority.shape
X_train_emb_r_maj = X_train_emb_majority.reshape(train_size_maj, max_len*embed_size_maj)

test_size_maj, max_len_maj, embed_size_maj = X_test_emb_majority.shape
X_test_emb_r_maj = X_test_emb_majority.reshape(test_size_maj, max_len*embed_size_maj)

stack_minority = np.vstack((X_train_emb_r_min, X_test_emb_r_min))

stack_majority = np.vstack((X_train_emb_r_maj, X_test_emb_r_maj))

In [None]:
X_train_emb_minority[1]

In [None]:
print(f'Shape stack minority: {stack_minority.shape}')
print(f'Shape stack majority{stack_majority.shape}')

In [None]:
# Running SMOTE
total_size = train_size_maj + train_size_min + test_size_maj + test_size_min

X_minority = stack_minority
y_minority = stack_label_min
X_majority = stack_majority
y_majority = stack_label_maj

X_oversampled, y_oversampled = smote(X_minority, X_majority, y_minority, k=3)

In [None]:
final_stack = np.vstack((stack_majority, X_oversampled))

len(final_stack)

In [None]:
final_stack = np.delete(final_stack, np.s_[10::], axis=0)

In [None]:
final_stack_label = np.hstack((stack_label_maj, y_oversampled))

final_stack_label = np.delete(final_stack_label, np.s_[10::], axis=0)

In [None]:
print(f'Shape : {final_stack.shape}\n')
print(final_stack)

In [None]:
import matplotlib.pyplot as plt

x = ['Sebelum', 'Sesudah']
y = [len(stack_majority) + len(stack_minority), len(final_stack)]


plt.bar(x, y, color=[(0.38, 0.62, 0.79), (0.98, 0.64, 0.33)], edgecolor='black', linewidth=1.5)

for i in range(len(x)):
    plt.text(x[i], y[i], str(y[i]), ha='center', va='bottom')

plt.ylabel('Jumlah Sample')
plt.title('Perbandingan Data Setelah Diimplementasi SMOTE')

plt.show()

In [None]:
x = ['0', '1']
y = [df_bab3['Sarcasm'].value_counts()[0], df_bab3['Sarcasm'].value_counts()[1]]


plt.bar(x, y, color=[(0.38, 0.62, 0.79), (0.98, 0.64, 0.33)], edgecolor='black', linewidth=1.5)

for i in range(len(x)):
    plt.text(x[i], y[i], str(y[i]), ha='center', va='bottom')

plt.ylabel('Jumlah Sample')
plt.title('Jumlah Sample Masing-Masing Kelas Sebelum Di Implementasi SMOTE')

plt.show()

In [None]:
# Menghitung Jumlah Akhir Sample pada Class 0 dan 1
unique, counts = np.unique(final_stack_label, return_counts=True)
result = np.asarray((unique, counts)).T.astype(int)

print(result)

In [None]:
x = ['0', '1']
y = [result[0][1], result[1][1]]


plt.bar(x, y, color=[(0.38, 0.62, 0.79), (0.98, 0.64, 0.33)], edgecolor='black', linewidth=1.5)

for i in range(len(x)):
    plt.text(x[i], y[i], str(y[i]), ha='center', va='bottom')

plt.ylabel('Jumlah Sample')
plt.title('Jumlah Sample Masing-Masing Kelas Setelah Di Implementasi SMOTE')

plt.show()

<h2>Bi-LSTM</h2>

In [None]:
x_train = final_stack
x_train = np.array(x_train)
label = final_stack_label.astype('int')
y_train = label

# Inisialisasi model Bi-LSTM
input_size = x_train.shape[1]
hidden_size = 8
output_size = 2
bilstm = BiLSTM(input_size, hidden_size, output_size)

In [None]:
x_train

In [None]:
# Train ML Model
epochs = 100
learning_rate = 0.01
bilstm.train(x_train, y_train, epochs, learning_rate)

In [None]:
y_train

In [None]:
x_train[9]

In [None]:
# Predict ML Model
print(f'Predict Data 1 : {bilstm.predict(np.array([x_train[0]]))}\n')
print(f'Predict Data 2 : {bilstm.predict(np.array([x_train[1]]))}\n')
print(f'Predict Data 3 : {bilstm.predict(np.array([x_train[2]]))}\n')
print(f'Predict Data 4 : {bilstm.predict(np.array([x_train[3]]))}\n')
print(f'Predict Data 5 : {bilstm.predict(np.array([x_train[4]]))}\n')
print(f'Predict Data 6 : {bilstm.predict(np.array([x_train[5]]))}\n')
print(f'Predict Data 7 : {bilstm.predict(np.array([x_train[6]]))}\n')
print(f'Predict Data 8 : {bilstm.predict(np.array([x_train[7]]))}\n')
print(f'Predict Data 9 : {bilstm.predict(np.array([x_train[8]]))}\n')
print(f'Predict Data 10 : {bilstm.predict(np.array([x_train[9]]))}\n')