[](http://e7.pngegg.com/pngimages/360/846/png-clipart-human-behavior-thumb-homo-sapiens-sarcasm-logo-cartoon.png)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

from sklearn import model_selection, preprocessing, linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from termcolor import colored
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


import nltk
from nltk.corpus import stopwords
from textblob import Word

from nltk.tokenize import word_tokenize
from tqdm import tqdm
import re

from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential


from nltk.stem import PorterStemmer
import string

from warnings import filterwarnings
filterwarnings('ignore')

from sklearn import set_config
set_config(print_changed_only = False)

print(colored("\nLIBRARIES WERE SUCCESFULLY IMPORTED...", "green"))

[32m
LIBRARIES WERE SUCCESFULLY IMPORTED...[0m


In [2]:
df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [3]:
df = df.drop(columns=['article_link'])

In [4]:
#get basic information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
dtypes: int64(1), object(1)
memory usage: 447.3+ KB


In [5]:
# check whether there are duplicated values
# renaming the cols
df.rename(columns={'is_sarcastic':'sarcas','headline':'text'},inplace=True)
df.duplicated().sum()

116

In [6]:
# drop duplicated values from the dataset
df.drop_duplicates(inplace = True)

In [7]:
#get the number of classes of the "label" variable of dataset
df.groupby("sarcas").count().style.background_gradient(cmap = "autumn")

Unnamed: 0_level_0,text
sarcas,Unnamed: 1_level_1
0,14951
1,13552


<a id='top'></a>
<div class="list-group" id="list-tab" role="tablist">
<p style="background-color:#808000 ;font-family:arial;color:#FFFFFF;font-size:150%;text-align:center;border-radius:55px 1px;">Preprocess the dataset</p>

In [8]:
#convert uppercase letters to lowercase letters

df["text"] = df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))

print(colored("\nCONVERTED SUCCESFULLY...", "green"))

[32m
CONVERTED SUCCESFULLY...[0m


In [9]:
#delete punctuation marks

df["text"] = df["text"].str.replace('[^\w\s]','')

print(colored("\nDELETED PUNCTUATION MARKS SUCCESFULLY...", "green"))

[32m
DELETED PUNCTUATION MARKS SUCCESFULLY...[0m


In [10]:
#delete numbers

df["text"] = df["text"].str.replace('\d','')

print(colored("\n NUMBERS DELETED SUCCESFULLY...", "green"))

[32m
 NUMBERS DELETED SUCCESFULLY...[0m


In [11]:
#delete stopwords

sw = stopwords.words("english")
df["text"] = df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

print(colored("\nSTOPWORDS DELETED SUCCESFULLY...", "green"))

[32m
STOPWORDS DELETED SUCCESFULLY...[0m


In [12]:
#lemmatization. That is, we get the roots of the words

df["text_prc"] = df["text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

print(colored("\nDONE SUCCESFULLY...", "green"))

[32m
DONE SUCCESFULLY...[0m


In [13]:
ps = PorterStemmer()

def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
    return " ".join(y)

In [14]:
df['stem_text'] = df['text_prc'].apply(transform_text)

In [15]:
df.head()

Unnamed: 0,sarcas,text,text_prc,stem_text
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientist unveil doomsday cloc...,thirtysometh scientist unveil doomsday clock h...
1,0,dem rep. totally nails congress falling short ...,dem rep. totally nail congress falling short g...,dem total nail congress fall short gender raci...
2,0,eat veggies: 9 deliciously different recipes,eat veggies: 9 deliciously different recipe,eat veggi 9 delici differ recip
3,1,inclement weather prevents liar getting work,inclement weather prevents liar getting work,inclement weather prevent liar get work
4,1,mother comes pretty close using word 'streamin...,mother come pretty close using word 'streaming...,mother come pretti close use word correctli


In [16]:
#divide the dataset into test and train sets

x = df["stem_text"]
y = df["sarcas"]

train_x, test_x, train_y, test_y = model_selection.train_test_split(x, y,
                                                                    test_size = 0.20,
                                                                    shuffle = True,
                                                                    random_state = 11)

print(colored("\nDIVIDED SUCCESFULLY...", "green"))

[32m
DIVIDED SUCCESFULLY...[0m


In [17]:
print(train_x.shape, test_x.shape)

(22802,) (5701,)


<a id='top'></a>
<div class="list-group" id="list-tab" role="tablist">
<p style="background-color:#808000 ;font-family:arial;color:#FFFFFF;font-size:150%;text-align:center;border-radius:55px 1px;">Vectorize dataset with TFidfVectorizer method...</p>

In [18]:
# tf_idf_word_vectorizer = TfidfVectorizer(analyzer = "word")
# tf_idf_word_vectorizer.fit(train_x)

# x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
# x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

# x_train_tf_idf_word.toarray()

<a id='top'></a>
<div class="list-group" id="list-tab" role="tablist">
<p style="background-color:#808000 ;font-family:arial;color:#FFFFFF;font-size:150%;text-align:center;border-radius:55px 1px;">Build machine learning models...</p>

In [19]:
# log = linear_model.LogisticRegression()
# log_model = log.fit(x_train_tf_idf_word, train_y)
# accuracy = model_selection.cross_val_score(log_model,
#                                            x_test_tf_idf_word,
#                                            test_y,
#                                            cv = 20).mean()

# print("\nLogistic regression model with 'tf-idf' method")
# print("Accuracy ratio: ", accuracy)

In [20]:
# xgb = XGBClassifier()
# xgb_model = xgb.fit(x_train_tf_idf_word,train_y)
# accuracy = model_selection.cross_val_score(xgb_model,
#                                            x_test_tf_idf_word,
#                                            test_y,
#                                            cv = 20).mean()

# print("\nXGBoost model with 'tf-idf' method")
# print("Accuracy ratio: ", accuracy)

<a id='top'></a>
<div class="list-group" id="list-tab" role="tablist">
<p style="background-color:#808000 ;font-family:arial;color:#FFFFFF;font-size:150%;text-align:center;border-radius:55px 1px;">Build deep learning models...</p>

In [21]:
unique_words = set()
len_max = 0

for sent in tqdm(train_x):
    unique_words.update(sent)
    
    if(len_max<len(sent)):
        len_max = len(sent)
        
print(len(list(unique_words)))
print(len_max)

100%|████████████████████████████████████████████████████████████████████████| 22802/22802 [00:00<00:00, 278840.07it/s]

52
613





In [22]:
tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(train_x))

train_x = tokenizer.texts_to_sequences(train_x)
test_x = tokenizer.texts_to_sequences(test_x)

train_x = sequence.pad_sequences(train_x, maxlen=len_max)
test_x = sequence.pad_sequences(test_x, maxlen=len_max)

print(train_x.shape, test_x.shape)

(22802, 613) (5701, 613)


In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import joblib

joblib.dump(tokenizer, 'tokenizer_lstm.joblib')
print("tokenizer_lstm saved with joblib")

tokenizer_lstm saved with joblib


In [24]:
early_stopping = EarlyStopping(min_delta = 0.001, mode = "auto", verbose = 1,
                               monitor = "val_acc",
                               patience = 3)
callbacks = [early_stopping]

In [25]:
# model = Sequential()
# model.add(Embedding(len(list(unique_words)), 100, input_length = len_max))
# model.add(LSTM(64, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))
# model.add(Dense(25, activation = "relu"))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation = "sigmoid"))
# model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = 0.0045),
#               metrics = ["accuracy"])
# model.summary()

In [26]:
from tensorflow import keras

In [27]:
# model = keras.Sequential([
#     keras.layers.Embedding(len(list(unique_words)), 100, input_length=len_max),
#     keras.layers.Bidirectional(keras.layers.LSTM(64)),
#     keras.layers.Dense(24, activation='relu'),
#     keras.layers.Dense(1, activation='sigmoid')
# ])
# # compile model
# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])
# # model summary
# model.summary()

In [28]:
# history = model.fit(train_x, train_y, epochs = 5, validation_data = (test_x, test_y), 
#                   batch_size = 16, verbose = 1, callbacks = callbacks)
# history = model.fit(train_x, train_y,
#                     epochs=5, verbose=1,
#                     validation_split=0.1)

In [29]:
# history2 = model.fit(train_x, train_y, 
#                     epochs=10, verbose=1, 
#                     validation_split=0.1)

In [30]:
# history3 = model.fit(train_x, train_y, 
#                     epochs=50, verbose=1, 
#                     validation_split=0.1)

In [31]:
# epoch_num = range(1, len(history3.history["loss"]) + 1)
# plt.plot(epoch_num, history3.history["loss"], "r--")
# plt.plot(epoch_num, history3.history["val_loss"], "b-")
# plt.legend(["Training loss", "Validation loss"])
# plt.xlabel("Epoch numbers")
# plt.ylabel("Loss")
# plt.show()

In [32]:
# model.save('LSTM.keras')

In [33]:
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'

train_x, test_x, train_y, test_y = model_selection.train_test_split(x, y,
                                                                    test_size = 0.20,
                                                                    shuffle = True,
                                                                    random_state = 11)

# tokenizer = Tokenizer(num_words=len(list(unique_words)))
# tokenizer.fit_on_texts(list(train_x))

tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(list(train_x))
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_x)
train_padded = sequence.pad_sequences(train_sequences, padding='post', maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(test_x)
test_padded = sequence.pad_sequences(test_sequences, padding='post', maxlen=max_length)


In [34]:
from tensorflow import keras
from tensorflow.keras import regularizers

model2 = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64, 
                                                 kernel_regularizer=regularizers.l2(0.01))),  # Hanya regularizer L2 pada LSTM
    # Menghapus salah satu Dropout layer
    keras.layers.Dense(24, activation='relu'),  # Menghapus L2 regularization pada Dense layer
    keras.layers.Dropout(0.3),  # Mengganti dropout rate dari 0.5 ke 0.3
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile model
model2.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

# Model summary
model2.summary()

In [35]:
#num_epochs = 5
#history_ex = model2.fit(train_padded, train_y, 
  #                  epochs=num_epochs, verbose=1, 
   #                 validation_split=0.2)

In [36]:
from tensorflow.keras.callbacks import Callback

num_epochs = 5

class SaveAtEpoch(Callback):
    def __init__(self, save_epoch, save_path):
        super(SaveAtEpoch, self).__init__()
        self.save_epoch = save_epoch
        self.save_path = save_path

    def on_epoch_end(self, epoch, logs=None):
        if epoch + 1 == self.save_epoch:
            self.model.save(self.save_path, save_format='keras')
            print(f"\nModel saved at epoch {self.save_epoch}")

# Inisialisasi callback
save_at_epoch_2 = SaveAtEpoch(save_epoch=2, save_path='LSTM_MODEL_SARCASM.keras')

# Train the model and save at epoch 2
history_ex = model2.fit(train_padded, train_y, 
                        epochs=num_epochs, 
                        validation_split=0.2, 
                        verbose=1, 
                        callbacks=[save_at_epoch_2])

Epoch 1/5
[1m571/571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 194ms/step - accuracy: 0.6310 - loss: 1.2636 - val_accuracy: 0.7814 - val_loss: 0.4691
Epoch 2/5
[1m571/571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - accuracy: 0.8153 - loss: 0.4294




Model saved at epoch 2
[1m571/571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 190ms/step - accuracy: 0.8153 - loss: 0.4295 - val_accuracy: 0.7781 - val_loss: 0.4730
Epoch 3/5
[1m571/571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 197ms/step - accuracy: 0.8381 - loss: 0.3919 - val_accuracy: 0.7827 - val_loss: 0.4613
Epoch 4/5
[1m571/571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 216ms/step - accuracy: 0.8659 - loss: 0.3404 - val_accuracy: 0.7779 - val_loss: 0.4805
Epoch 5/5
[1m571/571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 222ms/step - accuracy: 0.8731 - loss: 0.3248 - val_accuracy: 0.7748 - val_loss: 0.5247


In [38]:
from tensorflow.keras.models import load_model

# Load the model saved at epoch 2
best_model_epoch_2 = load_model('LSTM_MODEL_SARCASM.keras')

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Dapatkan prediksi untuk data training dan testing
train_predictions_epoch_2 = (best_model_epoch_2.predict(train_padded) > 0.5).astype("int32")
test_predictions_epoch_2 = (best_model_epoch_2.predict(test_padded) > 0.5).astype("int32")

# Menghitung metrik pada data training
train_accuracy_epoch_2 = accuracy_score(train_y, train_predictions_epoch_2)
train_precision_epoch_2 = precision_score(train_y, train_predictions_epoch_2)
train_recall_epoch_2 = recall_score(train_y, train_predictions_epoch_2)
train_f1_epoch_2 = f1_score(train_y, train_predictions_epoch_2)

# Menghitung metrik pada data testing
test_accuracy_epoch_2 = accuracy_score(test_y, test_predictions_epoch_2)
test_precision_epoch_2 = precision_score(test_y, test_predictions_epoch_2)
test_recall_epoch_2 = recall_score(test_y, test_predictions_epoch_2)
test_f1_epoch_2 = f1_score(test_y, test_predictions_epoch_2)

# Cetak hasilnya
print(f"Training Accuracy (Epoch 2): {train_accuracy_epoch_2:.4f}")
print(f"Training Precision (Epoch 2): {train_precision_epoch_2:.4f}")
print(f"Training Recall (Epoch 2): {train_recall_epoch_2:.4f}")
print(f"Training F1-score (Epoch 2): {train_f1_epoch_2:.4f}")

print(f"Testing Accuracy (Epoch 2): {test_accuracy_epoch_2:.4f}")
print(f"Testing Precision (Epoch 2): {test_precision_epoch_2:.4f}")
print(f"Testing Recall (Epoch 2): {test_recall_epoch_2:.4f}")
print(f"Testing F1-score (Epoch 2): {test_f1_epoch_2:.4f}")

[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 59ms/step
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 62ms/step
Training Accuracy (Epoch 2): 0.8361
Training Precision (Epoch 2): 0.8233
Training Recall (Epoch 2): 0.8351
Training F1-score (Epoch 2): 0.8292
Testing Accuracy (Epoch 2): 0.7692
Testing Precision (Epoch 2): 0.7495
Testing Recall (Epoch 2): 0.7674
Testing F1-score (Epoch 2): 0.7584


In [None]:
# history_ex2 = model2.fit(train_padded, train_y, 
#                     epochs=10, verbose=1, 
#                     validation_split=0.1)

In [40]:
pip install tensorflow==2.9.0

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tensorflow==2.9.0 (from versions: 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0, 2.13.1, 2.14.0rc0, 2.14.0rc1, 2.14.0, 2.14.1, 2.15.0rc0, 2.15.0rc1, 2.15.0, 2.15.1, 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0)
ERROR: No matching distribution found for tensorflow==2.9.0
