<a href="https://colab.research.google.com/github/AnnweshaAdhikari/Sarcasm-detector-using-NLP./blob/main/Sarcasm_detector_using_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Preprocessing

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus: the tokenizer's vocabulary is learned from these two sentences only.
sentences = [
    "Hi I am a person",
    "A person who wants to have a little happiness",
]

# Sentences the tokenizer is NOT fitted on; encoding them shows how words
# outside the learned vocabulary are handled.
token_sentences = [
    "Is it okay to be just a person?",
    "Maybe I should just be a person",
]

# Words absent from the fitted vocabulary are encoded with the "*oov*" token.
tokenizer = Tokenizer(oov_token="*oov*")
tokenizer.fit_on_texts(sentences)

# Word -> integer id mapping produced by fitting.
word_index = tokenizer.word_index
print(word_index)

# Encode the corpus, then pad at the end of each sequence (padding="post")
# so the result is a rectangular array.
text_to_sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(text_to_sequences, padding="post")
print(text_to_sequences, padded, sep="\n")

# Encode the unseen sentences with the already-fitted tokenizer.
test_token = tokenizer.texts_to_sequences(token_sentences)
print(test_token)

{'*oov*': 1, 'a': 2, 'person': 3, 'hi': 4, 'i': 5, 'am': 6, 'who': 7, 'wants': 8, 'to': 9, 'have': 10, 'little': 11, 'happiness': 12}
[[4, 5, 6, 2, 3], [2, 3, 7, 8, 9, 10, 2, 11, 12]]
[[ 4  5  6  2  3  0  0  0  0]
 [ 2  3  7  8  9 10  2 11 12]]
[[1, 1, 1, 9, 1, 1, 2, 3], [1, 5, 1, 1, 1, 2, 3]]


In [None]:
## Dataset by Rishabh Mishra

!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/sarcasm.json \
    -O /tmp/sarcasm.json


--2024-05-22 21:37:27--  https://storage.googleapis.com/learning-datasets/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.115.207, 172.253.122.207, 172.253.63.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.115.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2024-05-22 21:37:27 (152 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [None]:
# Hyperparameters for the tokenization, padding and model cells.
vocab_size = 30000     # first argument (input_dim) of the Embedding layers
embedding_dim = 16     # size of each learned word vector
max_length = 100       # every sequence is padded/truncated to this many tokens

# Both padding and truncation act on the END of a sequence.
trunc_type = padding_type = "post"

# Placeholder string standing in for out-of-vocabulary words.
oov_tok = "*oov*"

In [None]:
## Looking at articles and deciding if they are sarcastic or not

# Tokenization

import json

# The downloaded file is a JSON list of records; each record is read for its
# 'headline', 'is_sarcastic' and 'article_link' fields.
with open("/tmp/sarcasm.json", "r", encoding="utf-8") as file:
  datastore = json.load(file)

sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

# The first `training_size` records are used for training, the rest for testing.
training_size = 20000
training_sentences = sentences[:training_size]
test_sentences = sentences[training_size:]
training_labels = labels[:training_size]
test_labels = labels[training_size:]

# Fit the vocabulary on the training split only, so test headlines exercise the
# OOV token. `num_words=vocab_size` keeps every emitted token id below
# `vocab_size`, which the models pass to Embedding as input_dim; `oov_tok` is
# the shared config constant rather than a repeated literal.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode and pad/truncate both splits to a fixed length of `max_length`.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
padded_training = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
padded_test = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Quick sanity check: first encoded example and array shape for each split.
print(padded_training[0])
print(padded_training.shape)

print(padded_test[0])
print(padded_test.shape)

[  328 12776   799  3405  2404    47   389  2214 12777     6  2614  8863
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(20000, 100)
[17706  1100  6663  9423    30 11505  2439     5   519   109     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0

In [None]:
# Embedding

# Baseline classifier: embed each token, average the token vectors over the
# sequence, then a small dense head ending in a single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
import numpy as np

# The label lists are plain Python lists at this point; convert everything that
# is handed to model.fit into NumPy arrays. np.asarray avoids copying inputs
# that are already arrays.
padded_training = np.asarray(padded_training)
training_labels = np.asarray(training_labels)
# Convert under the name the training cell actually uses (`padded_test`).
padded_test = np.asarray(padded_test)
test_labels = np.asarray(test_labels)

# Backward-compatible alias for the name this cell originally defined.
padded_testing = padded_test

In [None]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 100, 16)           480000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
 2 (GlobalAveragePooling1D)                                      
                                                                 
 dense_32 (Dense)            (None, 24)                408       
                                                                 
 dense_33 (Dense)            (None, 1)                 25        
                                                                 
Total params: 480433 (1.83 MB)
Trainable params: 480433 (1.83 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
## Training and Testing

# Upper bound on the number of epochs; early stopping normally ends the run sooner.
num_epoch = 30

# Training this model for the full 30 epochs overfits: validation loss bottoms
# out after a few epochs and then climbs while training loss keeps falling.
# Stop once val_loss has not improved for 3 epochs and restore the best weights.
# NOTE: the held-out split drives early stopping AND is the reported metric, so
# the reported validation accuracy is slightly optimistic.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    padded_training,
    training_labels,
    epochs=num_epoch,
    validation_data=(padded_test, test_labels),
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/30
625/625 - 8s - loss: 0.6557 - accuracy: 0.5923 - val_loss: 0.5480 - val_accuracy: 0.8128 - 8s/epoch - 13ms/step
Epoch 2/30
625/625 - 6s - loss: 0.4009 - accuracy: 0.8481 - val_loss: 0.3725 - val_accuracy: 0.8462 - 6s/epoch - 9ms/step
Epoch 3/30
625/625 - 7s - loss: 0.2743 - accuracy: 0.8964 - val_loss: 0.3439 - val_accuracy: 0.8562 - 7s/epoch - 11ms/step
Epoch 4/30
625/625 - 6s - loss: 0.2121 - accuracy: 0.9225 - val_loss: 0.3609 - val_accuracy: 0.8450 - 6s/epoch - 9ms/step
Epoch 5/30
625/625 - 7s - loss: 0.1662 - accuracy: 0.9423 - val_loss: 0.3482 - val_accuracy: 0.8557 - 7s/epoch - 11ms/step
Epoch 6/30
625/625 - 6s - loss: 0.1309 - accuracy: 0.9571 - val_loss: 0.3579 - val_accuracy: 0.8562 - 6s/epoch - 9ms/step
Epoch 7/30
625/625 - 7s - loss: 0.1036 - accuracy: 0.9665 - val_loss: 0.4020 - val_accuracy: 0.8445 - 7s/epoch - 11ms/step
Epoch 8/30
625/625 - 6s - loss: 0.0825 - accuracy: 0.9762 - val_loss: 0.3983 - val_accuracy: 0.8532 - 6s/epoch - 9ms/step
Epoch 9/30
625/625 -

In [None]:
## Scoring a few hand-written sentences with the baseline model

example_sentences = [
    "Oh, I am so thrilled to be this lonely in life.",
    "I am going home tomorrow.",
    "Of course, you truly are the symbol of cleanliness.",
]

# Apply the same preprocessing as the training data: encode with the fitted
# tokenizer, then pad/truncate to `max_length`.
example_sequences = tokenizer.texts_to_sequences(example_sentences)
example_padded = pad_sequences(
    example_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# One sigmoid output per sentence.
example_scores = model.predict(example_padded)
print(example_scores)

[[9.113963e-09]
 [3.007744e-08]
 [9.996564e-01]]


In [None]:
# Embedding but with Long Short-Term Memory (LSTM)

# Same embedding front-end as the baseline, followed by two stacked
# bidirectional LSTMs. The first returns the full sequence so the second LSTM
# has a sequence to consume; the second returns only its final state.
model_new = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model_new.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model_new.summary()

# Train the LSTM model. Without this step the model keeps its random initial
# weights and predicts ~0.5 for every input. Early stopping on validation loss
# (best weights restored) guards against overfitting.
history_new = model_new.fit(
    padded_training,
    training_labels,
    epochs=num_epoch,
    validation_data=(padded_test, test_labels),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True,
        )
    ],
    verbose=2,
)

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_19 (Embedding)    (None, 100, 16)           480000    
                                                                 
 bidirectional_6 (Bidirecti  (None, 100, 128)          41472     
 onal)                                                           
                                                                 
 bidirectional_7 (Bidirecti  (None, 64)                41216     
 onal)                                                           
                                                                 
 dense_34 (Dense)            (None, 24)                1560      
                                                                 
 dense_35 (Dense)            (None, 1)                 25        
                                                                 
Total params: 564273 (2.15 MB)
Trainable params: 5642

In [None]:
## Scoring the same hand-written sentences with the LSTM model

example_sentences = [
    "Oh, I am so thrilled to be this lonely in life.",
    "I am going home tomorrow.",
    "Of course, you truly are the symbol of cleanliness.",
]

# Encode with the fitted tokenizer ...
example_sequences = tokenizer.texts_to_sequences(example_sentences)

# ... and pad/truncate to the fixed input length.
padding_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
example_padded = pad_sequences(example_sequences, **padding_options)

# One sigmoid output per sentence from `model_new`.
print(model_new.predict(example_padded))

[[0.49986562]
 [0.5009383 ]
 [0.50056624]]
