<a href="https://colab.research.google.com/github/FrodoBaggins87/NLP_Projects/blob/main/Sarcasm_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [43]:
import json
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load Data

In [44]:
import json

# Parallel lists: position i in each list describes the same headline record.
sentences = []
labels = []
urls = []

# 'sarcasm.json' is parsed one line at a time, each line holding one JSON object.
# An explicit encoding keeps decoding independent of the platform default.
with open('sarcasm.json', 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        try:
            item = json.loads(line)
        except json.JSONDecodeError as e:
            # Report and skip an unparsable line instead of aborting the whole load.
            print(f"Error decoding JSON on line {line_number}: {e}")
            continue

        # Look up every field BEFORE appending anything: if a key is missing the
        # KeyError is raised while the three lists are still the same length, so
        # the kernel is never left holding misaligned sentences/labels/urls.
        headline = item['headline']
        label = item['is_sarcastic']
        url = item['article_link']

        sentences.append(headline)
        labels.append(label)
        urls.append(url)

# Quick sanity check of what was loaded.
# NOTE(review): a decode error on the very last line (with every earlier line
# parsed) would suggest the data file is truncated - confirm against the source
# dataset.
print(f"Total headlines: {len(sentences)}")
if sentences:
    print(f"First headline: {sentences[0]}")
    print(f"First label: {labels[0]}")
    print(f"First URL: {urls[0]}")
else:
    # Avoid an IndexError on sentences[0] when the file yielded no records.
    print("No headlines were loaded; check the contents of 'sarcasm.json'.")


Error decoding JSON on line 14880: Unterminated string starting at: line 1 column 33 (char 32)
Total headlines: 14879
First headline: thirtysomething scientists unveil doomsday clock of hair loss
First label: 1
First URL: https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205


## Tokenize

In [45]:
# Exploratory pass over the FULL corpus: the tokenizer here is fitted on every
# headline in `sentences` (training and testing portions alike). Later cells
# read `padded.shape[1]` and `len(word_index)` from this pass.
tokenizer = Tokenizer(oov_token="<OOV>")  # words unseen at fit time map to "<OOV>"
tokenizer.fit_on_texts(sentences)         # build the word -> index mapping
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
# No maxlen is given, so every row is padded (at the end) to the longest sequence.
padded = pad_sequences(sequences, padding='post')

print(padded[0])
print(padded.shape)
print(len(word_index))

[10803   321  2710  5699  2711     3   513   911     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
(14879, 152)
22326


## Slice sentences and labels into training and test datasets

In [46]:
# Positional hold-out split: the leading `percent` fraction of the records (in
# file order, no shuffling) is used for training and the remainder for testing.
percent = 0.8
training_size = int(percent * len(sentences))

training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]

training_labels = labels[:training_size]
testing_labels = labels[training_size:]

## Tokenize training and testing separately

In [47]:
# Preprocessing settings shared by the training and testing pipelines.
# NOTE(review): both derived values depend on the earlier full-corpus pass, and
# `word_index` is rebound by the training tokenizer further down, so re-running
# this cell out of order yields a different `vocab_size`.
max_length = padded.shape[1]      # width of the full-corpus padded matrix
vocab_size = len(word_index) + 1  # +1 because index 0 is the padding value
oov_token = "<OOV>"
padding_type = 'post'
trunc_type = 'post'

In [48]:
# Rebuild the tokenizer so that its vocabulary is learned from the training
# headlines only; the testing headlines must not influence the word index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode the training headlines as integer sequences, then pad/truncate each
# one to `max_length` so they stack into a single 2-D array.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

print(training_padded[0])
print(training_padded.shape)

[9414  297 2514 4811 2515    3  494  746    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
(11903, 152)


In [49]:
# Encode the testing headlines with the tokenizer already held in `tokenizer`
# (no re-fitting here), then pad/truncate them to `max_length`.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

print(testing_padded[0])
print(testing_padded.shape)

[   83    49    60    48 19799 19800    41  7025   592  9169     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
(2976, 152)


## Make the model

In [50]:
embedding_dim = 100  # passed to the Embedding layer as its output_dim

In [51]:
# Binary classifier assembled layer by layer:
#   token ids -> embedding vectors -> average over the sequence axis
#   -> 24-unit ReLU layer -> single sigmoid unit (output in [0, 1]).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Train Model

In [52]:
# Check the types of your data
print(type(training_padded))
print(type(training_labels))
print(type(testing_padded))
print(type(testing_labels))


<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>


In [53]:
# Convert labels to numpy arrays if they are not already
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [55]:
# Train on the padded training set and evaluate on the held-out set after
# every epoch; `history` keeps the per-epoch metrics returned by fit().
# NOTE(review): the execution counter jumps from 53 to 55 and the stored log
# already starts at ~0.91 training accuracy in "Epoch 1", which suggests this
# cell was run again on an already-trained model - rebuild the model and re-run
# from a fresh kernel to confirm.
num_epochs = 35
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,  # one summary line per epoch
)

Epoch 1/35
372/372 - 8s - 22ms/step - accuracy: 0.9120 - loss: 0.2216 - val_accuracy: 0.8454 - val_loss: 0.3612
Epoch 2/35
372/372 - 10s - 26ms/step - accuracy: 0.9144 - loss: 0.2147 - val_accuracy: 0.8485 - val_loss: 0.3564
Epoch 3/35
372/372 - 8s - 22ms/step - accuracy: 0.9264 - loss: 0.1916 - val_accuracy: 0.8411 - val_loss: 0.3838
Epoch 4/35
372/372 - 10s - 28ms/step - accuracy: 0.9366 - loss: 0.1654 - val_accuracy: 0.8448 - val_loss: 0.3724
Epoch 5/35
372/372 - 10s - 26ms/step - accuracy: 0.9388 - loss: 0.1605 - val_accuracy: 0.8468 - val_loss: 0.3748
Epoch 6/35
372/372 - 8s - 21ms/step - accuracy: 0.9449 - loss: 0.1461 - val_accuracy: 0.8471 - val_loss: 0.3830
Epoch 7/35
372/372 - 11s - 28ms/step - accuracy: 0.9477 - loss: 0.1342 - val_accuracy: 0.8239 - val_loss: 0.4338
Epoch 8/35
372/372 - 10s - 27ms/step - accuracy: 0.9569 - loss: 0.1167 - val_accuracy: 0.7732 - val_loss: 0.6310
Epoch 9/35
372/372 - 8s - 21ms/step - accuracy: 0.9560 - loss: 0.1185 - val_accuracy: 0.8038 - val_