In [1]:
import json
import tensorflow as tf
import requests
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
# TensorFlow is an open-source machine learning library developed by Google
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-01-15 08:43:24.919895: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the headlines dataset. The file is in JSON Lines format (one JSON
# record per line), hence lines=True.
data = pd.read_json(
    'Sarcasm_Headlines_Dataset.json',
    lines=True,
)

In [3]:
# Peek at the first five rows to check the columns that were loaded.
data.head(5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
# Preview the frame without the 'article_link' column, which the later cells
# never read. The result is only displayed, not assigned, so `data` itself
# still contains the column.
data.drop(columns=['article_link'])

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [5]:
# Pull the two columns we need out of the DataFrame as plain Python lists:
# the headline text and its 0/1 sarcasm label.
sentence = data['headline'].to_list()
label = data['is_sarcastic'].to_list()

In [6]:
# Show the first 10 headlines.
sentence[:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [7]:
# Show the first 10 labels (1 = sarcastic, 0 = not sarcastic).
label[:10]

[0, 0, 1, 1, 0, 0, 0, 0, 1, 0]

### Splitting the dataset

In [8]:
# Positional 75/25 split: the first 75% of the rows are used for training and
# the remaining 25% for testing. No shuffling is applied.
train_size = round(0.75 * len(sentence))
train_sen, test_sen = sentence[:train_size], sentence[train_size:]
train_lab, test_lab = label[:train_size], label[train_size:]

In [9]:
# Tokenizer settings: vocabulary cap and the placeholder token that stands in
# for words outside that vocabulary.
vocab_size, oov_tok = 10000, "oov"

### Tokenize

In [10]:
# Build the vocabulary from the training headlines only, so the test set
# stays unseen.
#   num_words - cap on the vocabulary used when converting text to sequences;
#               only the most common (num_words - 1) words are kept
#   oov_token - added to word_index and substituted for out-of-vocabulary
#               words during texts_to_sequences calls
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)
tokenizer.fit_on_texts(train_sen)

# Dictionary mapping each token to its integer index.
word_index = tokenizer.word_index

### Padding

In [11]:
# Every sequence is padded or cut to exactly max_length tokens. If no maxlen
# were passed to pad_sequences, the longest sequence would set the length.
max_length = 100
# 'post' means both padding and truncation happen at the end of the sequence.
padding_type = trunc_type = 'post'

In [12]:
# Step 1: replace every word with its integer index, because the model can
# only train on numbers.
training_sequences = tokenizer.texts_to_sequences(train_sen)
testing_sequences = tokenizer.texts_to_sequences(test_sen)

# Step 2: headlines naturally differ in length, but the network needs a fixed
# input size, so pad/truncate every sequence to max_length.
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [13]:
# Number of dimensions each word is embedded into.
embedding_dim = 16

# A Keras Sequential model is a plain stack of layers, each feeding the next.
model = tf.keras.Sequential()

# Embedding: maps each word index to a trainable embedding_dim-sized vector.
#   input_dim    - size of the vocabulary
#   output_dim   - number of dimensions used to represent each word
#   input_length - length of each (padded) input sequence
# Output shape: (batch_size, input_length, output_dim)
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# Averages the word vectors over the sequence axis, giving one vector per
# headline. Output shape: (batch_size, features)
model.add(tf.keras.layers.GlobalAveragePooling1D())

# Hidden layer with 24 output units. An activation function defines how the
# weighted sum of a node's inputs is turned into its output; ReLU (rectified
# linear unit) passes positive inputs through unchanged and outputs zero
# otherwise.
model.add(tf.keras.layers.Dense(24, activation='relu'))

# Single output unit. Sigmoid maps any real value into the range 0 to 1: the
# larger the input, the closer the output is to 1, and vice versa.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# binary_crossentropy - loss function for two-class (0/1) problems
# adam - optimizer that updates the network weights to reduce the loss
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


2023-01-15 08:43:29.381126: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# Make sure every input and label collection handed to model.fit is a NumPy
# array (the labels are still plain Python lists at this point).
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = np.array(train_lab), np.array(test_lab)

In [15]:
# Train the model.
# One epoch is one full pass over all of the training data.
num_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    # The held-out 25% is evaluated after every epoch.
    validation_data=(testing_padded, testing_labels),
    # verbose=2 prints one summary line per epoch.
    verbose=2,
)

Epoch 1/30
626/626 - 1s - loss: 0.6745 - accuracy: 0.5648 - val_loss: 0.6211 - val_accuracy: 0.6322 - 1s/epoch - 2ms/step
Epoch 2/30
626/626 - 1s - loss: 0.4655 - accuracy: 0.8157 - val_loss: 0.3992 - val_accuracy: 0.8341 - 675ms/epoch - 1ms/step
Epoch 3/30
626/626 - 1s - loss: 0.3277 - accuracy: 0.8696 - val_loss: 0.3596 - val_accuracy: 0.8501 - 681ms/epoch - 1ms/step
Epoch 4/30
626/626 - 1s - loss: 0.2740 - accuracy: 0.8933 - val_loss: 0.3455 - val_accuracy: 0.8549 - 678ms/epoch - 1ms/step
Epoch 5/30
626/626 - 1s - loss: 0.2378 - accuracy: 0.9052 - val_loss: 0.3420 - val_accuracy: 0.8541 - 671ms/epoch - 1ms/step
Epoch 6/30
626/626 - 1s - loss: 0.2118 - accuracy: 0.9171 - val_loss: 0.3478 - val_accuracy: 0.8582 - 678ms/epoch - 1ms/step
Epoch 7/30
626/626 - 1s - loss: 0.1873 - accuracy: 0.9292 - val_loss: 0.3568 - val_accuracy: 0.8570 - 691ms/epoch - 1ms/step
Epoch 8/30
626/626 - 1s - loss: 0.1700 - accuracy: 0.9355 - val_loss: 0.3634 - val_accuracy: 0.8550 - 670ms/epoch - 1ms/step
Epo

In [16]:
# Score two unseen headlines with the trained model.
# The inputs are kept in their own variable rather than reusing `sentence`,
# which holds the full headline corpus; rebinding it here would make
# re-running the train/test split cell operate on just these two strings.
sample_headlines = [
    "Coworkers At Bathroom sink locked in Tense Standoff Over Who is Going to wash hands longer",
    "Spiking U.S. coronavirus cases could force rationing decisions similar to those made in Italy, China.",
]
# Apply the same preprocessing as for training: the fitted tokenizer, then
# padding/truncation to max_length.
sequences = tokenizer.texts_to_sequences(sample_headlines)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# Each output is the sigmoid score in the range 0 to 1; values near 1 mean
# the headline is predicted to be sarcastic.
print(model.predict(padded))

[[0.9996512 ]
 [0.01750244]]
