In [1]:
import numpy as np 
import pandas as pd
import json

In [2]:
import tqdm 
# for some fancy loading bars

In [3]:
# Path to the dataset file (read line by line in the cells below).
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is kept because
# the following cells open the file through this name.
dir = "../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json"

# Reading the json file

This JSON file stores a collection of dictionary-like objects, one per line, each as a string:

In [4]:
# Peek at the first raw line of the file.
# The encoding is passed explicitly: without it open() falls back to a
# platform-dependent default, which can mis-decode non-ASCII text.
with open(dir, encoding="utf-8") as f:
    for line in f:
        print(line)
        break

{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5", "headline": "former versace store clerk sues over secret 'black code' for minority shoppers", "is_sarcastic": 0}



By using the `json` library, we can convert this string into a Python dictionary:

In [5]:
# Parse the first line into a dictionary and show its key/value pairs.
# The encoding is passed explicitly so decoding does not depend on the platform default.
with open(dir, encoding="utf-8") as f:
    for line in f:
        json_loaded = json.loads(line)

        for key, value in json_loaded.items():
            print(key, " | ", value)

        # only the first record is needed for this demonstration
        break
        

article_link  |  https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5
headline  |  former versace store clerk sues over secret 'black code' for minority shoppers
is_sarcastic  |  0


Now we can store the relevant information, namely `headline` and `is_sarcastic`, as separate entities

In [6]:
headlines = []
labels = []

# Each non-empty line of the file is one JSON object; collect its headline and label.
# The encoding is passed explicitly so decoding does not depend on the platform default.
with open(dir, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # json.loads() raises on an empty string, so skip blank lines
            # (for example a trailing newline at the end of the file)
            continue
        json_loaded = json.loads(line)
        headlines.append(json_loaded['headline'])
        labels.append(json_loaded['is_sarcastic'])

print(f"Number of sentences/headlines: {len(headlines)}")
print("A random entry and its label:\n")
print(headlines[42])
print(labels[42])
# is_sarcastic = 1 for a sarcastic comment and 0 for a non sarcastic one


Number of sentences/headlines: 26709
A random entry and its label:

rescuers heroically help beached garbage back into ocean
1


# Clean the acquired data
Now it's time to clean the sentences. 
This usually consists of 4 parts:
1. Remove unnecessary details like links and punctuation 
2. Convert each word to its base form
3. Convert the words to numerical tokens 
4. Pad these tokenized sequences to a fixed length 

In [7]:
import re
import nltk

### 1. Removing the links from the text
By using some Regex we can filter out the website link from the text with just one line 

In [8]:
# Example: strip the link out of a sentence with a single substitution
s = "My github page is https://github.com/The-Bread please follow me "
url_pattern = re.compile(
    r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*',
    flags=re.MULTILINE,
)
text = url_pattern.sub('', s)
text

'My github page is  please follow me '

### 2. Replacing words with their base forms
`SnowballStemmer` and `word_tokenize` from nltk help to reduce similar words (for example, different tenses of the same verb) to the same form

In [9]:
# example: lowercase the text, split it into tokens, then stem every token
s = "Buy buys . Likes. seems . Doesn't, does, easy,".lower()

stemmer = nltk.stem.SnowballStemmer('english')
token = nltk.word_tokenize(s)
sentence = list(map(stemmer.stem, token))

print(" ".join(sentence))

buy buy . like . seem . doe n't , doe , easi ,


Combine both the steps to a single function

In [10]:
# Built once when this cell runs, so preprocess() does not have to recreate the
# stemmer for every word or look the pattern up again for every sentence.
# NOTE(review): the pattern permits whitespace inside a link, so words that follow a
# URL can be consumed too (e.g. "https://a.com and b.c" is removed entirely);
# confirm this is intended.
_URL_PATTERN = re.compile(
    r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*',
    flags=re.MULTILINE,
)
_STEMMER = nltk.stem.SnowballStemmer('english')


def preprocess(sentence):
    """Remove links from ``sentence``, lowercase it and stem every token.

    Args:
        sentence: the raw text (a single headline) as a string.

    Returns:
        The stemmed tokens joined back into one string, separated by single spaces.
    """
    sentence = _URL_PATTERN.sub('', sentence)
    tokens = nltk.word_tokenize(sentence.lower())
    return " ".join(_STEMMER.stem(word) for word in tokens)

In [11]:
# Show one headline before and after preprocessing
example = headlines[52]
print(f"Original:\n{example}")
print(f"\nPreprocessed: \n{preprocess(example)}")

Original:
longtime teacher retires without changing a single student's life

Preprocessed: 
longtim teacher retir without chang a singl student 's life


### Tokenize the sentence

The next step is to convert the sentences into sequences of numbers, one number for each word.
The numbers are given according to the frequency of that particular word 

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Toy corpus: fit the tokenizer on it, then encode the same sentences
sample_sentences = ["Hello", "How are you today", "Where are you"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sample_sentences)
print(f"Actual Sentence: {sample_sentences[2]}\n\n")

# S holds the integer sequences from here on (it is padded in the next cell)
S = tokenizer.texts_to_sequences(sample_sentences)
print(f"Tokenized Sequence: {S[2]}\n\n")

# word -> index mapping learned by the tokenizer
print(tokenizer.word_index)


Actual Sentence: Where are you


Tokenized Sequence: [6, 1, 2]


{'are': 1, 'you': 2, 'hello': 3, 'how': 4, 'today': 5, 'where': 6}


### Pad these sequences. 
###### We can see that sentences can have varying length which might be problematic when passing through a neural network which requires a fixed input size.
###### Hence we pad the shorter sentences with 0 (since that index is not assigned to any word in the tokenizer index) and truncate the longer ones. Both padding and truncating can be done either from the left side (`pre`) or the right side (`post`).

In [14]:
# Force every sequence in S to length 3: shorter ones get zeros appended
# (padding='post'), longer ones lose their trailing tokens (truncating='post').
padded = pad_sequences(S, maxlen=3, padding='post', truncating='post')
print(f"Padded sequences:\n{padded}")

Padded sequences:
[[3 0 0]
 [4 1 2]
 [6 1 2]]


In [15]:
def tokenize_data(X, maxlen=15):
    """Fit a Keras ``Tokenizer`` on ``X`` and encode ``X`` as fixed-length sequences.

    Args:
        X: list of text strings to fit the tokenizer on and to encode.
        maxlen: length every sequence is padded or truncated to (both applied
            at the end of the sequence). Defaults to 15.

    Returns:
        A tuple ``(padded, tokenizer)``: the array of token ids with one row of
        ``maxlen`` entries per text, and the fitted tokenizer.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    sequences = tokenizer.texts_to_sequences(X)
    # pad_sequences already returns a NumPy array, so no further conversion is needed
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    return padded, tokenizer

#### Now to apply these methods to the headlines we got from the dataset

In [16]:
# Stem every headline, with a progress bar
stemmed = list(map(preprocess, tqdm.tqdm(headlines, desc='Stemming the Headlines')))
print("Stemming complete")

# Turn the stemmed headlines into padded token-id sequences
print("\nTokenizing the headlines")
X, tokenizer = tokenize_data(stemmed)
print("Headlines Tokenized")
print(f"\nInput Shape: {X.shape}")

Y = np.array(labels)
print(f"Ouput Shape: {Y.shape}")

# Add a trailing axis so the labels are 2D like the inputs
Y = Y.reshape(-1, 1)
print(f"Final Output Shape: {Y.shape}")

Stemming the Headlines: 100%|██████████| 26709/26709 [00:13<00:00, 2012.77it/s]


Stemming complete

Tokenizing the headlines
Headlines Tokenized

Input Shape: (26709, 15)
Ouput Shape: (26709,)
Final Output Shape: (26709, 1)


In [17]:
# Vocabulary size: number of distinct words the tokenizer has indexed
num_words = len(tokenizer.word_index)
print("Number of words tokenized: ", num_words)

Number of words tokenized:  17851


# Making the model

Our goal is to make a model which takes a sentence as input and outputs whether it's sarcastic or not.
##### For text-based models, a special type of network called an RNN (Recurrent Neural Network) is used. 
More info about RNNs can be found [here](https://towardsdatascience.com/understanding-rnn-and-lstm-f7cdf6dfc14e). Alternatively, you can enroll in [this course](https://www.coursera.org/learn/natural-language-processing-tensorflow/) by DeepLearning.ai on coursera (Recommended).
##### We will be using the LSTM variant of RNNs together with a Conv1D layer to make a simple classifier

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, GlobalAveragePooling1D, Conv1D, Embedding

### The Embedding layer:
The Embedding layer is responsible for making sense of the numbers assigned to the words. This layer maps each token to an N-dimensional vector. Vectors of similar words lie close together along some dimensions, but may still be far apart along others. 

For example: words like cat, dog and pug might be located close together, but along other dimensions cat might be farther from dog than pug is (since a pug is a type of dog).

If you want to build a highly accurate model, you would use pretrained vectors for these words. One example is GloVe, which offers vectors for a lot of commonly used words and provides them in different embedding dimensions (50d, 100d, 200d, etc.). 

In [19]:
# Stack the layers one by one; the trailing comments give each layer's output
# shape without the batch axis.
model = Sequential()
# num_words+1 rows: word indices start at 1 and index 0 is the padding value
model.add(Embedding(num_words+1, output_dim=10, input_length=15))  # (15, 10)
model.add(LSTM(15, dropout=0.1, return_sequences=True))            # (15, 15)
model.add(Conv1D(15, 10, activation='relu'))                       # (6, 15)
model.add(GlobalAveragePooling1D())                                # (15,)

model.add(Dense(32, activation='relu'))                            # (32,)
model.add(Dense(1, activation='sigmoid'))                          # (1,) probability-like score

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 10)            178520    
_________________________________________________________________
lstm (LSTM)                  (None, 15, 15)            1560      
_________________________________________________________________
conv1d (Conv1D)              (None, 6, 15)             2265      
_________________________________________________________________
global_average_pooling1d (Gl (None, 15)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                512       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 182,890
Trainable params: 182,890
Non-trainable params: 0
__________________________________________________

In [20]:
# Train for 3 epochs on all of X and Y; no validation data is passed here,
# so nothing is held out for evaluation.
model.fit(X, Y, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f776d47bed0>

# What next?
### We overlooked the dataset exploration. Check for dataset imbalance and try to fix it using oversampling or undersampling 
### Divide the data into training and test data or perform cross validation

### Test out different models and compare their validation accuracy and loss 