In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Used code from Sarcasm with RNN, Random Forest and SVM Python notebook using data from News Headlines Dataset For Sarcasm Detection

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

df1 = pd.read_json("../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json", lines = True)
df2 = pd.read_json("../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json", lines = True)


/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json
/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json


In [14]:
import regex as re
sar_acc = pd.read_json('/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json',lines=True)
#adds a column to dataset called 'source' and it uses the third "word" in article_link to identify source
sar_acc['source'] = sar_acc['article_link'].apply(lambda x: re.findall(r'\w+', x)[2])
sar_acc.head()

Unnamed: 0,is_sarcastic,headline,article_link,source
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,theonion
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,huffingtonpost
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,huffingtonpost
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,theonion
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,theonion


In [15]:
#Getting X and Y ready
from sklearn.preprocessing import LabelEncoder
X = sar_acc.headline
Y = sar_acc.is_sarcastic
le = LabelEncoder()
#normalizes Y values and then reshapes Y to have enough rows where each value gets its own row
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [16]:
from sklearn.model_selection import train_test_split
#splits data with 20% for testing
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)

In [17]:
# Tokenize the data and convert the text to sequences.
# Add padding to ensure that all the sequences have the same shape.
# There are many ways of taking the max_len and here an arbitrary length of 150 is chosen


from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [18]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Model

def RNN():
    inputs = Input(name='inputs',shape=[max_len])
#     adding words to the layer of NN
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
#     adding LTSM to layer -- long short term memory type of RNN but covers longer distance
    layer = LSTM(64)(layer)
#     adding dense layer -- fully connected will just continue
    layer = Dense(256,name='FC1')(layer)
#     adding the relu activation function
    layer = Activation('relu')(layer)
#     adding dropout pf 20% to prevent overfitting
    layer = Dropout(0.2)(layer)
#     adding the dense layer
    layer = Dense(1,name='out_layer')(layer)
#     adding sigmoid activation function
    layer = Activation('sigmoid')(layer)
#     initializing the model of RNN based on inputs and layers
    model = Model(inputs=inputs,outputs=layer)
    return model

In [19]:
# calling the model of RNN
model = RNN()
# generating the summary of the model formed
model.summary()
# compiling the model and assigning loss function and optimizer
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 150)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257 

In [20]:
from keras.callbacks import EarlyStopping
# fitting the model
model.fit(sequences_matrix,Y_train,batch_size=100,epochs=5,
          validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5


<tensorflow.python.keras.callbacks.History at 0x7efdf8425710>

In [21]:
# creating test sequence from text
test_sequences = tok.texts_to_sequences(X_test)
# creating test sequence matrix using above created test sequence
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [22]:
# initializing accuracy matrix to store accuracies of all the models and compare them
accuracy = {}

In [23]:
# evaluation of RNN model
accr = model.evaluate(test_sequences_matrix,Y_test)
# printing the loss and accuracy of our model
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
# storing the model name and accuracy in accuracy dictionary
accuracy.update({"RNN":accr[1]})

Test set
  Loss: 0.420
  Accuracy: 0.810
