# Sarcasm Detection

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the headline dataset; `lines=True` reads the file as one JSON object per line.
df = pd.read_json(path_or_buf='sarcasm.json', lines=True)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [5]:
# Pull the text and target columns out by name rather than by position, so the
# cell keeps working if the column order in the file changes, and do it in one
# vectorised step instead of a per-row `iloc` lookup.
# `.tolist()` yields plain Python lists (str headlines, int labels).
sentences = df['headline'].tolist()
labels = df['is_sarcastic'].tolist()

In [6]:
# Spot-check the first five headlines.
sentences[:5]

['thirtysomething scientists unveil doomsday clock of hair loss',
 'dem rep. totally nails why congress is falling short on gender, racial equality',
 'eat your veggies: 9 deliciously different recipes',
 'inclement weather prevents liar from getting to work',
 "mother comes pretty close to using word 'streaming' correctly"]

In [7]:
# Spot-check the labels that go with the first five headlines.
labels[:5]

[1, 0, 0, 1, 1]

In [28]:
# List the distinct label values present in the data.
np.unique(labels)

array([0, 1])

In [8]:
# Number of examples in an 80% training share (fraction truncated to an int).
train_size = int(0.8 * len(sentences))
print(train_size)

22895


In [9]:
# Sequential (unshuffled) split: the first `train_size` items are used for
# training and everything after them is held out.
train_sentences, test_sentences = sentences[:train_size], sentences[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

In [10]:
# Fit the vocabulary on the training headlines only, so nothing from the
# held-out split influences it. '<UKW>' is the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='<UKW>', num_words=10000)
tokenizer.fit_on_texts(texts=train_sentences)

In [11]:
# Keep a handle on the tokenizer's word -> integer index mapping.
word_index = tokenizer.word_index

In [13]:
#print(word_index)

In [14]:
# Encode the training headlines as integer sequences, then pad/truncate each
# one at the end ('post') to exactly 50 tokens.
# `pad_sequences` already returns a NumPy array, so wrapping it in `np.array`
# would only make a redundant copy.
train_seq = pad_sequences(
    tokenizer.texts_to_sequences(train_sentences),
    maxlen=50,
    padding='post',
    truncating='post',
)

In [15]:
# Encode the held-out headlines with the tokenizer fitted on the training
# text, then pad/truncate each one at the end ('post') to exactly 50 tokens.
# `pad_sequences` already returns a NumPy array, so wrapping it in `np.array`
# would only make a redundant copy.
test_seq = pad_sequences(
    tokenizer.texts_to_sequences(test_sentences),
    maxlen=50,
    padding='post',
    truncating='post',
)

In [20]:
#word_index

In [21]:
# Convert both label collections to NumPy arrays before handing them to Keras.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, GlobalAveragePooling1D

In [24]:
# Classifier: embed each of the 50 token ids into 16 dimensions, average the
# embeddings over the sequence, then pass the result through two hidden dense
# layers (with dropout after the first) to a single sigmoid output.
model = Sequential([
    Embedding(
        input_dim=10000,   # vocabulary size
        output_dim=16,     # embedding width
        input_length=50,   # tokens per padded headline
    ),
    GlobalAveragePooling1D(),
    Dense(units=128, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

In [25]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 16)            160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 128)               2176      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                        

In [26]:
# Binary-classification training setup: Adam with learning rate 0.003,
# binary cross-entropy loss, and accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.003),
)

In [27]:
# Train for 10 epochs; the held-out sequences/labels are passed as validation
# data so they are evaluated alongside training.
model.fit(
    x=train_seq,
    y=train_labels,
    epochs=10,
    validation_data=(test_seq, test_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff31027d2d0>