## Sarcasm_Headlines_testV2_1D_CNN


## 1.set up

In [1]:
import keras.backend as K
from keras.models import Model
from keras.preprocessing import sequence
from keras.layers import Input, concatenate
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras.models import Sequential
#from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

Using TensorFlow backend.


## 2. view the dataset

In [2]:
# read the dataset and keep only required columns
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json",lines=True)
df = df[["headline","is_sarcastic"]]
df = df.drop(index = 7302)
df.head()

Unnamed: 0,headline,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1


In [12]:
import plotly as py
from plotly import graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# Make pie chart to compare the numbers of sarcastic and not-sarcastic headlines
labels = ['Sarcastic', 'Not Sarcastic']
count_sarcastic = len(df[df['is_sarcastic']==1])
count_notsar = len(df[df['is_sarcastic']==0])
values = [count_sarcastic, count_notsar]

trace = go.Pie(labels=labels,
               values=values,
               textfont=dict(size=19, color='#FFFFFF'),
               marker=dict(
                   colors=['#FF6347', '#40E0D0'] 
               )
              )

layout = go.Layout(title = '<b>Sarcastic vs Not Sarcastic</b>')
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)


print(count_sarcastic)
print(count_notsar)
count_all = count_sarcastic + count_notsar
print(count_all)

13633
14985
28618


In [14]:
from sklearn.utils import shuffle
df1 = df[df['is_sarcastic'] == 1]
df2 = df[df['is_sarcastic'] == 0]
df2 = df2.reset_index(drop=True)
df2 = df2[:13633]
df3 = df2.append(df1)
df3 = shuffle(df3)
df3.reset_index(drop=True, inplace=True)
df3.tail()

Unnamed: 0,headline,is_sarcastic
27261,rolex unveils new diving cuckoo clock capable ...,1
27262,delta airlines counter agent assures man he wi...,1
27263,beer aisle scanned for something asshole frien...,1
27264,police department deploys fancyclothes cop,1
27265,gumption rewarded with even more work,1


In [15]:
# Make pie chart to compare the numbers of sarcastic and not-sarcastic headlines
labels = ['Sarcastic', 'Not Sarcastic']
count_sarcastic = len(df3[df3['is_sarcastic']==1])
count_notsar = len(df3[df3['is_sarcastic']==0])
values = [count_sarcastic, count_notsar]

trace = go.Pie(labels=labels,
               values=values,
               textfont=dict(size=19, color='#FFFFFF'),
               marker=dict(
                   colors=['#FF6347', '#40E0D0'] 
               )
              )

layout = go.Layout(title = '<b>Sarcastic vs Not Sarcastic</b>')
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## 3. Text Preprocessing

### 3.1 Change the abbreviation

In [16]:
# clean text
replace_list = {r"i'm": 'i am',
                r"'re": ' are',
                r"let’s": 'let us',
                r"'s":  ' is',
                r"'ve": ' have',
                r"can't": 'can not',
                r"cannot": 'can not',
                r"shan’t": 'shall not',
                r"n't": ' not',
                r"'d": ' would',
                r"'ll": ' will',
                r"'scuse": 'excuse'}

In [17]:
def replace_text(text):
    text = text.lower()
    for s in replace_list:
        text = text.replace(s, replace_list[s])
    text = ' '.join(text.split())
    return text

df3['headline'] = df3['headline'].apply(lambda p: replace_text(p))

### 3.2 remove punctuations and digits

In [18]:
import string
from string import digits, punctuation
from pandas.core.frame import DataFrame

hl_cleaned = []
for hl in df3['headline']:
# Remove punctuations
    clean = hl.translate(str.maketrans('', '', punctuation))
# Remove digits/numbers
    clean = clean.translate(str.maketrans('', '', digits))
    hl_cleaned.append(clean)
    
# View comparison
print('Original texts :')
print(df3['headline'][13])
print('\nAfter cleaned :')
print(hl_cleaned[13])


Original texts :
'get tivo' friend is solution to everything

After cleaned :
get tivo friend is solution to everything


In [19]:
df3.he = DataFrame(hl_cleaned)
df3.he
df3['headline']= df3.he
df3['headline'][6]


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



'the predictable blowback from supporting sectarian authoritarianism in bahrain'

In [20]:
phrase_len = df3['headline'].apply(lambda p: len(p.split(' ')))
max_phrase_len = phrase_len.max()
max_phrase_len

39

In [21]:
X = df3.headline
Y = df3['is_sarcastic'].values
Y = np.vstack(Y)


### 3.3 Word embedding

In [22]:
top_words = 5000
tokenizer = Tokenizer(
    num_words = top_words,
    filters = '"#$%&()*+-/:;<=>@[\]^_`{|}~'
)

tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)


### 3.4 Pad sequences

In [23]:
X = pad_sequences(X, maxlen = max_phrase_len)

X.shape

(27266, 39)

## 4. Building the Model - single window

In [56]:
max_words = 39
batch_size = 512 
embedding_dims = 100 
filters = 250
kernel_size = 3
epochs = 5

### 4.1 Model shape

In [57]:
 # Build model
headlines = Input(batch_shape=(None, max_words), dtype='int32', name='headlines')

embedding_layer = Embedding(top_words, embedding_dims, input_length=max_words)
sent_embed = embedding_layer(headlines)

drop_layer = Dropout(0.5)
sent_drop = drop_layer(sent_embed)

conv_layer = Conv1D(filters, kernel_size, padding='valid', activation='relu')
sent_conv = conv_layer(sent_drop)

sent_pooling = GlobalMaxPooling1D()(sent_conv)
sent_repre = Dense(250)(sent_pooling)
sent_repre = Activation('relu')(sent_repre)

sent_repre = Dense(1)(sent_repre)
pred = Activation('sigmoid')(sent_repre)

model = Model(inputs=headlines, outputs=pred)

model.summary()


Model: "model_66"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
headlines (InputLayer)       (None, 39)                0         
_________________________________________________________________
embedding_67 (Embedding)     (None, 39, 100)           500000    
_________________________________________________________________
dropout_67 (Dropout)         (None, 39, 100)           0         
_________________________________________________________________
conv1d_181 (Conv1D)          (None, 37, 250)           75250     
_________________________________________________________________
global_max_pooling1d_181 (Gl (None, 250)               0         
_________________________________________________________________
dense_106 (Dense)            (None, 250)               62750     
_________________________________________________________________
activation_105 (Activation)  (None, 250)               0  

### 4.2 Train, test and evaluate

In [58]:
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
cvscores = []
for train, test in kfold.split(X, Y):
    
    #Build model
    embedding_layer = Embedding(top_words, embedding_dims, input_length=max_words)
    sent_embed = embedding_layer(headlines)

    drop_layer = Dropout(0.5)
    sent_drop = drop_layer(sent_embed)

    conv_layer = Conv1D(filters, kernel_size, padding='valid', activation='relu')
    sent_conv = conv_layer(sent_drop)

    sent_pooling = GlobalMaxPooling1D()(sent_conv)
    sent_repre = Dense(250)(sent_pooling)
    sent_repre = Activation('relu')(sent_repre)

    sent_repre = Dense(1)(sent_repre)
    pred = Activation('sigmoid')(sent_repre)

    model = Model(inputs=headlines, outputs=pred)
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Fit the model
    model.fit(X[train], Y[train], epochs=5, batch_size=512, verbose=1)
    
    # evaluate the model
    scores = model.evaluate(X[test], Y[test], verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))


Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.75%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 82.95%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.09%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.92%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.40%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.23%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.00%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 83.79%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.29%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.81%
84.52% (+/- 0.72%)


## 5. Building the Model - multi windows

In [51]:
# set parameters:
max_words = 39
batch_size = 512 
embedding_dims = 100 
filters = 250
kernel_size = 3
epochs = 5
kernel_size_list = [2, 3, 4, 5]


In [54]:
# Build model
headlines = Input(batch_shape=(None, max_words), dtype='int32', name='headlines')

embedding_layer = Embedding(top_words, embedding_dims, input_length=max_words)
sent_embed = embedding_layer(headlines)

drop_layer = Dropout(0.5)
sent_drop = drop_layer(sent_embed)

cnn_result = []
for kernel_size in kernel_size_list:
    conv_layer = Conv1D(filters, kernel_size, padding='valid', activation='relu')
    sent_conv = conv_layer(sent_drop)
    
    sent_pooling = GlobalMaxPooling1D()(sent_conv)
    cnn_result.append(sent_pooling)
cnn_result = concatenate(cnn_result)
sent_repre = Dense(250)(cnn_result)
sent_repre = Activation('relu')(sent_repre)

sent_repre = Dense(1)(sent_repre)
pred = Activation('sigmoid')(sent_repre)

model = Model(inputs=headlines, outputs=pred)

model.summary()

Model: "model_55"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
headlines (InputLayer)          (None, 39)           0                                            
__________________________________________________________________________________________________
embedding_56 (Embedding)        (None, 39, 100)      500000      headlines[0][0]                  
__________________________________________________________________________________________________
dropout_56 (Dropout)            (None, 39, 100)      0           embedding_56[0][0]               
__________________________________________________________________________________________________
conv1d_137 (Conv1D)             (None, 38, 250)      50250       dropout_56[0][0]                 
___________________________________________________________________________________________

In [55]:
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
cvscores = []
for train, test in kfold.split(X, Y):
    
    #Build model
    headlines = Input(batch_shape=(None, max_words), dtype='int32', name='headlines')

    embedding_layer = Embedding(top_words, embedding_dims, input_length=max_words)
    sent_embed = embedding_layer(headlines)

    drop_layer = Dropout(0.5)
    sent_drop = drop_layer(sent_embed)

    cnn_result = []
    for kernel_size in kernel_size_list:
        
        conv_layer = Conv1D(filters, kernel_size, padding='valid', activation='relu')
        sent_conv = conv_layer(sent_drop)
    
        sent_pooling = GlobalMaxPooling1D()(sent_conv)
        cnn_result.append(sent_pooling)
    cnn_result = concatenate(cnn_result)
    sent_repre = Dense(250)(cnn_result)
    sent_repre = Activation('relu')(sent_repre)
    
    sent_repre = Dense(1)(sent_repre)
    pred = Activation('sigmoid')(sent_repre)

    model = Model(inputs=headlines, outputs=pred)
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Fit the model
    model.fit(X[train], Y[train], epochs=5, batch_size=512, verbose=1)
    
    # evaluate the model
    scores = model.evaluate(X[test], Y[test], verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))


Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.57%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 83.72%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.75%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.78%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.22%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.11%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.62%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 83.31%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.91%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.11%
84.81% (+/- 0.76%)
