## Sarcasm_Headlines_testV2_RNN - LSTM


## 1.set up

In [1]:
#set up
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM

os.environ['KMP_DUPLICATE_LIB_OK']='True'

Using TensorFlow backend.


## 2. view the dataset

In [2]:
# read the dataset and keep only required columns
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json",lines=True)
df = df[["headline","is_sarcastic"]]
df = df.drop(index = 7302)
df.head()

Unnamed: 0,headline,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1


In [5]:
pip install plotly

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/15/90/918bccb0ca60dc6d126d921e2c67126d75949f5da777e6b18c51fb12603d/plotly-4.6.0-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.2MB 14kB/s eta 0:00:018
[?25hCollecting retrying>=1.3.3 (from plotly)
  Downloading https://files.pythonhosted.org/packages/44/ef/beae4b4ef80902f22e3af073397f079c96969c69b2c7d52a57ea9ae61c9d/retrying-1.3.3.tar.gz
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-cp37-none-any.whl size=11429 sha256=8a8c499069dd95d6145d152317bbfd361387ed73875af376fd4a63e99e1ddbd2
  Stored in directory: /Users/qiaoyue/Library/Caches/pip/wheels/d7/a9/33/acc7b709e2a35caa7d4cae442f6fe6fbf2c43f80823d46460c
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.6.0 retrying-1.3.3
Note: you may need to restart the ker

In [4]:
import plotly as py
from plotly import graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# Make pie chart to compare the numbers of sarcastic and not-sarcastic headlines
labels = ['Sarcastic', 'Not Sarcastic']
count_sarcastic = len(df[df['is_sarcastic']==1])
count_notsar = len(df[df['is_sarcastic']==0])
values = [count_sarcastic, count_notsar]

trace = go.Pie(labels=labels,
               values=values,
               textfont=dict(size=19, color='#FFFFFF'),
               marker=dict(
                   colors=['#FF6347', '#40E0D0'] 
               )
              )

layout = go.Layout(title = '<b>Sarcastic vs Not Sarcastic</b>')
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)


print(count_sarcastic)
print(count_notsar)
count_all = count_sarcastic + count_notsar
print(count_all)

13633
14985
28618


In [5]:
from sklearn.utils import shuffle
df1 = df[df['is_sarcastic'] == 1]
df2 = df[df['is_sarcastic'] == 0]
df2 = df2.reset_index(drop=True)
df2 = df2[:13633]
df3 = df2.append(df1)
df3 = shuffle(df3)
df3.reset_index(drop=True, inplace=True)
df3.tail()

Unnamed: 0,headline,is_sarcastic
27261,harried woman on train quickly doing plastic s...,1
27262,lester holt begins debate by reminding audienc...,1
27263,selena gomez hits the beach in a bikini,0
27264,group of good-looking people all headed toward...,1
27265,trumpcare scored so badly it could actually he...,0


In [6]:
# Make pie chart to compare the numbers of sarcastic and not-sarcastic headlines
labels = ['Sarcastic', 'Not Sarcastic']
count_sarcastic = len(df3[df3['is_sarcastic']==1])
count_notsar = len(df3[df3['is_sarcastic']==0])
values = [count_sarcastic, count_notsar]

trace = go.Pie(labels=labels,
               values=values,
               textfont=dict(size=19, color='#FFFFFF'),
               marker=dict(
                   colors=['#FF6347', '#40E0D0'] 
               )
              )

layout = go.Layout(title = '<b>Sarcastic vs Not Sarcastic</b>')
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## 3. Text Preprocessing

### 3.1 Change the abbreviation

In [7]:
# clean text
replace_list = {r"i'm": 'i am',
                r"'re": ' are',
                r"let’s": 'let us',
                r"'s":  ' is',
                r"'ve": ' have',
                r"can't": 'can not',
                r"cannot": 'can not',
                r"shan’t": 'shall not',
                r"n't": ' not',
                r"'d": ' would',
                r"'ll": ' will',
                r"'scuse": 'excuse'}

In [8]:
def replace_text(text):
    text = text.lower()
    for s in replace_list:
        text = text.replace(s, replace_list[s])
    text = ' '.join(text.split())
    return text

df3['headline'] = df3['headline'].apply(lambda p: replace_text(p))
df3['headline'].tail()

27261    harried woman on train quickly doing plastic s...
27262    lester holt begins debate by reminding audienc...
27263              selena gomez hits the beach in a bikini
27264    group of good-looking people all headed toward...
27265    trumpcare scored so badly it could actually he...
Name: headline, dtype: object

### 3.2 remove punctuations and digits

In [9]:
import string
from string import digits, punctuation
from pandas.core.frame import DataFrame

hl_cleaned = []
for hl in df3['headline']:
# Remove punctuations
    clean = hl.translate(str.maketrans('', '', punctuation))
# Remove digits/numbers
    clean = clean.translate(str.maketrans('', '', digits))
    hl_cleaned.append(clean)
    
# View comparison
print('Original texts :')
print(df3['headline'][5])
print('\nAfter cleaned :')
print(hl_cleaned[5])

Original texts :
are we meeting the needs of our nation is rich?

After cleaned :
are we meeting the needs of our nation is rich


In [10]:
df3.he = DataFrame(hl_cleaned)
df3.he
df3['headline']= df3.he
df3['headline'][6]
df3.head()


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,headline,is_sarcastic
0,inside america rates the skin colors,1
1,the case for collective impact strategies on t...,0
2,kavanaugh surprised senate not questioning fac...,1
3,mistakes to avoid during your wedding night,0
4,sea claims flipflop,1


In [11]:
phrase_len = df3['headline'].apply(lambda p: len(p.split(' ')))
max_phrase_len = phrase_len.max()
max_phrase_len

39

In [12]:
X = df3.headline
Y = df3['is_sarcastic'].values
Y = np.vstack(Y)

#from sklearn.model_selection import train_test_split
#X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3)

#df_train = pd.DataFrame(X_train)
#df_test = pd.DataFrame(X_test)

### 3.3 Word embedding

In [13]:
max_words = 5000
tokenizer = Tokenizer(
    num_words = max_words,
    filters = '"#$%&()*+-/:;<=>@[\]^_`{|}~'
)

tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

#X_test = tokenizer.texts_to_sequences(X_test)

### 3.4 Pad sequences

In [14]:
X = pad_sequences(X, maxlen = max_phrase_len)
#X_test = pad_sequences(X_test, maxlen = max_phrase_len)

X.shape

(27266, 39)

## 4. Building the Model

In [17]:
model = Sequential()

model.add(Embedding(input_dim = max_words, output_dim = 100, input_length = max_phrase_len))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))

model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 39, 100)           500000    
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 101       
Total params: 580,501
Trainable params: 580,501
Non-trainable params: 0
_________________________________________________________________


In [18]:
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
cvscores = []
for train, test in kfold.split(X, Y):
  # create model
    model = Sequential()
    model.add(Embedding(input_dim = max_words, output_dim = 100, input_length = max_phrase_len))
    model.add(LSTM(100))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation = 'sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Fit the model
    model.fit(X[train], Y[train], epochs=5, batch_size=512, verbose=1)
    
    # evaluate the model
    scores = model.evaluate(X[test], Y[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
    
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))


Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.12%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.93%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 83.21%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.88%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.74%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.80%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 83.75%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 84.74%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.11%



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy: 85.51%
84.88% (+/- 0.80%)


In [19]:
model.save_weights('rnn.h5')

model = Sequential()
model.add(Embedding(input_dim = max_words, output_dim = 100))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))

# Load the model from disk later using:
model.load_weights('rnn.h5')
