In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Load the clickbait headline dataset from the working directory and show its
# (rows, columns) shape as the cell output.
data = pd.read_csv('clickbait_data.csv')
data.shape

(32000, 2)

In [5]:
# Display the loaded frame (pandas truncates the rendered rows).
data

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


In [6]:
def lemmatizer(text):
  """Stem and then lemmatize every space-separated word of ``text``.

  Despite the name, each word is first reduced with NLTK's ``PorterStemmer``;
  the stemmed text is then passed word-by-word through ``WordNetLemmatizer``.
  Words are delimited by single spaces only (``split(' ')``).

  Args:
    text: The string to normalise.

  Returns:
    The processed words joined back together with single spaces.
  """
  # Build each NLTK helper once per call instead of once per word.
  stemmer = PorterStemmer()
  wordnet = WordNetLemmatizer()
  stemmed = ' '.join(stemmer.stem(word) for word in text.split(' '))
  return ' '.join(wordnet.lemmatize(word) for word in stemmed.split(' '))

def tokenizer(text):
  """Return the tokens NLTK's ``WordPunctTokenizer`` produces for ``text``."""
  return WordPunctTokenizer().tokenize(text)

In [7]:
def clean(text):
    """Normalise a headline and return it as a list of tokens.

    The text is lower-cased, every character other than ASCII letters,
    digits and whitespace is removed, and the result is passed through
    ``lemmatizer`` and then ``tokenizer``.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return tokenizer(lemmatizer(alnum_only))

In [8]:
# Add a 'clean' column holding the token list that clean() returns for each
# headline, then display the frame.
# NOTE(review): this mutates `data` in place, so the frame rendered by the
# earlier cells no longer matches the current state once this cell has run.
data['clean'] = data['headline'].apply(clean)
data

Unnamed: 0,headline,clickbait,clean
0,Should I Get Bings,1,"[should, i, get, bing]"
1,Which TV Female Friend Group Do You Belong In,1,"[which, tv, femal, friend, group, do, you, bel..."
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1,"[the, new, star, war, the, forc, awaken, trail..."
3,"This Vine Of New York On ""Celebrity Big Brothe...",1,"[thi, vine, of, new, york, on, celebr, big, br..."
4,A Couple Did A Stunning Photo Shoot With Their...,1,"[a, coupl, did, a, stun, photo, shoot, with, t..."
...,...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0,"[to, make, femal, heart, flutter, in, iraq, th..."
31996,"British Liberal Democrat Patsy Calton, 56, die...",0,"[british, liber, democrat, patsi, calton, 56, ..."
31997,Drone smartphone app to help heart attack vict...,0,"[drone, smartphon, app, to, help, heart, attac..."
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0,"[netanyahu, urg, pope, benedict, in, israel, t..."


In [9]:
# Count the rows per 'clickbait' label to see how balanced the classes are.
data.groupby('clickbait').clickbait.count()

clickbait
0    16001
1    15999
Name: clickbait, dtype: int64

In [10]:
# Vocabulary cap handed to the Keras tokenizer as ``num_words``.
max_features = 1000

# Learn the word index from the cleaned token lists, then encode every
# headline as integer ids and pad the sequences into one 2-D array.
# NOTE(review): the word index is fitted on all rows before the train/test
# split, so held-out headlines influence the vocabulary — confirm this is
# acceptable for the evaluation.
tokenize = Tokenizer(split=' ', num_words=max_features)
tokenize.fit_on_texts(data['clean'])
X = pad_sequences(tokenize.texts_to_sequences(data['clean']))

In [11]:
# Hold out 20% of the padded sequences for evaluation. A fixed random_state
# makes the split (and therefore the reported metrics' test set) reproducible
# across kernel restarts; stratify keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data['clickbait'],
    test_size=0.2,
    random_state=42,
    stratify=data['clickbait'],
)


In [12]:
# Embedding vocabulary size, read back from the fitted tokenizer so the two
# values cannot drift apart (previously a second hard-coded 1000).
max_features = tokenize.num_words
embed_dim = 10

# Embedding -> LSTM -> Dropout -> single sigmoid unit for binary classification.
lstm_model = Sequential()
lstm_model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1]))
# No input_shape on the LSTM: it is not the first layer, so its input shape is
# taken from the Embedding output (feature size embed_dim); the old
# (X_train.shape[1], 1) value did not describe that input.
# NOTE(review): activation='relu' deviates from the LSTM default (tanh); per
# the TensorFlow docs a non-tanh activation rules out the cuDNN kernel —
# confirm this choice is intentional.
lstm_model.add(LSTM(units=50, activation='relu', return_sequences=False))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(loss = 'binary_crossentropy', optimizer='adam' , metrics = ['accuracy', 'Precision', 'Recall'])

In [13]:
# Train for 8 epochs in mini-batches of 100, reporting metrics on
# (X_test, y_test) after every epoch.
# NOTE(review): the same (X_test, y_test) split is scored again by the
# evaluate() call further down, so it serves as both validation and test
# data — consider a separate validation split if the per-epoch metrics are
# used to tune the model.
lstm_model.fit(X_train, y_train, epochs = 8, batch_size=100, validation_data=(X_test, y_test))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f718285c820>

In [14]:
# Predict on the held-out split and compute the compiled metrics on it.
# evaluate() returns [loss, accuracy, precision, recall] in compile order,
# so the loss at index 0 is skipped when printing.
y_pred = lstm_model.predict(X_test)
scores = lstm_model.evaluate(X_test, y_test, verbose=0)
for label, value in zip(("Accuracy:", "Precision: ", "Recall: "), scores[1:4]):
    print(label, value)

Accuracy: 0.9618750214576721
Precision:  0.9666770696640015
Recall:  0.9577291011810303
