In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
#from gensim.summarization import keywords
#import preprocessor as p
import texthero as hero
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
import tensorflow_text as text

import tensorflow as tf

%matplotlib inline

### Load Dataset

In [2]:
df = pd.read_csv('clickbait/clickbait_data.csv')
df = df.sample(frac = 1,random_state = 30)

In [3]:
df.head()

Unnamed: 0,headline,clickbait
5941,"How Well Do You Remember ""The Nightmare Before...",1
684,"Which Is Better: ""Hocus Pocus"" Or ""Halloweentown""",1
12472,Tell Us About Yourself(ie): Luke Bryan,1
18616,NASA's Mars rovers exceed all expectations,0
10067,19 Adorable Lingerie Sets To Wear On Valentine...,1


In [4]:
#Use texthero to clean data in one go

df['headline'] = hero.clean(df['headline'])
df['headline'][0:4]

5941        well remember nightmare christmas
684          better hocus pocus halloweentown
12472                   tell us ie luke bryan
18616    nasa mars rovers exceed expectations
Name: headline, dtype: object

In [5]:
def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['clickbait'], test_size=0.25, random_state=42)

In [7]:
y_train.value_counts()

1    12015
0    11985
Name: clickbait, dtype: int64

In [8]:
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [9]:
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [10]:
def embedding(text):
    preprocessed = bert_preprocess(text)
    return bert_encoder(preprocessed)['pooled_output']

In [11]:
embedding([X_train[0]])

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.7772485 , -0.3398778 , -0.3659275 ,  0.58366   ,  0.31049612,
        -0.10770843,  0.70381814,  0.2029457 , -0.2316289 , -0.99983853,
        -0.38446948,  0.60503167,  0.97788596,  0.07288127,  0.86847544,
        -0.4598918 , -0.1169603 , -0.5443887 ,  0.20859379, -0.37066644,
         0.56139916,  0.99768436,  0.19245009,  0.2572987 ,  0.5108195 ,
         0.83992726, -0.56694853,  0.8888596 ,  0.94153   ,  0.6475518 ,
        -0.54680526,  0.1072831 , -0.9885101 , -0.09085421, -0.5295646 ,
        -0.98854864,  0.31841645, -0.68289536,  0.03286978,  0.169518  ,
        -0.88812697,  0.20929658,  0.9995327 , -0.05376754,  0.17120354,
        -0.1565725 , -0.9999703 ,  0.21230058, -0.84628356,  0.1325828 ,
         0.43749017,  0.08387292,  0.15171152,  0.3945125 ,  0.3373591 ,
         0.05389988, -0.2250249 ,  0.11121577, -0.13990155, -0.5157387 ,
        -0.6130257 ,  0.40843183, -0.658992  , -0.84319955,  0.5922122 ,
 

In [12]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
dropout_layer = tf.keras.layers.Dropout(0.1,name = 'dropout')(outputs['pooled_output'])
op = tf.keras.layers.Dense(1,activation = 'sigmoid', name = 'output')(dropout_layer)

model = tf.keras.Model(inputs = [text_input],outputs = op)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_mask': (None 0           text[0][0]                       
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'default': (None, 7 109482241   keras_layer[0][0]                
                                                                 keras_layer[0][1]                
                                                                 keras_layer[0][2]                
______________________________________________________________________________________________

In [18]:
model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])

#callbacks = [EarlyStopping(monitor = 'val_accuracy',)]

In [19]:
history = model.fit(X_train,y_train,epochs = 1)

 23/750 [..............................] - ETA: 56:22 - loss: 0.5738 - accuracy: 0.7215

KeyboardInterrupt: 