In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tensorflow as tf
import tensorflow_hub as hub
import logging
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings('ignore') 
import keras        

In [2]:
np.set_printoptions(suppress=True)
pd.set_option('display.max_columns',8000)
pd.set_option('display.max_rows',7000)

In [3]:
from google.colab import files
files.upload()

Saving Real_News.xlsx to Real_News.xlsx
Saving Fake_News.xlsx to Fake_News.xlsx


In [66]:
df=pd.read_excel('/content/Real_News.xlsx')
fake_df=pd.read_excel('/content/Fake_News.xlsx')

In [67]:
df['labels']=0
fake_df['labels']=1

In [68]:
df=df.append(fake_df)

In [69]:
df.shape

(45452, 4)

In [70]:
df.head()

Unnamed: 0,Title,News_text,Subject,labels
0,Trump judicial nominee withdraws from consider...,WASHINGTON (Reuters) - A lawyer nominated by P...,politicsNews,0
1,No. 2 Democrat in Senate calls on Franken to r...,"(Reuters) - U.S. Senator Dick Durbin, the No. ...",politicsNews,0
2,"WTO chief won't debate Trump, but rallies supp...",GENEVA (Reuters) - The head of the World Trade...,politicsNews,0
3,Hungary says it is facing 'frontal assault' fr...,BUDAPEST (Reuters) - Hungary is facing a front...,worldnews,0
4,Senate Republicans shove tax bill ahead as Dem...,WASHINGTON (Reuters) - U.S. Senate Republicans...,politicsNews,0


In [71]:
df['total_text']=df['Title']+df['News_text']

In [72]:
def cleaning(Inpdata):
    cleanedArticle1=re.sub(r'(Reuters)-[?|$|(),"".@#=><|!]Â&*/',r' ',Inpdata)
    cleanedArticle2=re.sub(r'https?://\S+|www\.\S+',r' ',cleanedArticle1)
    cleanedArticle3=re.sub(r'\b\w{1,2}\b',r' ',cleanedArticle2)
    cleanedArticle4=re.sub(r'[^a-z A-Z]',r' ',cleanedArticle3)
    cleanedArticle5=cleanedArticle4.lower()
    cleanedArticle6=re.sub(r' +',r' ',cleanedArticle5)
    return(cleanedArticle6)

In [73]:
df['text_clean']=df['total_text'].apply(cleaning)

In [74]:
df=df.drop(labels=['Title','News_text','total_text'], axis=1)

In [75]:
train,test=train_test_split(df,test_size=0.25,random_state=7506)

In [76]:
train.head()

Unnamed: 0,Subject,labels,text_clean
3995,politicsNews,0,factbox long history leakers media facing char...
1586,worldnews,0,pakistan army role focus islamists end blasphe...
9213,politics,1,lgbt volunteers aren waiting thrown off roofto...
5269,News,1,watch cnn panel floored trump lover claims th...
17513,left-news,1,black residents not happy after street artist ...


In [77]:
test.head()

Unnamed: 0,Subject,labels,text_clean
1873,worldnews,0,lebanon foreign powers urged maintain non inte...
21175,left-news,1,you gotta love this video white girl told she ...
4427,politics,1,reince priebus embarrasses snarky nbc meet the...
16471,News,1,poll shows hillary clinton with thirteen poin...
10166,US_News,1,boiler room social rejects political pessimist...


In [78]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [79]:
import tensorflow_hub as hub
from  tokenization import FullTokenizer
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2, Total size: 421.50MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


In [80]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [81]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [82]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(2, activation='sigmoid')(net)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [83]:
max_len = 150
train_input = bert_encode(train['text_clean'].values, tokenizer, max_len=max_len)
test_input = bert_encode(test['text_clean'].values, tokenizer, max_len=max_len)


In [84]:
train_labels = tf.keras.utils.to_categorical(train.labels.values, num_classes=2)
test_labels = tf.keras.utils.to_categorical(test.labels.values, num_classes=2)

In [85]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [86]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

In [88]:
train_history = model.fit(train_input, train_labels, validation_split=0.2,epochs=1,callbacks=[checkpoint, earlystopping],batch_size=32,verbose=1)



KeyboardInterrupt: ignored

In [None]:
true_label=test['labels'].values
from sklearn.metrics import plot_confusion_matrix,accuracy_score
plot_confusion_matrix(model, test_input, true_label)  
plt.show()  