In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os

# Location of the news CSV. Set the FAKE_NEWS_CSV environment variable to point
# at the file on another machine; the previous hard-coded path stays the default.
NEWS_CSV_PATH=os.environ.get("FAKE_NEWS_CSV","C:\\Users\\Admin\\Desktop\\FakeNews\\news.csv")
news_data=pd.read_csv(NEWS_CSV_PATH)

# Pre-Processing

In [None]:
# Preview the first rows of the loaded frame.
news_data.head()

In [None]:
# Report the shape tuple of the loaded frame.
print(f"Shape of the training data:{news_data.shape}")

## Handling Null Values

In [None]:
# Total number of missing cells across all columns.
news_data.isnull().sum().sum()

In [None]:
# Number of rows whose 'title' is missing.
news_data['title'].isna().sum()

In [None]:
# The headline is the text that gets processed to judge authenticity, so rows
# without a title are discarded. Rebinding keeps the step free of in-place mutation.
news_data=news_data.dropna(how='any',subset=['title'])

In [None]:
# Shape once the rows without a title are gone.
print(f"The shape of the training data after the null values in title are removed:{news_data.shape}")

## Checking Label Distribution

In [None]:
# Pull out the target column and show how many rows each class has.
label=news_data['label']
label.value_counts()

In [None]:
# Tally the classes once and read both entries from the same result.
# NOTE(review): assumes label 0 marks reliable and 1 unreliable articles — confirm against the dataset.
label_counts=label.value_counts()
reliable_news_count=label_counts[0]
unreliable_news_count=label_counts[1]

In [None]:
# Pie chart of the reliable / unreliable split.
class_plot=[reliable_news_count,unreliable_news_count]
plt.pie(
    class_plot,
    labels=['Reliable Article','Unreliable Article'],
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
    wedgeprops={'edgecolor':'black'},
    textprops={'fontsize':18},
)
plt.suptitle('Label Distribution',fontsize=25)
plt.tight_layout()
# NOTE(review): the style is switched after the chart has been built — presumably
# it only takes effect for figures created later; confirm the intent.
plt.style.use('ggplot')
plt.show()

In [None]:
# Bar chart of the two class counts.
X_bar=['0','1']
plt.bar(X_bar,class_plot,color=['blue','green'])
plt.xlabel('Labels',fontsize=15)
plt.ylabel('Frequency',fontsize=15)
plt.suptitle('Class Distribution',size=30)
plt.tight_layout()
plt.grid(False)
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed,
# which makes plt.style.use('seaborn') raise. Use the renamed style when this
# matplotlib ships it and fall back to the old name on older releases.
seaborn_style='seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
# NOTE(review): the style is switched after the bars are drawn — presumably it
# only affects figures created later; confirm the intent.
plt.style.use(seaborn_style)
plt.show()

# Text Preprocessing

In [None]:
# Keep an independent copy of the frame (rows without a title already removed).
# NOTE(review): fake_news is not referenced again in the visible cells — confirm it is still needed.
fake_news=news_data.copy()

In [None]:
# Renumber the rows 0..n-1 so the index has no gaps left by the dropped rows;
# drop=True discards the old index instead of keeping it as a column.
news_data.reset_index(drop=True, inplace=True)

In [None]:
# The 'title' column: the headline text that the preprocessing step consumes.
news_headlines=news_data['title']

In [None]:
# Import the NLP tooling used to clean the headlines.

import nltk
import re
from nltk.corpus import stopwords
# Fetch the stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

In [None]:
# Clean every headline: keep letters only, lower-case, drop English stop words,
# Porter-stem what is left and re-join the tokens into one string per headline.
ps=PorterStemmer()
# Build the stop-word set once; rebuilding it for every token is loop-invariant work.
english_stopwords=set(stopwords.words('english'))
total_headlines=len(news_headlines)
news_corpus=[]
count=0
# Iterate over the values directly so the loop does not depend on the Series
# index labels, and use dedicated names so the news_data DataFrame is not
# overwritten by a string.
for count,headline in enumerate(news_headlines,start=1):

    headline_tokens=re.sub('[^a-zA-Z]',' ',headline).lower().split()
    stemmed_tokens=[ps.stem(word) for word in headline_tokens if word not in english_stopwords]
    news_corpus.append(' '.join(stemmed_tokens))
    if count%100==0:
        print("{} data headlines processed".format(count))
        # Round the remaining share itself so float subtraction noise is not printed.
        pct_left=round(100-(count/total_headlines)*100,2)
        print("{}% processing left".format(pct_left))

# Report completion once, whatever the corpus size: an in-loop check on the final
# iteration is skipped whenever the headline count is an exact multiple of 100.
print("Text Preprocessing complete")
    
    
    

In [None]:
# Preview a handful of cleaned headlines instead of echoing the entire corpus,
# which would flood the notebook output.
news_corpus[:10]

### Counting the number of unique words

In [None]:
# Collect the distinct tokens of the cleaned corpus in first-seen order.
# dict.fromkeys de-duplicates with hash lookups while keeping insertion order,
# replacing a list-membership test that rescans news_words for every token
# (quadratic in the vocabulary size). Dedicated loop names also avoid rebinding
# news_data.
news_words=list(dict.fromkeys(word for headline in news_corpus for word in headline.split()))
word_count=len(news_words)

print("The number of unique words in the training set: {}".format(word_count))
    

## Encoding the words

In [None]:
# vocab_size is handed to the Tokenizer as num_words and to the Embedding layer
# as its first argument.
vocab_size=15000
# Placeholder token handed to the Tokenizer as oov_token.
oov_token = '<UNK>'
from keras.preprocessing.text import Tokenizer

In [None]:
# Create the tokenizer with the configured vocabulary cap and OOV placeholder,
# then fit it on the cleaned headlines so that its word_index is populated.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(news_corpus)

In [None]:
word_dictionary = tokenizer.word_index
# Show the size of the mapping and a small sample rather than printing every
# entry, which would flood the notebook output.
print("Words in tokenizer index: {}".format(len(word_dictionary)))
print(list(word_dictionary.items())[:20])

In [None]:
# Encode each cleaned headline as a sequence with the fitted tokenizer.
news_encoded_data = tokenizer.texts_to_sequences(news_corpus)

In [None]:
# Print only the first few encoded headlines; the full list has one entry per
# headline and would flood the notebook output.
print(news_encoded_data[:5])

## Padding the news input to a fixed length

In [None]:
# Length of every encoded headline; the longest one decides the padding width.
maxlength_list=[len(encoded_headline) for encoded_headline in news_encoded_data]

MAXLEN=max(maxlength_list)
# The padding width must fit the largest encoded input in the training data.
print("Minimum Padding length required:{}".format(MAXLEN))

In [None]:
# sentence_length is the maxlen given to pad_sequences and the input_length
# given to the Embedding layer.
sentence_length=MAXLEN
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad every encoded headline to sentence_length entries, adding the padding
# after the sequence (padding='post').
padded_news_vector=pad_sequences(news_encoded_data,maxlen=sentence_length,padding='post')
print(f"Padding completed for {len(padded_news_vector)} news inputs")

In [None]:
# Print the padded result.
print(padded_news_vector)

# Creating a Model

In [None]:
import tensorflow

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential

In [None]:
# Embedding width used when create_model builds the Embedding layer.
output_embedded_dims=120

In [None]:
def create_model(lstm_units=200,dropout_rate=0.3,input_dim=None,embedding_dims=None,input_length=None):
    """Build the (uncompiled) stacked bidirectional-LSTM binary classifier.

    Called with no arguments it produces the same layer stack as before:
    Embedding, three sequence-returning Bidirectional LSTMs each followed by
    Dropout, a final Bidirectional LSTM, then Dense 200 / 100 / 100 (relu) with
    Dropout after the first two and a single sigmoid output unit.

    Args:
        lstm_units: units of every LSTM wrapped in Bidirectional.
        dropout_rate: rate of every Dropout layer.
        input_dim: first Embedding argument; defaults to the notebook-level
            ``vocab_size``.
        embedding_dims: second Embedding argument; defaults to the
            notebook-level ``output_embedded_dims``.
        input_length: Embedding ``input_length``; defaults to the
            notebook-level ``sentence_length``.

    Returns:
        The Sequential model. Compiling it is left to the caller.
    """
    # Fall back to the notebook globals only when the caller gives no value, so
    # the function can also be used without that hidden state.
    if input_dim is None:
        input_dim=vocab_size
    if embedding_dims is None:
        embedding_dims=output_embedded_dims
    if input_length is None:
        input_length=sentence_length

    model=Sequential()
    model.add(Embedding(input_dim,embedding_dims,input_length=input_length))
    # Three recurrent blocks that pass the full sequence on, each followed by dropout.
    for _ in range(3):
        model.add(Bidirectional(LSTM(units=lstm_units,return_sequences=True)))
        model.add(Dropout(rate=dropout_rate))
    # Last recurrent layer: return_sequences=False, feeding the dense head.
    model.add(Bidirectional(LSTM(units=lstm_units,return_sequences=False)))
    model.add(Dense(units=200,activation='relu'))
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(units=100,activation='relu'))
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(units=100,activation='relu'))
    model.add(Dense(units=1,activation='sigmoid')) # Adding an output layer

    return model

In [None]:
# Instantiate the network and configure training: Adam optimizer, binary
# cross-entropy loss (the model ends in one sigmoid unit), accuracy as the metric.
news_predictor=create_model()
news_predictor.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
news_predictor.summary()

In [None]:
# Feature matrix: the padded headline sequences.
X=np.array(padded_news_vector)
# Targets: the values of the label column, in row order.
# NOTE(review): binary_crossentropy presumably needs numeric 0/1 labels — confirm the label dtype.
y=np.array(label)

In [None]:
# Train for 10 epochs in batches of 256.
# NOTE(review): no validation_split / validation_data is passed, so the accuracy
# reported here is measured on the training data only.
news_predictor.fit(X,y,batch_size=256,epochs=10,verbose=1)

## Saving model data

In [None]:
# Write the trained model to news_predictor.h5 in the working directory.
news_predictor.save('news_predictor.h5')      ## Saving the model

In [None]:
# Additionally write only the weights to a separate file.
news_predictor.save_weights('news_predictor_weights.h5')   ## Saving the model weights

In [None]:
import pickle
# NOTE: despite the file name, the object pickled here is news_corpus — the list
# of cleaned headline strings — not a word-level vocabulary.
with open('vocabulary.pkl','wb') as vocab_file:  ## Saving the preprocessed headline corpus

    pickle.dump(news_corpus,vocab_file)

In [None]:
# Persist the fitted tokenizer object to tokenizer.pkl.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
with open('tokenizer.pkl','wb') as token_file:

    pickle.dump(tokenizer,token_file)