In [1]:
# Mount Google Drive into the Colab filesystem so the dataset archive stored
# there can be read; the mount point is '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile

# Unpack the dataset archive from the mounted Drive into the current working
# directory. The archive is addressed by its absolute path under the Drive
# mount point ('/content/drive'); the previous '../content/...' form only
# resolved correctly when the working directory was a direct child of '/'.
with zipfile.ZipFile("/content/drive/MyDrive/fake-news.zip", "r") as archive:
    archive.extractall(".")


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the training data from the working directory
# (presumably extracted from fake-news.zip -- TODO confirm the archive contents).
df= pd.read_csv('train.csv')

In [5]:
# Preview the first rows: columns are id, title, author, text and label.
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
# (rows, columns) of the raw frame, before any rows are dropped.
df.shape

(20800, 5)

In [7]:
# For now the fake-news model is trained on the `title` column only, so that training is faster.
# The `text` column can be used later.

In [15]:
# Discard every row that has a missing value in any column.
# NOTE(review): this rebinds `df`, so the raw frame is no longer available
# afterwards without re-running the load cell.
df = df.dropna(axis=0, how='any')

In [16]:
# Separate the target (`label`) from the remaining feature columns.
y = df['label']
x = df.drop(columns=['label'])

In [17]:
# Shape of the feature frame after dropping NaN rows and the label column.
x.shape

(18285, 4)

In [18]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
# This is a binary classification problem, so the final layer is a Dense layer
# with a single node whose output indicates whether the article is fake or not.

In [19]:
### Vocabulary size
# Upper bound on the word indices produced by the encoding step, and the
# input dimension of the Embedding layer.
voc_size = 5_000

In [20]:
#doing onehot representation of the title column
messages= x.copy()
messages.reset_index(inplace=True)
#rest index is done because NaN values were dropped

In [21]:
import nltk
import re
from nltk.corpus import stopwords

In [22]:
# Download the NLTK stop-word corpus needed by the title preprocessing step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
# Preview the title column; after the index reset the labels run 0..len-1.
messages['title']

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
18280    Rapper T.I.: Trump a ’Poster Child For White S...
18281    N.F.L. Playoffs: Schedule, Matchups and Odds -...
18282    Macy’s Is Said to Receive Takeover Approach by...
18283    NATO, Russia To Hold Parallel Exercises In Bal...
18284                            What Keeps the F-35 Alive
Name: title, Length: 18285, dtype: object

In [24]:
### Dataset Preprocessing in title column
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once. The previous version evaluated
# stopwords.words('english') for every single token and searched the
# resulting list linearly, which dominated the runtime of this cell.
stop_words = set(stopwords.words('english'))

# corpus[k] is the cleaned title of row k: non-letters replaced by spaces,
# lower-cased, stop words removed, remaining words stemmed and re-joined.
corpus = []
for title in messages['title']:
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    stemmed = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stemmed))

In [47]:
# corpus  # uncomment to inspect the list of cleaned, stemmed titles

In [30]:
# Encode every cleaned title as a list of integer word indices bounded by
# voc_size. Despite the helper's name, the result is an index representation
# of each sentence, not a matrix of one-hot vectors.
onehot_r = [one_hot(sentence, voc_size) for sentence in corpus]
# onehot_r  # uncomment to inspect the encoded sentences

In [31]:
# Fixed sequence length fed to the network; shorter titles are padded with
# zeros at the front ('pre') so every row has sent_length entries.
sent_length = 20
embedded_docs = pad_sequences(onehot_r, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[   0    0    0 ... 4215 4543  765]
 [   0    0    0 ... 1133  420 1130]
 [   0    0    0 ... 3923  326  630]
 ...
 [   0    0    0 ... 3356 3965 3247]
 [   0    0    0 ... 4433 4422 1692]
 [   0    0    0 ... 2803  602 1225]]


In [32]:
# Inspect the first padded sequence (leading zeros are the padding).
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 2294,
        271, 4893,  469, 4668, 3599,  953, 4215, 4543,  765], dtype=int32)

In [54]:
# ## Creating model (earlier variant without Dropout, kept commented out for reference)
# embedding_vector_features=40
# model=Sequential()
# model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
# model.add(LSTM(100))
# model.add(Dense(1,activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# print(model.summary())

In [55]:
from tensorflow.keras.layers import Dropout

## Creating model
# Embedding -> Dropout -> LSTM -> Dropout -> single sigmoid unit.
# Each word index is mapped to a dense vector of this many features.
embedding_vector_features = 40

model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    # One sigmoid output: values above 0.5 are later treated as label 1.
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [56]:
# Sanity check: the number of padded sequences must match the number of labels.
len(embedded_docs),y.shape

(18285, (18285,))

In [57]:
# Materialise model inputs and targets as NumPy arrays.
x_final, y_final = np.array(embedded_docs), np.array(y)

In [58]:
# Shapes of the input matrix (samples, sequence length) and the label vector.
x_final.shape,y_final.shape

((18285, 20), (18285,))

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for evaluation; the fixed random_state makes
# the split itself reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    random_state=0,
    test_size=0.33,
)

In [65]:
# Train for 10 epochs in mini-batches of 64.
# NOTE(review): the held-out test split is also passed as validation data,
# so the metrics computed on x_test later are not from a fully unseen set.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd8f40b1b90>

In [72]:
# The model ends in a single sigmoid unit, so class labels are obtained by
# thresholding the predicted probability at 0.5. (np.argmax over axis 1 is not
# applicable to a one-column output, and predict_classes is not available.)
predicted_probabilities = model.predict(x_test)
y_pred = (predicted_probabilities > 0.5).astype("int32")

In [73]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels (scikit-learn convention).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[3211,  178],
       [  97, 2549]])

In [74]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose thresholded prediction matches the label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9544324772162386