# Fake News Classifier


In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('news.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [6]:
import numpy as np
X = df.drop('label', axis=1)
y = np.array([1 if label == 'REAL' else 0 for label in df['label']])
y

array([0, 0, 1, ..., 0, 1, 1])

In [7]:
X.shape

(6335, 3)

In [8]:
y.shape

(6335,)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
import tensorflow as tf
print(tf.__version__)

2.19.0


In [12]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [13]:
#Vocabulary_size
voc_size = 5000

# One Hot Representation

In [14]:
message = X_train.copy()
message.reset_index(inplace=True)

message_test = X_test.copy()
message_test.reset_index(inplace=True)

In [15]:
message.head()

Unnamed: 0.1,index,Unnamed: 0,title,text
0,5593,4857,Ex-Defense Chief Gates: Trump 'Beyond Repair' ...,The next president is most likely to face an i...
1,6067,9885,Officials State New Clinton Emails Discovered ...,Following news of FBI Director James Comey’s d...
2,3026,6681,West covering up crimes of Bahraini regime: An...,Interviews A protester throws a glass bottle c...
3,4385,9306,Will it be representative government or thugoc...,Will it be representative government or thugoc...
4,120,2232,Obama's terror strategy is failing: US must he...,It is increasingly apparent that the U.S. war ...


In [16]:
message_test.head()

Unnamed: 0.1,index,Unnamed: 0,title,text
0,1357,9957,"American Dream, Revisited",Will Trump pull a Brexit times ten? What would...
1,2080,7596,Clintons Are Under Multiple FBI Investigations...,Clintons Are Under Multiple FBI Investigations...
2,2718,8905,The FBI Can’t Actually Investigate a Candidate...,Dispatches from Eric Zuesse This piece is cros...
3,812,8752,Confirmed: Public overwhelmingly (10-to-1) say...,Print \n[Ed. – Every now and then the facade c...
4,4886,7804,Nanny In Jail After Force Feeding Baby To Death,Nanny In Jail After Force Feeding Baby To Deat...


In [17]:
message['title'][1]

'Officials State New Clinton Emails Discovered as Part of Anthony Weiner ‘Sexting’ Investigation'

In [18]:
import nltk
import re
from nltk.corpus import stopwords

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
#Data Preprocessing of train data
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(message)):
  review = re.sub('[^a-zA-Z]', ' ', message['title'][i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [20]:
#Data Preprocessing of test data
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus_test = []
for i in range(0, len(message_test)):
  review = re.sub('[^a-zA-Z]', ' ', message_test['title'][i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus_test.append(review)

In [24]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
onehot_repr_test = [one_hot(words, voc_size) for words in corpus_test]

# Word Embedding

In [25]:
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
embedded_docs_test = pad_sequences(onehot_repr_test, padding='pre', maxlen=sent_length)

# LSTM Model Building

In [26]:
features = 40
model = Sequential()
model.add(Embedding(voc_size, features, input_length = sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())



None


In [27]:
import numpy as np
X_train = np.array(embedded_docs)
y_train = np.array(y_train)

X_test = np.array(embedded_docs_test)
y_test = np.array(y_test)

X_test.shape, y_test.shape

((2091, 20), (2091,))

In [28]:
X_test

array([[   0,    0,    0, ..., 3508, 3723, 4681],
       [   0,    0,    0, ...,  330, 2946,  845],
       [   0,    0,    0, ..., 2027, 2977, 2169],
       ...,
       [   0,    0,    0, ..., 3659, 3446, 3338],
       [   0,    0,    0, ..., 4022, 4474, 4228],
       [   0,    0,    0, ..., 3516, 2753, 3840]], dtype=int32)

# Model Training

In [29]:
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 10, batch_size = 64)

Epoch 1/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 53ms/step - accuracy: 0.5636 - loss: 0.6800 - val_accuracy: 0.7700 - val_loss: 0.5428
Epoch 2/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 66ms/step - accuracy: 0.8185 - loss: 0.4329 - val_accuracy: 0.7920 - val_loss: 0.4610
Epoch 3/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.8897 - loss: 0.2728 - val_accuracy: 0.7853 - val_loss: 0.4967
Epoch 4/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.9192 - loss: 0.2095 - val_accuracy: 0.7872 - val_loss: 0.5101
Epoch 5/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.9478 - loss: 0.1575 - val_accuracy: 0.7834 - val_loss: 0.6087
Epoch 6/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.9470 - loss: 0.1478 - val_accuracy: 0.7647 - val_loss: 0.6462
Epoch 7/10
[1m67/67[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7f4ec7fb54c0>

# Adding Dropout

In [35]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])



In [36]:
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 10, batch_size = 64)

Epoch 1/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 57ms/step - accuracy: 0.5456 - loss: 0.6864 - val_accuracy: 0.6815 - val_loss: 0.5911
Epoch 2/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.7744 - loss: 0.4906 - val_accuracy: 0.7776 - val_loss: 0.4712
Epoch 3/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 61ms/step - accuracy: 0.8609 - loss: 0.3242 - val_accuracy: 0.7848 - val_loss: 0.4988
Epoch 4/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.9064 - loss: 0.2347 - val_accuracy: 0.7901 - val_loss: 0.4990
Epoch 5/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 0.9361 - loss: 0.1811 - val_accuracy: 0.7800 - val_loss: 0.5941
Epoch 6/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 61ms/step - accuracy: 0.9385 - loss: 0.1623 - val_accuracy: 0.7747 - val_loss: 0.7273
Epoch 7/10
[1m67/67[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7f4ec7fc0bc0>

# Perfrmance Matrix & Accuracy

In [37]:
y_pred = model.predict(X_test)

[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


In [38]:
y_pred = np.where(y_pred > 0.5, 1, 0)

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7733142037302726