Dataset : https://www.kaggle.com/c/fake-news/data#

In [10]:
import pandas as pd

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# make x and y
df.shape

(20800, 5)

In [5]:
# check there is null values
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
df.dropna(inplace=True)

#### Split dataset independent varibles and dependent variable

In [7]:
X = df.drop('label', axis=1)

In [8]:
## Get the Dependent features
y=df['label']

In [9]:
X.shape, y.shape

((18285, 4), (18285,))

### Onehot Representation

In [12]:
messages=X.copy()
messages.reset_index(inplace=True)

##### Use nltk for data preprocessing

In [13]:
import nltk
import re
from nltk.corpus import stopwords

get nltk stop words

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

Import libararies

In [16]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

#### Get One hot representation

In [17]:
### Vocabulary size
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 

The code provided is used for padding sequences of integers (which represent words in one-hot encoded form) to ensure that all sequences have the same length. This is a common step in preparing textual data for input into models such as Recurrent Neural Networks (RNNs), Long Short-Term Memory networks (LSTMs), or other neural networks that expect inputs of a uniform length.

In [20]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 3475 2662 2472]
 [   0    0    0 ...  842 2206 1013]
 [   0    0    0 ...  864  420 2721]
 ...
 [   0    0    0 ... 2437 2006  471]
 [   0    0    0 ... 1390 4517 1657]
 [   0    0    0 ... 2293  535   80]]


In [21]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1157,
       3946, 1030, 3710, 4776, 2512,  925, 3475, 2662, 2472])

In [23]:


# Model parameters
embedding_vector_features = 40

# Building the model
model = Sequential()

# Embedding layer (input_length is removed)
model.add(Embedding(input_dim=voc_size, output_dim=embedding_vector_features))

# LSTM layer with 100 units
model.add(LSTM(100))

# Output layer with a single neuron and sigmoid activation (for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Printing the model summary
print(model.summary())


None


In [24]:
len(embedded_docs),y.shape

(18285, (18285,))

In [25]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=42)

In [28]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=30,batch_size=64)

Epoch 1/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.8034 - loss: 0.4120 - val_accuracy: 0.9163 - val_loss: 0.2010
Epoch 2/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9510 - loss: 0.1352 - val_accuracy: 0.9121 - val_loss: 0.1975
Epoch 3/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9681 - loss: 0.0927 - val_accuracy: 0.9178 - val_loss: 0.1966
Epoch 4/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9771 - loss: 0.0678 - val_accuracy: 0.9178 - val_loss: 0.2207
Epoch 5/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9839 - loss: 0.0509 - val_accuracy: 0.9192 - val_loss: 0.3104
Epoch 6/30
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9893 - loss: 0.0363 - val_accuracy: 0.9160 - val_loss: 0.3050
Epoch 7/30
[1m200/200

<keras.src.callbacks.history.History at 0x167bd9dcdd0>

In [36]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [37]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[2844,  263],
       [ 214, 2165]], dtype=int64)

In [38]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9130514035727306