In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
true = pd.read_csv('/content/true.csv')
fake = pd.read_csv('/content/fake.csv')

true['Class'] = "true"
fake['Class'] = "fake"

data = pd.concat([true, fake], ignore_index=True)

print(data.head())
print(data.isnull().sum())

df = data.dropna()


                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date Class  
0  December 31, 2017   true  
1  December 29, 2017   true  
2  December 31, 2017   true  
3  December 30, 2017   true  
4  December 29, 2017   true  
title      0
text       0
subject    0
date       0
Class     

In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')
def preprocess_text(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)

X = df['processed_text']
y = df['Class']

y = y.map({'true': 0, 'fake': 1})


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_pad = pad_sequences(X_train_vec.toarray(), maxlen=100)
X_test_pad = pad_sequences(X_test_vec.toarray(), maxlen=100)

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/3




[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 211ms/step - accuracy: 0.5195 - loss: 0.6925 - val_accuracy: 0.5178 - val_loss: 0.6927
Epoch 2/3
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 208ms/step - accuracy: 0.5213 - loss: 0.6925 - val_accuracy: 0.5178 - val_loss: 0.6926
Epoch 3/3
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 209ms/step - accuracy: 0.5248 - loss: 0.6922 - val_accuracy: 0.5178 - val_loss: 0.6926


<keras.src.callbacks.history.History at 0x7d7749a970d0>

In [8]:
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 40ms/step - accuracy: 0.5269 - loss: 0.6917
Test Accuracy: 51.78%
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 34ms/step
Confusion Matrix:
[[   0 4330]
 [   0 4650]]


In [15]:
sample_article = ["Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media"]
sample_article_vec = vectorizer.transform(sample_article)
sample_article_pad = pad_sequences(sample_article_vec.toarray(), maxlen=100)
prediction = np.argmax(model.predict(sample_article_pad))
print(f"Prediction is ",prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Prediction is  0
