In [15]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to C:\Users\Nitin
[nltk_data]     Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Nitin
[nltk_data]     Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
true = pd.read_csv(r"C:\Users\Nitin Saxena\Downloads\archive (9)\true.csv")
fake = pd.read_csv(r"C:\Users\Nitin Saxena\Downloads\archive (9)\fake.csv")

true['Class'] = "true"
fake['Class'] = "fake"

data = pd.concat([true, fake], ignore_index=True)

display(data.head())
display(data.isnull().sum())

df = data.dropna()


Unnamed: 0,title,text,subject,date,Class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


title      0
text       0
subject    0
date       0
Class      0
dtype: int64

In [17]:
nltk.download('punkt')
nltk.download('punkt_tab')
def preprocess_text(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

data['processed_text'] = data['text'].apply(preprocess_text)

# Split data into features and labels
X = data['processed_text']
y = data['Class']
y = y.map({'true': 0, 'fake': 1})

[nltk_data] Downloading package punkt to C:\Users\Nitin
[nltk_data]     Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Nitin
[nltk_data]     Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_pad = pad_sequences(X_train_vec.toarray(), maxlen=100)
X_test_pad = pad_sequences(X_test_vec.toarray(), maxlen=100)

model_LSTM = Sequential()
model_LSTM.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
# model_LSTM.add(SpatialDropout1D(0.2))
model_LSTM.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_LSTM.add(Dense(1, activation='sigmoid'))

model_LSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_LSTM.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_data=(X_test_pad, y_test))



Epoch 1/3
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 92ms/step - accuracy: 0.5221 - loss: 0.6925 - val_accuracy: 0.5178 - val_loss: 0.6925
Epoch 2/3
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 101ms/step - accuracy: 0.5261 - loss: 0.6922 - val_accuracy: 0.5178 - val_loss: 0.6927
Epoch 3/3
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 89ms/step - accuracy: 0.5230 - loss: 0.6924 - val_accuracy: 0.5178 - val_loss: 0.6927


<keras.src.callbacks.history.History at 0x18bc9bfc810>

In [19]:
loss, accuracy = model_LSTM.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

y_pred_LSTM = model_LSTM.predict(X_test_pad)
y_pred_LSTM = (y_pred_LSTM > 0.5).astype(int)
conf_matrix_LSTM = confusion_matrix(y_test, y_pred_LSTM)
print('Confusion Matrix:')
print(conf_matrix_LSTM)

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.5269 - loss: 0.6917
Test Accuracy: 51.78%
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step
Confusion Matrix:
[[   0 4330]
 [   0 4650]]


In [20]:
sample_article = ["this news is Fake"]
sample_article_vec = vectorizer.transform(sample_article)
sample_article_pad = pad_sequences(sample_article_vec.toarray(), maxlen=100)
prediction = np.argmax(model_LSTM.predict(sample_article_pad))
print(f"Prediction is ",prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Prediction is  0


In [21]:
import pickle
with open('fake_news_model.pkl', 'wb') as model_file:
    pickle.dump(model_LSTM, model_file)