# Fake News Detection

## Load Dataset

In [1]:
import pandas as pd

# Read both halves of the corpus and tag each row with its class:
# 1 marks an article from Fake.csv, 0 an article from True.csv.
fake_df = pd.read_csv("Fake.csv").assign(label=1)
true_df = pd.read_csv("True.csv").assign(label=0)

# Stack the two frames, then shuffle the rows with a fixed seed so the
# order is reproducible; the index is renumbered after each step.
df = pd.concat([fake_df, true_df], axis=0, ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())
print(df["label"].value_counts())

                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  
label
1    23481
0    21417
Name: count, dtype: int64

## Clean the Data

In [2]:
!pip install nltk



In [3]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (reported as up-to-date when already cached).
nltk.download("stopwords")
# Stored as a set so membership tests inside clean_text are cheap.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise one document before tokenisation.

    The input is coerced to ``str``; every non-word character becomes a
    space, whitespace runs are collapsed, the result is lower-cased, and
    tokens found in the module-level ``stop_words`` set are dropped.

    Returns the surviving tokens joined by single spaces.
    """
    spaced = re.sub(r'\s+', ' ', re.sub(r'\W', ' ', str(text)))
    kept = [token for token in spaced.lower().split() if token not in stop_words]
    return ' '.join(kept)

# Clean every article body and keep the result next to the raw text.
df['CleanText'] = df['text'].map(clean_text)

# Show the first rows before/after cleaning as a quick sanity check.
print(df[['text', 'CleanText']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbuzii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  21st Century Wire says Ben Stein, reputable pr...   
1  WASHINGTON (Reuters) - U.S. President Donald T...   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...   
3  On Monday, Donald Trump once again embarrassed...   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...   

                                           CleanText  
0  21st century wire says ben stein reputable pro...  
1  washington reuters u president donald trump re...  
2  reuters puerto rico governor ricardo rossello ...  
3  monday donald trump embarrassed country accide...  
4  glasgow scotland reuters u presidential candid...  


## Split the Data

In [4]:
from sklearn.model_selection import train_test_split

# Features are the cleaned article texts; targets are the 0/1 labels.
X, y = df['CleanText'].values, df['label'].values

# Hold out 20% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 35918
Testing samples: 8980


## Tokenization and Padding

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10_000   # vocabulary cap passed to the tokenizer
max_len = 100        # length every sequence is padded/truncated to

# Learn the vocabulary from the training texts only, so nothing from the
# test split influences the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

print(f"X_train_pad shape: {X_train_pad.shape}")
print(f"X_test_pad shape: {X_test_pad.shape}")

X_train_pad shape: (35918, 100)
X_test_pad shape: (8980, 100)


## Build the LSTM Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# `Input` declares the sequence length up front; the `input_length` argument
# of `Embedding` is deprecated in Keras 3.
from tensorflow.keras.layers import Input

# Build the model: token ids -> 128-d embeddings -> LSTM -> single sigmoid unit.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One sigmoid unit: the output estimates P(label == 1), and label 1 was
    # assigned to the articles from Fake.csv.
    Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()



## Train the Model

In [None]:
# Three epochs in mini-batches of 128; 20% of the training arrays are held
# back by Keras to report validation metrics after each epoch.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/3
[1m 66/225[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m6:33[0m 2s/step - accuracy: 0.8085 - loss: 0.4583

## Evaluate the Model

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test, verbose=1)
print(f"Test Accuracy: {accuracy}")

In [None]:
import numpy as np

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels,
# then show the first ten as a flat vector.
predictions = (model.predict(X_test_pad) > 0.5).astype("int32")
print(predictions[:10].ravel())

## Analyze the Model in More Detail

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

# Confusion counts drawn as an annotated heatmap; labels are set on the
# Axes object that seaborn returns.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Class 0 is reported as "Real" and class 1 as "Fake".
print(classification_report(y_test, y_pred, target_names=["Real", "Fake"]))

## Visualize Training Progress

In [None]:
import matplotlib.pyplot as plt

# One figure per metric, each showing the training and validation curves
# recorded by the most recent fit() call.
for metric, pretty in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric], label=f'Train {pretty}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {pretty}')
    plt.title(f'Model {pretty}')
    plt.xlabel('Epoch')
    plt.ylabel(pretty)
    plt.legend()
    plt.show()

## Predicting New News Articles

In [None]:
new_articles = [
    "Breaking: Scientists discover a new cure for common cold.",
    "Celebrity endorses miracle weight loss pill."
]

# Same pipeline as training: clean -> integer sequences -> fixed-length padding.
new_clean = list(map(clean_text, new_articles))
new_seq = tokenizer.texts_to_sequences(new_clean)
new_pad = pad_sequences(new_seq, maxlen=max_len)

predictions = (model.predict(new_pad) > 0.5).astype("int32")
for text, pred in zip(new_articles, predictions):
    # pred is a one-element row; class 1 is printed as "Fake".
    label = "Real" if pred[0] != 1 else "Fake"
    print(f"Article: {text}\nPrediction: {label}\n")

In [None]:
test_articles = [
    "The government announced a new education policy starting next month.",
    "Local team wins the national soccer championship.",
    "Miracle cure for diabetes discovered in remote village.",
    "Stock market sees steady growth after quarterly earnings report."
]

# Clean, tokenise and pad the headlines the same way as the training texts.
test_clean = list(map(clean_text, test_articles))
test_seq = tokenizer.texts_to_sequences(test_clean)
test_pad = pad_sequences(test_seq, maxlen=max_len)

predictions = (model.predict(test_pad) > 0.5).astype("int32")

for text, pred in zip(test_articles, predictions):
    # pred is a one-element row; class 1 is printed as "Fake".
    label = "Real" if pred[0] != 1 else "Fake"
    print(f"Article: {text}\nPrediction: {label}\n")

In [None]:
# Class-balance check: number of rows per label value in the full frame.
print(df['label'].value_counts())

In [None]:
from sklearn.utils import class_weight
import numpy as np

# 'balanced' asks scikit-learn to derive one weight per class from the
# label frequencies in y_train.
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train), y=y_train
)
# Key each weight by its position in the array returned above.
class_weights_dict = {idx: weight for idx, weight in enumerate(class_weights)}
print(class_weights_dict)

## Retrain Model

In [None]:
# NOTE: `model` has already been fitted earlier in the notebook, so this call
# continues training from its current weights rather than starting fresh.
# The class weights come from `class_weights_dict` (computed from y_train)
# instead of hand-copied constants, so they stay correct if the data or the
# split changes.
history = model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    class_weight=class_weights_dict,
    verbose=1
)

In [None]:
test_articles = [
    "The government announced a new education policy starting next month.",
    "Local team wins the national soccer championship.",
    "Miracle cure for diabetes discovered in remote village.",
    "Stock market sees steady growth after quarterly earnings report."
]

# Clean -> integer sequences -> fixed-length padding, as for the training data.
test_clean = list(map(clean_text, test_articles))
test_seq = tokenizer.texts_to_sequences(test_clean)
test_pad = pad_sequences(test_seq, maxlen=max_len)

predictions = (model.predict(test_pad) > 0.5).astype("int32")

for text, pred in zip(test_articles, predictions):
    # pred is a one-element row; class 1 is printed as "Fake".
    label = "Real" if pred[0] != 1 else "Fake"
    print(f"Article: {text}\nPrediction: {label}\n")

In [None]:
# Class-balance check, shown as the cell's rich output (rows per label value).
df['label'].value_counts()

In [None]:
# Join headline and body with a single space so both can feed the model.
df['full_text'] = df['title'].str.cat(df['text'], sep=" ")
X, y = df['full_text'].values, df['label'].values

In [None]:
df['full_text'] = df['title'].str.cat(df['text'], sep=" ")

# Re-run the cleaning step on the combined text; this overwrites the
# CleanText column with the title+body version.
df['CleanText'] = df['full_text'].map(clean_text)

print(df[['full_text', 'CleanText']].head())

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts currently in CleanText; targets are the labels.
X, y = df['CleanText'].values, df['label'].values

# Same 80/20 partition settings and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10_000   # vocabulary cap passed to the tokenizer
max_len = 200        # length every sequence is padded/truncated to

# A fresh tokenizer is fitted on the new training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# `Input` fixes the sequence length up front; the `input_length` argument of
# `Embedding` is deprecated in Keras 3.
from tensorflow.keras.layers import Input

# Fresh, untrained network: token ids -> 128-d embeddings -> LSTM -> sigmoid.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from sklearn.utils import class_weight
import numpy as np

# 'balanced' asks scikit-learn to derive one weight per class from the
# label frequencies in y_train.
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train), y=y_train
)
# Key each weight by its position in the array returned above.
class_weights_dict = {idx: weight for idx, weight in enumerate(class_weights)}
print(class_weights_dict)

In [None]:
# Five epochs in mini-batches of 128, with 20% of the training arrays held
# back for validation and the computed per-class weights applied to the loss.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
    class_weight=class_weights_dict,
    verbose=1,
)

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(x=X_test_pad, y=y_test)
for metric_name, metric_value in (("Accuracy", test_accuracy), ("Loss", test_loss)):
    print(f"Test {metric_name}: {metric_value}")

In [None]:
new_articles = [
    "The government announced a new education policy starting next month.",
    "Local team wins the national soccer championship.",
    "Miracle cure for diabetes discovered in remote village.",
    "Stock market sees steady growth after quarterly earnings report."
]

# Clean -> integer sequences -> fixed-length padding, as for the training data.
new_clean = list(map(clean_text, new_articles))
new_seq = tokenizer.texts_to_sequences(new_clean)
new_pad = pad_sequences(new_seq, maxlen=max_len)

predictions = (model.predict(new_pad) > 0.5).astype("int32")

for text, pred in zip(new_articles, predictions):
    # pred is a one-element row; class 1 is printed as "Fake".
    label = "Real" if pred[0] != 1 else "Fake"
    print(f"Article: {text}\nPrediction: {label}\n")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Hard labels at the 0.5 threshold, then the raw confusion counts and a
# per-class report printed to four decimal places.
y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred, digits=4),
):
    print(report)

In [None]:
texts = [
    "The government announced a new education policy starting next month.",
    "Local team wins the national soccer championship.",
    "Miracle cure for diabetes discovered in remote village.",
    "Stock market sees steady growth after quarterly earnings report."
]

# Apply the same cleaning used on the training data before tokenising, so
# inference sees text in the form the tokenizer was fitted on.
seqs = tokenizer.texts_to_sequences([clean_text(text) for text in texts])
pad = pad_sequences(seqs, maxlen=max_len)
probs = model.predict(pad)

for i, p in enumerate(probs):
    print(f"Article: {texts[i]}")
    print(f"Predicted Probability: {p[0]:.4f}")
    # The sigmoid output estimates P(label == 1) and label 1 was assigned to
    # the fake articles, so a probability above 0.5 means "Fake".
    print("Prediction:", "Fake" if p[0] > 0.5 else "Real", "\n")

In [None]:
# Persist the trained model to disk; later cells reload it from this path.
# NOTE(review): the .h5 extension selects Keras' HDF5 format, which newer
# Keras releases describe as legacy — confirm before switching to .keras,
# because the loading cells reference this exact filename.
model.save("fake_news_model.h5")

In [None]:
import pickle

# Persist the fitted tokenizer so inference can reuse the exact word index.
with open("tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
new_article = ["Breaking: Scientists develop new vaccine for flu."]
# Preprocess exactly as during training: same cleaning and the same padded
# length (`max_len`) that the current model was fitted with.
seq = tokenizer.texts_to_sequences([clean_text(text) for text in new_article])
padded = pad_sequences(seq, maxlen=max_len)
prob = model.predict(padded)[0][0]
# The sigmoid output estimates P(label == 1); label 1 marks fake articles,
# so a probability above 0.5 means "Fake".
prediction = "Fake" if prob > 0.5 else "Real"
print(f"Prediction: {prediction}, Probability: {prob}")


In [None]:
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Reload the artefacts written by the save cells.
model = load_model("fake_news_model.h5")

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a tokenizer.pkl that this notebook produced itself.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Must equal the padded length the saved model was trained with
# (max_len = 200 in the final training run), not the earlier value of 100.
MAX_LEN = 200

def predict_news(article):
    """Classify one article string and print the verdict.

    The text is converted to an integer sequence with the module-level
    ``tokenizer``, padded to ``MAX_LEN`` and scored by the module-level
    ``model``. Nothing is returned; the result is printed.

    NOTE(review): the raw text is tokenised without the cleaning step that
    was applied to the training data — confirm whether that is intended.
    """
    seq = tokenizer.texts_to_sequences([article])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    prob = model.predict(padded)[0][0]
    # Training used label 1 for fake articles, so the sigmoid output is the
    # probability of "Fake"; values above 0.5 must map to "Fake".
    label = "Fake" if prob > 0.5 else "Real"
    print(f"Article: {article}")
    print(f"Prediction: {label}, Probability: {prob:.4f}\n")

# Smoke test: score a handful of fixed headlines when run as the main module.
if __name__ == "__main__":
    articles = [
        "The government announced a new education policy starting next month.",
        "Local team wins the national soccer championship.",
        "Miracle cure for diabetes discovered in remote village.",
        "Stock market sees steady growth after quarterly earnings report."
    ]
    for headline in articles:
        predict_news(headline)

In [None]:
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Reload the artefacts written by the save cells.
model = load_model("fake_news_model.h5")

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a tokenizer.pkl that this notebook produced itself.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Must equal the padded length the saved model was trained with
# (max_len = 200 in the final training run), not the earlier value of 100.
MAX_LEN = 200

import re

# Filled on first use so importing this cell does not require NLTK data.
_STOP_WORDS_CACHE = None


def _default_stop_words():
    """Return NLTK's English stop words, downloading the corpus if needed."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        import nltk
        from nltk.corpus import stopwords
        nltk.download('stopwords', quiet=True)
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text, stop_words=None):
    """Normalise text with the same steps used when the model was trained.

    The previous version deleted punctuation outright and kept stop words,
    which produces different tokens from the training pipeline (for example
    "anti-trump" became "antitrump" instead of "anti trump").

    Args:
        text: Value to clean; it is coerced with ``str``.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The lower-cased tokens that are not stop words, joined by single
        spaces.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    text = re.sub(r'\W', ' ', str(text))   # punctuation -> space, not deletion
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

def predict_news(article):
    """Clean, tokenise and score one article, then print the verdict.

    The text goes through ``clean_text``, is converted to an integer sequence
    with the module-level ``tokenizer``, padded to ``MAX_LEN`` and scored by
    the module-level ``model``. Nothing is returned; the result is printed.
    """
    article_clean = clean_text(article)
    seq = tokenizer.texts_to_sequences([article_clean])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    prob = model.predict(padded)[0][0]
    # Training used label 1 for fake articles, so the sigmoid output is the
    # probability of "Fake"; values above 0.5 must map to "Fake".
    label = "Fake" if prob > 0.5 else "Real"
    print(f"\nArticle: {article}")
    print(f"Prediction: {label}, Probability: {prob:.4f}")

# Interactive loop: keep classifying user-supplied text until "exit" is typed.
if __name__ == "__main__":
    while True:
        # Strip surrounding whitespace so " exit " still quits.
        user_input = input("\nEnter a news article (or type 'exit' to quit):\n").strip()
        if user_input.lower() == "exit":
            print("Exiting program.")
            break
        if not user_input:
            # Nothing to classify; prompt again instead of scoring an
            # all-padding sequence.
            continue
        predict_news(user_input)