Final Integrated Fake News Detection System

# === Imports ===

In [3]:
import cv2
import pytesseract
import pandas as pd
import numpy as np
import re
import pickle
import feedparser
import os
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from faster_whisper import WhisperModel
from pydub import AudioSegment
import yt_dlp

  from .autonotebook import tqdm as notebook_tqdm


# === TRAINING PART ===


# Load datasets

In [4]:
# Read both corpora and tag every row with its class: 1 = fake, 0 = real.
df_fake = pd.read_csv("Fake.csv")
df_fake['label'] = 1
df_real = pd.read_csv("True.csv")
df_real['label'] = 0

# Stack the two frames under a fresh 0..n-1 index, then fold the headline
# into the body so each article is represented by a single text field.
df = pd.concat([df_fake, df_real], ignore_index=True)
headline_and_body = df['title'] + " " + df['text']
df['text'] = headline_and_body


# Text Preprocessing

In [5]:
def preprocess(text):
    """Normalise raw text before it is embedded.

    Removes every character that is neither a word character nor whitespace,
    lower-cases the result and trims leading/trailing whitespace.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # `text != text` is only true for NaN; the isinstance check keeps the
    # comparison from ever being applied to array-like values.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().strip()

# Clean every article with `preprocess` before the text is embedded.
df['text'] = df['text'].map(preprocess)


# Embedding Model

In [6]:
# Sentence encoder shared by training and by the prediction helpers below.
model = SentenceTransformer('all-mpnet-base-v2')

# Encoding the whole corpus takes a long time, so the resulting matrix is
# cached on disk and reused on later runs. Delete the cache file to force
# regeneration (e.g. after changing the data or the preprocessing).
EMBEDDINGS_CACHE = "train_embeddings.npy"

embeddings = None
if os.path.exists(EMBEDDINGS_CACHE):
    cached_embeddings = np.load(EMBEDDINGS_CACHE)
    # Only trust the cache when it holds exactly one row per article in `df`.
    if len(cached_embeddings) == len(df):
        embeddings = cached_embeddings
        print(f"Loaded cached embeddings from {EMBEDDINGS_CACHE}")

if embeddings is None:
    print("Generating embeddings...")
    embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)
    np.save(EMBEDDINGS_CACHE, embeddings)


Generating embeddings...


Batches: 100%|██████████| 1404/1404 [32:33<00:00,  1.39s/it]


# Train-test split

In [7]:
# Hold out 20% of the articles for evaluation. Stratifying on the label keeps
# the fake/real ratio the same in both partitions; the seed fixes the split.
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Train classifier

In [8]:
# 200-tree random forest trained on the sentence embeddings. n_jobs=-1 builds
# the trees on all available CPU cores; random_state stays fixed so the run
# remains reproducible.
clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# Evaluation

In [9]:
# Score the held-out split, then report accuracy, per-class metrics and the
# confusion matrix.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("\n✅ Accuracy:", test_accuracy)
print("📊 Classification Report:\n", per_class_report)
print("🔍 Confusion Matrix:\n", confusion)


✅ Accuracy: 0.9525612472160356
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      4284
           1       0.95      0.96      0.96      4696

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980

🔍 Confusion Matrix:
 [[4026  258]
 [ 168 4528]]


# Save classifier

In [18]:
# Persist the fitted classifier so the inference sections can reload it.
classifier_path = "fake_news_classifier.pkl"
with open(classifier_path, "wb") as classifier_file:
    pickle.dump(clf, classifier_file)

# === REAL-TIME RSS NEWS CLASSIFICATION ===

In [11]:
print("\n📰 Real-Time News Classification:")

# Reload the encoder and the persisted classifier so this section can run
# without repeating the training cells above.
model = SentenceTransformer('all-mpnet-base-v2')
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a classifier file that this notebook (or another trusted source) wrote.
with open("fake_news_classifier.pkl", "rb") as f:
    clf = pickle.load(f)

rss_url = "http://feeds.bbci.co.uk/news/rss.xml"
feed = feedparser.parse(rss_url)

# feedparser reports fetch/parse problems through the `bozo` flag rather than
# by raising, so a failed download would otherwise just print nothing.
if feed.get("bozo") and not feed.entries:
    print(f"⚠️ Could not read feed {rss_url}: {feed.get('bozo_exception')}")

for entry in feed.entries[:10]:
    # Not every feed item carries both fields; fall back to an empty string
    # instead of raising AttributeError on a missing one.
    title = entry.get("title", "")
    summary = entry.get("summary", "")
    full_text = preprocess(title + " " + summary)
    if not full_text:
        continue  # nothing left to classify for this item
    embedding = model.encode([full_text])
    prediction = clf.predict(embedding)[0]
    label = "Fake ❌" if prediction == 1 else "Real ✅"
    print(f"\n🗞️ Title: {title}\n🔍 Summary: {summary}\n📢 Prediction: {label}")


📰 Real-Time News Classification:

🗞️ Title: 'End of an era': Last surviving Battle of Britain pilot dies
🔍 Summary: Mr Hemingway, who is originally from Dublin, joined the RAF as a teenager before World War Two.
📢 Prediction: Real ✅

🗞️ Title: Stranded astronauts Butch and Suni begin journey home
🔍 Summary: The Nasa astronauts prepare to head to Earth after an eight-day mission turned into nine months.
📢 Prediction: Real ✅

🗞️ Title: Crackdown on government credit cards to cut 'wasteful' spending
🔍 Summary: Purchases include £2,500 spent at a women's shoe shop in Barbados and £1,200 on luxury coffee pods.
📢 Prediction: Real ✅

🗞️ Title: Farmers fear criminal hare coursing gangs 'could kill someone'
🔍 Summary: Farmers warn there has been an increase in willingness to commit violence to farmers and gamekeepers.
📢 Prediction: Real ✅

🗞️ Title: Net zero by 2050 'impossible' for UK, says Badenoch
🔍 Summary: The Conservative leader says the target is impossible "without a serious drop in ou

# === MULTIMODAL FAKE NEWS DETECTION FUNCTIONS ===

# Predict text

In [12]:
def predict_fake_news(text):
    """Classify a piece of text as fake or real news.

    The text is cleaned with ``preprocess``, embedded with the module-level
    sentence encoder ``model`` and scored by the module-level classifier
    ``clf``.

    Args:
        text: The raw text to classify.

    Returns:
        ``"Fake News"`` when the classifier predicts label 1, ``"Real News"``
        for any other prediction, or ``"No text provided."`` when nothing is
        left after preprocessing (e.g. empty or punctuation-only input).
    """
    cleaned = preprocess(text)
    # An empty string carries no content to judge, so report that explicitly
    # instead of classifying the embedding of "".
    if not cleaned:
        return "No text provided."
    embedding = model.encode([cleaned])
    prediction = clf.predict(embedding)[0]
    return "Fake News" if prediction == 1 else "Real News"

# Predict from image

In [13]:
def extract_text_from_image(image_path):
    """Extract text from an image file with Tesseract OCR.

    The image is loaded with OpenCV, converted to grayscale and passed to
    ``pytesseract.image_to_string``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with surrounding whitespace stripped (an empty
        string when nothing was recognised).

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an obscure error inside cvtColor.
    if img is None:
        raise ValueError(f"Could not decode image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray).strip()

def predict_fake_news_from_image(image_path):
    """Classify the text found in an image.

    Runs ``extract_text_from_image`` on ``image_path`` and hands the result to
    ``predict_fake_news``. When OCR yields no text, the message
    ``"No text detected in image."`` is returned instead of a prediction.
    """
    ocr_text = extract_text_from_image(image_path)
    if ocr_text:
        return predict_fake_news(ocr_text)
    return "No text detected in image."

# Predict from audio

In [14]:
# Whisper models already loaded in this session, keyed by model size, so that
# repeated predictions do not reload the weights on every call.
_WHISPER_MODELS = {}


def _get_whisper_model(model_size):
    """Return the cached WhisperModel for ``model_size``, loading it on first use."""
    if model_size not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_size] = WhisperModel(model_size)
    return _WHISPER_MODELS[model_size]


def predict_fake_news_from_audio(audio_path, model_size="base"):
    """Transcribe an audio file and classify the transcript.

    Args:
        audio_path: Path of the audio file handed to Whisper.
        model_size: Whisper model name passed to ``WhisperModel``; defaults to
            ``"base"``.

    Returns:
        The result of ``predict_fake_news`` on the transcript, or
        ``"No speech detected in audio."`` when the transcript is blank.
    """
    segments, _ = _get_whisper_model(model_size).transcribe(audio_path)
    transcribed_text = " ".join(segment.text for segment in segments)
    if not transcribed_text.strip():
        return "No speech detected in audio."
    return predict_fake_news(transcribed_text)

# Predict from YouTube

In [15]:
import yt_dlp
import os

def youtube_to_mp3_final(youtube_url, output_path="downloaded_audio.mp3"):
    """
    Download and convert YouTube video to MP3 using yt_dlp's FFmpegExtractAudio postprocessor.

    Args:
        youtube_url: URL of the video to download.
        output_path: Desired location of the MP3. Its extension is replaced
            with ``.mp3`` because that is the file the postprocessor is asked
            to produce; a path without an extension gets ``.mp3`` appended.

    Returns:
        The path of the MP3 file, or ``None`` when the download or conversion
        failed (the error is printed).
    """
    # Derive both the yt-dlp template and the final file name from the same
    # extension-less base so the returned path matches the requested MP3 name.
    base_path, _ = os.path.splitext(output_path)
    mp3_path = base_path + ".mp3"

    ydl_opts = {
        'format': 'bestaudio/best',
        # '%' introduces a template field in yt-dlp, so literal ones in the
        # path are doubled before the '%(ext)s' placeholder is appended.
        'outtmpl': base_path.replace('%', '%%') + '.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # Optional if ffmpeg is not in PATH:
        # 'ffmpeg_location': 'C:/ffmpeg/bin'
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        print("❌ Error while processing YouTube audio:", e)
        return None

    # Do not report success unless the converted file is really there.
    if not os.path.exists(mp3_path):
        print("❌ Error while processing YouTube audio:",
              f"expected output file not found: {mp3_path}")
        return None

    print(f"✔ Audio successfully downloaded and converted to MP3 at: {mp3_path}")
    return mp3_path


def predict_fake_news_from_youtube(youtube_url):
    """Classify the spoken content of a YouTube video.

    The audio is fetched with ``youtube_to_mp3_final`` and passed to
    ``predict_fake_news_from_audio``. If no audio path comes back, or the
    file is not on disk, ``"Failed to process YouTube video."`` is returned.
    """
    audio_path = youtube_to_mp3_final(youtube_url)
    download_ok = bool(audio_path) and os.path.exists(audio_path)
    if not download_ok:
        return "Failed to process YouTube video."
    return predict_fake_news_from_audio(audio_path)


# === Example Predictions ===

In [17]:
# Sample inputs for the four prediction entry points.
example_text = "Trump says Russia probe will be fair, but timeline unclear: NYT"
example_image = "news_image.jpg"  # Replace with an actual image
example_audio = "Audio.mp3"  # Replace with actual audio
youtube_url = "https://youtube.com/shorts/8-CPIrXxYXk?si=JAYzO-0NfNMVVC6G"  # Replace with a valid URL

# (heading, predictor, input) triples, run in order: text, image, audio, YouTube.
demo_cases = [
    ("\nText Prediction:", predict_fake_news, example_text),
    ("\nImage Prediction:", predict_fake_news_from_image, example_image),
    ("\nAudio Prediction:", predict_fake_news_from_audio, example_audio),
    ("\nYouTube Video Prediction:", predict_fake_news_from_youtube, youtube_url),
]
for heading, predictor, sample in demo_cases:
    print(heading, predictor(sample))



Text Prediction: Real News

Image Prediction: Fake News

Audio Prediction: Real News
[youtube] Extracting URL: https://youtube.com/shorts/8-CPIrXxYXk?si=JAYzO-0NfNMVVC6G
[youtube] 8-CPIrXxYXk: Downloading webpage
[youtube] 8-CPIrXxYXk: Downloading tv client config
[youtube] 8-CPIrXxYXk: Downloading player 7d1d50a6
[youtube] 8-CPIrXxYXk: Downloading tv player API JSON
[youtube] 8-CPIrXxYXk: Downloading ios player API JSON
[youtube] 8-CPIrXxYXk: Downloading m3u8 information
[info] 8-CPIrXxYXk: Downloading 1 format(s): 251
[download] Destination: downloaded_audio.webm
[download] 100% of  908.79KiB in 00:00:00 at 1.91MiB/s   
[ExtractAudio] Destination: downloaded_audio.mp3
Deleting original file downloaded_audio.webm (pass -k to keep)
✔ Audio successfully downloaded and converted to MP3 at: downloaded_audio.mp3

YouTube Video Prediction: Real News
