In [None]:
#pip install sentence-transformers scikit-learn pandas numpy

In [1]:
#pip install opencv-python pytesseract sentence-transformers scikit-learn pandas numpy faster-whisper flask-cors

In [2]:
#pip install opencv-python pytesseract sentence-transformers scikit-learn pandas numpy faster-whisper yt-dlp flask-cors pydub


In [3]:
import cv2
import pytesseract
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


Load the dataset. After preprocessing, `X` holds the embeddings of the news texts and `y` holds the labels (0 = Real, 1 = Fake).

In [4]:
# Read both corpora; every line of a file is treated as one article.
# NOTE(review): the .csv extension suggests tabular data - if these files carry
# a header row or quoted multi-line fields, those lines end up as "articles"
# too. Confirm the file layout.
def _read_lines(path):
    """Return all lines of a UTF-8 text file, line endings included."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


fake_news = _read_lines("Fake.csv")
real_news = _read_lines("True.csv")


In [5]:
# Pre-trained sentence-embedding model; its encode() output is what the
# classifier is trained and queried on.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
# Build one labelled frame per class, then stack them into a single corpus
df_fake = pd.DataFrame(dict(text=fake_news, label=1))  # label 1 -> fake news
df_real = pd.DataFrame(dict(text=real_news, label=0))  # label 0 -> real news

df = pd.concat((df_fake, df_real), ignore_index=True)

In [7]:
# Clean and encode text
df = df.dropna(subset=["text"])  # Remove NaN values
# .copy() detaches the filtered frame from its parent, so adding the
# "embeddings" column below cannot trigger pandas' SettingWithCopyWarning.
df = df[df["text"].str.strip() != ""].copy()  # Remove empty / whitespace-only strings

# Encode all texts in one batched call instead of one model.encode() call per
# row, which is far slower on a large corpus. For a list input encode() returns
# a 2-D array (one row per text); list() splits it into one 1-D vector per row
# so each cell of the column holds that row's embedding.
text_embeddings = model.encode(df["text"].tolist(), show_progress_bar=True)
df["embeddings"] = list(text_embeddings)

In [8]:
# Stack the per-row embeddings into a feature matrix and pull out the labels
X = np.vstack(df["embeddings"].to_numpy())
y = df["label"].to_numpy()

In [9]:
#df = df.dropna(subset=["text"])  # Remove NaN values
#df = df[df["text"].str.strip() != ""]  # Remove empty strings

In [10]:
#df["embeddings"] = model.encode(df["text"].tolist())


In [11]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fit a 100-tree random forest on the training split (seeded for repeatability)
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

In [13]:
# Score the held-out split: predict its labels, then compare with the true ones
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.93


In [14]:
# Classify a single piece of text with the trained model
def predict_fake_news(text):
    """Return "Fake News" or "Real News" for the given text.

    The text is embedded with the notebook-level ``model`` and the embedding is
    passed to the notebook-level ``clf``. A predicted label of 1 maps to
    "Fake News"; any other label maps to "Real News".
    """
    features = [model.encode(text)]  # predict() is given a one-sample batch
    label = clf.predict(features)[0]
    if label == 1:
        return "Fake News"
    return "Real News"

# Smoke test: classify one example headline
example_text = "Trump says Russia probe will be fair, but timeline unclear: NYT"
example_verdict = predict_fake_news(example_text)
print(example_verdict)


Real News


In [15]:
# Function to extract text from an image
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by pytesseract with leading and trailing
        whitespace removed (may be empty).

    Raises:
        FileNotFoundError: If OpenCV could not load an image from
            ``image_path`` (missing file, unreadable file or unsupported format).
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the error would surface later as an opaque cvtColor failure.
    if img is None:
        raise FileNotFoundError(f"Could not read image file: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
    text = pytesseract.image_to_string(gray)  # Extract text
    return text.strip()

# OCR an image and classify whatever text was found in it
def predict_fake_news_from_image(image_path):
    """Classify the text extracted from an image file.

    Returns the result of ``predict_fake_news`` on the extracted text, or the
    message "No text detected in image." when the extraction yields nothing.
    """
    extracted = extract_text_from_image(image_path)
    return predict_fake_news(extracted) if extracted else "No text detected in image."

In [16]:
from faster_whisper import WhisperModel

# Loaded Whisper models keyed by size, so each size is initialised only once
# instead of on every prediction call.
_WHISPER_MODELS = {}


def _get_whisper_model(model_size):
    """Return the cached WhisperModel for ``model_size``, loading it on first use."""
    if model_size not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_size] = WhisperModel(model_size)
    return _WHISPER_MODELS[model_size]


# Function to transcribe audio and predict fake news
def predict_fake_news_from_audio(audio_path, model_size="base"):
    """Transcribe an audio file and classify the transcript.

    Args:
        audio_path: Path of the audio file handed to the Whisper model.
        model_size: Name passed to ``WhisperModel``; defaults to "base".

    Returns:
        The result of ``predict_fake_news`` on the transcript, or
        "No speech detected in audio." when the transcript is empty or
        whitespace-only.
    """
    whisper_model = _get_whisper_model(model_size)
    segments, _ = whisper_model.transcribe(audio_path)
    transcribed_text = " ".join(segment.text for segment in segments)
    if not transcribed_text.strip():
        return "No speech detected in audio."
    return predict_fake_news(transcribed_text)

In [17]:
#pip install opencv-python pytesseract sentence-transformers scikit-learn pandas numpy faster-whisper yt-dlp pydub ffmpeg-python


In [28]:
import yt_dlp
import os

def youtube_to_mp3_final(youtube_url, output_path="downloaded_audio.mp3"):
    """
    Download and convert YouTube video to MP3 using yt_dlp's FFmpegExtractAudio postprocessor.

    Args:
        youtube_url: URL handed to yt_dlp for download.
        output_path: Desired location of the audio file. Its extension (if
            any) is replaced by ".mp3", since the postprocessor is configured
            to produce MP3.

    Returns:
        The MP3 path (``output_path`` with an ".mp3" extension) when yt_dlp
        finished without raising, or None if an exception occurred. The
        existence of the file is not checked here.
    """
    # Split off the extension with os.path.splitext instead of str.replace:
    # replace() would also rewrite ".mp3" inside directory names, and a path
    # without an ".mp3" suffix would end up with no "%(ext)s" placeholder.
    base_path, _ = os.path.splitext(output_path)
    mp3_path = base_path + ".mp3"

    ydl_opts = {
        'format': 'bestaudio/best',
        # "%" starts a field in yt_dlp output templates, so literal percent
        # signs in the caller's path are doubled to escape them.
        'outtmpl': base_path.replace('%', '%%') + '.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # Optional if ffmpeg is not in PATH:
        # 'ffmpeg_location': 'C:/ffmpeg/bin'
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])
        print(f"✔ Audio successfully downloaded and converted to MP3 at: {mp3_path}")
        return mp3_path
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None
        print("❌ Error while processing YouTube audio:", e)
        return None


def predict_fake_news_from_youtube(youtube_url):
    """Download a YouTube video's audio and classify its transcript.

    Returns "Failed to process YouTube video." when the download helper gives
    back no path or the reported file does not exist; otherwise returns the
    result of ``predict_fake_news_from_audio`` for the downloaded file.
    """
    downloaded = youtube_to_mp3_final(youtube_url)
    if not downloaded or not os.path.exists(downloaded):
        return "Failed to process YouTube video."
    return predict_fake_news_from_audio(downloaded)


In [19]:
#pip install whisper

In [20]:
#pip install -U openai-whisper ffmpeg-python torch

In [21]:
#pip install -U faster-whisper

In [29]:
import pickle

# Persist the fitted classifier to disk as a pickle file
with open("fake_news_model.pkl", "wb") as pkl_out:
    pickle.dump(clf, pkl_out)

In [None]:
#pip install flask-cors


Note: you may need to restart the kernel to use updated packages.


In [30]:
# Demo: push one sample through each supported input type
example_text = "Trump says Russia probe will be fair, but timeline unclear: NYT"
example_image = "news_image.jpg"  # Replace with an actual image file
example_audio = "Audio.mp3"  # Replace with an actual audio file
youtube_url = "https://www.youtube.com/shorts/n7RDui1hOdY"  # Replace with a real YouTube link

# (caption, predictor, input) triples, run in the order they should be printed
demo_runs = (
    ("Text Prediction:", predict_fake_news, example_text),
    ("Image Prediction:", predict_fake_news_from_image, example_image),
    ("Audio Prediction:", predict_fake_news_from_audio, example_audio),
    ("YouTube Video Prediction:", predict_fake_news_from_youtube, youtube_url),
)
for caption, predictor, sample in demo_runs:
    print(caption, predictor(sample))

Text Prediction: Real News
Image Prediction: Fake News
Audio Prediction: Real News
[youtube] Extracting URL: https://www.youtube.com/shorts/n7RDui1hOdY
[youtube] n7RDui1hOdY: Downloading webpage
[youtube] n7RDui1hOdY: Downloading tv client config
[youtube] n7RDui1hOdY: Downloading player e011b4d7
[youtube] n7RDui1hOdY: Downloading tv player API JSON
[youtube] n7RDui1hOdY: Downloading ios player API JSON
[youtube] n7RDui1hOdY: Downloading m3u8 information
[info] n7RDui1hOdY: Downloading 1 format(s): 251
[download] Destination: downloaded_audio.webm
[download] 100% of    1.00MiB in 00:00:00 at 3.13MiB/s   
[ExtractAudio] Destination: downloaded_audio.mp3
Deleting original file downloaded_audio.webm (pass -k to keep)
✔ Audio successfully downloaded and converted to MP3 at: downloaded_audio.mp3
YouTube Video Prediction: Real News


In [None]:
#pip install yt-dlp


Note: you may need to restart the kernel to use updated packages.


In [None]:
#pip install --upgrade yt-dlp


Note: you may need to restart the kernel to use updated packages.


In [None]:
#pip install yt-dlp flask flask-cors


Note: you may need to restart the kernel to use updated packages.
