In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def safe_request(url, delay=1):
    """Issue a GET request for ``url`` and pause afterwards to rate-limit the caller.

    Args:
        url: Address to fetch.
        delay: Seconds to sleep after the attempt. The pause is applied whether
            the request succeeded or failed, so a run of failures does not
            turn into a burst of back-to-back requests.

    Returns:
        The ``requests.Response`` when the request completed, or ``None`` when
        a ``requests.exceptions.RequestException`` was raised. A response with
        a 4xx/5xx status is still returned; it evaluates as falsy, so callers
        using ``if res:`` skip it.
    """
    response = None
    try:
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")

    # Sleep on both paths: the delay exists to be polite to the remote site,
    # and a failing site is the one that most needs the breathing room.
    time.sleep(delay)

    # Surface HTTP errors instead of letting callers drop them without a trace.
    if response is not None and not response.ok:
        print(f"Request failed: HTTP {response.status_code} for {url}")

    return response

# 1. Alt News
def scrape_altnews(pages=3):
    """Gather headlines from the Alt News fake-news category listing.

    Args:
        pages: How many listing pages to fetch, starting at page 1.

    Returns:
        A list of dicts with the keys ``headline``, ``label`` (always
        ``"fake"``) and ``source`` (always ``"AltNews"``), in page order.
        Pages whose request failed contribute nothing.
    """
    collected = []
    for page_no in range(1, pages + 1):
        response = safe_request(f"https://www.altnews.in/category/fake-news/page/{page_no}/")
        if not response:
            # Failed or non-OK request: move on to the next page.
            continue
        parsed = BeautifulSoup(response.text, "html.parser")
        collected.extend(
            {
                "headline": link.get_text(strip=True),
                "label": "fake",
                "source": "AltNews",
            }
            for link in parsed.select("h3.entry-title a")
        )
    return collected

# 2. Boom Live
def scrape_boomlive(pages=3):
    """Gather headlines from the Boom Live fact-check listing.

    Args:
        pages: How many listing pages to fetch, starting at page 1.

    Returns:
        A list of dicts with the keys ``headline``, ``label`` (always
        ``"fake"``) and ``source`` (always ``"BoomLive"``), in page order.
        Pages whose request failed contribute nothing.
    """
    collected = []
    for page_no in range(1, pages + 1):
        response = safe_request(f"https://www.boomlive.in/fact-check/{page_no}")
        if not response:
            # Failed or non-OK request: move on to the next page.
            continue
        parsed = BeautifulSoup(response.text, "html.parser")
        collected.extend(
            {
                "headline": heading.get_text(strip=True),
                "label": "fake",
                "source": "BoomLive",
            }
            for heading in parsed.select("h4.font-alt.normal.mt-10")
        )
    return collected

# 3. Factly
def scrape_factly(pages=3):
    """Gather headlines from the Factly fake-news category listing.

    Args:
        pages: How many listing pages to fetch, starting at page 1.

    Returns:
        A list of dicts with the keys ``headline``, ``label`` (always
        ``"fake"``) and ``source`` (always ``"Factly"``), in page order.
        Pages whose request failed contribute nothing.
    """
    collected = []
    for page_no in range(1, pages + 1):
        response = safe_request(f"https://factly.in/category/fake-news/page/{page_no}/")
        if not response:
            # Failed or non-OK request: move on to the next page.
            continue
        parsed = BeautifulSoup(response.text, "html.parser")
        collected.extend(
            {
                "headline": link.get_text(strip=True),
                "label": "fake",
                "source": "Factly",
            }
            for link in parsed.select("h2.entry-title a")
        )
    return collected

# 🗃️ Aggregate everything
def collect_fake_news(total_pages=3, output_path="fake_news_india.csv"):
    """Run every site scraper, merge the results and save them as a CSV file.

    Args:
        total_pages: Number of listing pages requested from each site.
        output_path: Destination of the CSV file. Defaults to the original
            hard-coded ``fake_news_india.csv``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``headline``, ``label`` and
        ``source``, holding the Alt News rows first, then Boom Live, then
        Factly.
    """
    scrapers = (
        ("Alt News", scrape_altnews),
        ("Boom Live", scrape_boomlive),
        ("Factly", scrape_factly),
    )

    all_fake = []
    for site_name, scraper in scrapers:
        print(f"Scraping {site_name}...")
        all_fake.extend(scraper(pages=total_pages))

    # Name the columns explicitly: when every scraper comes back empty, a
    # DataFrame built from [] has no columns and the CSV would be written
    # without a header row.
    df = pd.DataFrame(all_fake, columns=["headline", "label", "source"])
    df.to_csv(output_path, index=False)
    print(f"✅ Collected {len(df)} fake news headlines. Saved to '{output_path}'")
    return df

# Run the scraper
if __name__ == "__main__":
    df = collect_fake_news(total_pages=5)
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so cap the preview at the number of rows actually scraped.
    print(df.sample(min(5, len(df))))

Scraping Alt News...
Scraping Boom Live...
Scraping Factly...
✅ Collected 15 fake news headlines. Saved to 'fake_news_india.csv'
                                            headline label    source
1  Video of Iran Missile Strike On Israel Linked ...  fake  BoomLive
0  Video Of Pak Soldiers Viral As Indian Army Wav...  fake  BoomLive
2  Doctored Republic Bharat Clip Viral As Pak Arm...  fake  BoomLive
7  Murshidabad Riots: BJP WB Uses CAA Protest Pho...  fake  BoomLive
8  Video From Maharashtra Viral Claiming Bajrang ...  fake  BoomLive


In [10]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.firefox import GeckoDriverManager
import time
import pandas as pd

def scrape_boomlive_full_scroll(url="https://www.boomlive.in/fact-check", max_wait=20,
                                max_headlines=100, scroll_pause=2):
    """Scroll a Boom Live listing in headless Firefox and collect its headlines.

    The page is scrolled to the bottom repeatedly until one of these holds:
    the number of matching headline elements stayed the same for three
    consecutive scrolls, at least ``max_headlines`` elements are present, or
    the scroll budget is used up.

    Args:
        url: Page to open.
        max_wait: Scroll budget. The loop always scrolls once and then allows
            up to ``max_wait`` further scrolls.
        max_headlines: Stop scrolling once this many headline elements are
            present on the page.
        scroll_pause: Seconds to wait after each scroll for new content.

    Returns:
        A list of dicts with the keys ``headline``, ``label`` (always
        ``"fake"``) and ``source`` (always ``"BoomLive"``). Elements whose
        text is empty after stripping are skipped.
    """
    headlines = []

    options = Options()
    # Pass the headless flag as a browser argument. The `Options.headless`
    # setter was deprecated and then dropped in newer Selenium 4 releases, so
    # assigning to it would no longer make the browser headless.
    options.add_argument("-headless")
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)

    try:
        driver.get(url)
        time.sleep(3)  # initial load

        last_count = 0
        same_count_times = 0
        # Count down on a local copy instead of mutating the parameter.
        scrolls_left = max_wait

        print("🔁 Scrolling to load all headlines...")

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)  # wait for new content

            elements = driver.find_elements(By.CSS_SELECTOR, "h4.font-alt.normal.mt-10")
            curr_count = len(elements)

            if curr_count == last_count:
                same_count_times += 1
            else:
                same_count_times = 0

            if same_count_times >= 3 or curr_count >= max_headlines or scrolls_left <= 0:
                break

            last_count = curr_count
            scrolls_left -= 1

        print(f"✅ Headlines loaded: {curr_count}")

        # Read the text while the driver is still alive; the elements become
        # unusable once the browser is closed.
        for elem in elements:
            text = elem.text.strip()
            if text:
                headlines.append({
                    "headline": text,
                    "label": "fake",
                    "source": "BoomLive"
                })

    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()

    print(f"\n🟢 Total headlines scraped: {len(headlines)}")
    return headlines

# Run it
if __name__ == "__main__":
    data = scrape_boomlive_full_scroll()
    # Name the columns explicitly so the CSV still gets its header row when
    # the scrape returned no headlines at all.
    df = pd.DataFrame(data, columns=["headline", "label", "source"])
    df.to_csv("boomlive_fake_news_fullscroll.csv", index=False)
    print("✅ Saved to boomlive_fake_news_fullscroll.csv")



Python(11305) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11306) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11307) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11308) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11309) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11310) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11312) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11313) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11314) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11315) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(11316) Malloc

🔁 Scrolling to load all headlines...
✅ Headlines loaded: 3

🟢 Total headlines scraped: 3
✅ Saved to boomlive_fake_news_fullscroll.csv


In [1]:
import pandas as pd
from spellchecker import SpellChecker

# Load your dataset
df = pd.read_csv("Cleaned_FactDrill_Dataset.csv")

# Initialize spell checker
spell = SpellChecker(language='en')

# Find misspelled words
misspelled_records = []
# astype(str) guards against non-string cells (e.g. numbers) that would
# otherwise fail on .lower().
for headline in df['headline'].dropna().astype(str):
    # Tokenize with the spell checker's own word splitter rather than
    # str.split(): splitting on whitespace leaves punctuation attached
    # ("israel," / "viral:"), and those tokens get reported as misspellings.
    # NOTE(review): relies on pyspellchecker's default tokenizer extracting
    # word-character runs - confirm against the installed version.
    words = spell.split_words(headline.lower())
    misspelled = spell.unknown(words)
    if misspelled:
        misspelled_records.append({
            "headline": headline,
            # unknown() returns a set; sort it so the saved CSV is
            # reproducible from run to run.
            "misspelled_words": sorted(misspelled)
        })

# Convert to DataFrame and save. The columns are named explicitly so the CSV
# keeps its header row even when no headline had a misspelling.
misspelled_df = pd.DataFrame(misspelled_records, columns=["headline", "misspelled_words"])
misspelled_df.to_csv("misspelled_headlines.csv", index=False)