# In [None]:
# Web scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd
import html
import json
import re

# Fox News section slugs to crawl; each one is substituted into BASE_URL.
CATEGORIES = [
    "politics",
    "world",
    "us",
    "business",
    "health",
    "science",
    "entertainment",
    "tech",
    "sports",
    "opinion",
    "media",
    "education",
]

# Listing-page URL template, filled in with a section slug and a page number.
BASE_URL = "https://www.foxnews.com/category/{category}?page={page}"

# Headers sent with every HTTP request made by the scraper.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def extract_author(soup):
    """Return the raw author string for an article page, or "" if none is found.

    Sources are tried in this order:

    1. JSON-LD ``<script type="application/ld+json">`` blocks, reading the
       ``author`` or ``creator`` field of a top-level object, of the members
       of a top-level list, or of the members of an ``@graph`` list.
    2. ``<meta name="author">`` or ``<meta property="article:author">``.
    3. The first text node that starts with "By ".

    Args:
        soup: Parsed page (a BeautifulSoup document or compatible object).

    Returns:
        The author text as found on the page; no normalisation is applied.
    """
    for tag in soup.find_all("script", type="application/ld+json"):
        if not tag.string:
            continue
        try:
            data = json.loads(tag.string)
        except (ValueError, TypeError):
            # Malformed JSON-LD: skip this block only and keep looking.
            continue

        for item in _author_candidates(data):
            name = _author_name(item.get("author") or item.get("creator"))
            if name:
                return name

    tag = soup.find("meta", {"name": "author"}) or soup.find("meta", {"property": "article:author"})
    if tag and tag.get("content"):
        return tag.get("content")

    byline = soup.find(string=re.compile(r"^\s*By\s+", re.I))
    if byline:
        return byline.strip()

    return ""


def _author_candidates(data):
    """Return the JSON-LD objects (dicts only) that may carry an author field."""
    items = []
    if isinstance(data, dict):
        items.append(data)
        graph = data.get("@graph")
        # "@graph" may be null or a single object; only a list is expanded.
        if isinstance(graph, list):
            items.extend(graph)
    elif isinstance(data, list):
        items.extend(data)
    return [item for item in items if isinstance(item, dict)]


def _author_name(author):
    """Return the first usable name from a JSON-LD author value, or ""."""
    if isinstance(author, list):
        # Several authors: only the first entry is considered.
        author = author[0] if author else None
    if isinstance(author, dict):
        author = author.get("name")
    return author if isinstance(author, str) else ""

def clean_author(name):
    """Normalise a raw byline to a single author name.

    Steps: decode HTML entities, drop a leading "By" marker, drop everything
    from "Fox News" onwards (case-insensitive), then keep only the first name
    of a list separated by ",", " and " or " & ".

    Args:
        name: Raw author text; may be empty or None.

    Returns:
        The cleaned name, or "" when ``name`` is empty or None.
    """
    if not name:
        return ""
    name = html.unescape(name).strip()
    # Strip "By" only when it is a separate word (followed by whitespace, a
    # separator, or the end of the string) so that names which merely start
    # with those letters, e.g. "Byron York", are left intact.
    name = re.sub(r"^\s*By(?:\s*[:\-\u2013\u2014]\s*|\s+|$)", "", name, flags=re.I)
    name = re.sub(r"(?:FOX NEWS.*$)", "", name, flags=re.I)
    return re.split(r",| and | & ", name)[0].strip()

def extract_body_json(soup):
    """Return the ``articleBody`` text found in the page's JSON-LD, or "".

    Every ``<script type="application/ld+json">`` block is inspected. A block
    may hold a single object, a list of objects, or an object whose ``@graph``
    list holds the objects; the first non-empty string ``articleBody`` wins.

    Args:
        soup: Parsed page (a BeautifulSoup document or compatible object).

    Returns:
        The article body string, or "" when no block provides one.
    """
    for tag in soup.find_all("script", type="application/ld+json"):
        if not tag.string:
            continue
        try:
            data = json.loads(tag.string)
        except (ValueError, TypeError):
            # Malformed JSON-LD: skip this block only and keep looking.
            continue

        if isinstance(data, dict):
            candidates = [data]
            graph = data.get("@graph")
            # "@graph" may be null or a single object; only a list is expanded.
            if isinstance(graph, list):
                candidates.extend(graph)
        elif isinstance(data, list):
            candidates = data
        else:
            continue

        for item in candidates:
            if not isinstance(item, dict):
                continue
            body = item.get("articleBody")
            if isinstance(body, str) and body:
                return body
    return ""

def extract_body_html(soup):
    """Return the article text taken from the page's ``div.article-body``.

    The text of every ``<p>`` inside that container is joined with single
    spaces. An empty string is returned when the container is absent.
    """
    container = soup.find("div", class_="article-body")
    if container:
        paragraphs = [
            paragraph.get_text(" ", strip=True)
            for paragraph in container.find_all("p")
        ]
        return " ".join(paragraphs).strip()
    return ""

def scrape_article(url, category):
    """Fetch one article page and extract its fields.

    Args:
        url: Absolute URL of the article.
        category: Section slug stored alongside the article.

    Returns:
        A dict with the keys ``url``, ``title``, ``author``, ``date``,
        ``category`` and ``articleBody``, or None when the request fails, the
        server answers with an HTTP error status, extraction raises, or the
        body is shorter than 200 characters.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        # Error pages (404, 5xx, ...) are not articles; treat them as failures.
        r.raise_for_status()
    except requests.RequestException:
        return None

    try:
        soup = BeautifulSoup(r.text, "html.parser")

        heading = soup.find("h1")
        title = heading.get_text(strip=True) if heading else ""
        time_tag = soup.find("time")
        date = time_tag.get("datetime", "") if time_tag else ""
        author = clean_author(extract_author(soup))

        # Prefer the structured JSON-LD body; fall back to the rendered HTML.
        body = extract_body_json(soup) or extract_body_html(soup)
        body = html.unescape(body)
    except Exception:
        # Scraping is best-effort: one unparsable page must not stop the run.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and SystemExit
        # propagate so the crawl can still be interrupted.
        return None

    # Very short bodies are treated as non-articles and discarded.
    if len(body) < 200:
        return None

    return {
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "category": category,
        "articleBody": body,
    }

def scrape_page(category, page):
    """Return the article URLs linked from one category listing page.

    Only links whose ``href`` starts with ``/<category>/`` are kept; they are
    made absolute, de-duplicated and returned in sorted order so repeated runs
    visit articles in the same sequence.

    Args:
        category: Section slug substituted into ``BASE_URL``.
        page: Page number substituted into ``BASE_URL``.

    Returns:
        A list of absolute article URLs; empty when the request fails, the
        server answers with an HTTP error status, or parsing raises.
    """
    url = BASE_URL.format(category=category, page=page)
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        return []

    prefix = f"/{category}/"
    try:
        soup = BeautifulSoup(r.text, "html.parser")
        links = {
            "https://www.foxnews.com" + a["href"]
            for a in soup.find_all("a", href=True)
            if a["href"].startswith(prefix)
        }
    except Exception:
        # Best-effort: a listing page that cannot be parsed yields no links.
        # KeyboardInterrupt and SystemExit still propagate.
        return []
    return sorted(links)

def scrape_articles_all(categories, pages=20, max_articles=1000,
                        output_path="foxnews_articles.csv"):
    """Crawl the given categories and write the scraped articles to a CSV.

    Listing pages 1..``pages`` of every category are visited in order. Each
    article URL is fetched at most once. The crawl stops as soon as
    ``max_articles`` articles have been collected.

    The CSV is always written to ``output_path``, whether the crawl hits the
    cap, exhausts all pages, or is interrupted part-way, so that the file a
    later cleaning step reads exists with whatever was collected.

    Args:
        categories: Iterable of section slugs.
        pages: Number of listing pages to visit per category.
        max_articles: Stop after this many articles have been collected.
        output_path: Destination CSV file.

    Returns:
        None.
    """
    total = []
    seen_urls = set()
    try:
        for cat in categories:
            for page in range(1, pages + 1):
                for url in scrape_page(cat, page):
                    # The same article is often linked from several listing
                    # pages; fetch and record it only once.
                    if url in seen_urls:
                        continue
                    seen_urls.add(url)

                    data = scrape_article(url, cat)
                    if data:
                        total.append(data)
                    if len(total) >= max_articles:
                        return
    finally:
        # Runs on every exit path, including the early return above.
        pd.DataFrame(total).to_csv(output_path, index=False)

# Run the crawl over every configured category (performs network requests and
# writes the results to a CSV file).
scrape_articles_all(CATEGORIES)


# In [None]:
# Data cleaning
import pandas as pd
import html
import re

# Typographic punctuation mapped to its plain-ASCII equivalent.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",   # left single quote
    "\u2019": "'",   # right single quote / apostrophe
    "\u201c": '"',   # left double quote
    "\u201d": '"',   # right double quote
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
})


def fix_encoding(text):
    """Return ``text`` as clean, whitespace-normalised ASCII.

    HTML entities are decoded, typographic quotes and dashes are replaced by
    their ASCII counterparts, and accented letters are folded to their base
    letter, so words such as "don’t" or "José" survive as "don't" and "Jose"
    instead of being split by a space. Any remaining non-ASCII run becomes a
    single space, and whitespace is collapsed.

    Args:
        text: Any value; missing values (None / NaN) give "". Other
            non-string values are converted with ``str``.

    Returns:
        The cleaned ASCII string.
    """
    import unicodedata  # local import: keeps this cell runnable on its own

    if pd.isna(text):
        return ""
    text = str(text)

    text = html.unescape(text)
    text = text.translate(_ASCII_PUNCTUATION)
    # Compatibility-decompose, then drop combining marks: "é" -> "e".
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    # Whatever is still non-ASCII has no safe equivalent; blank it out.
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Load the raw scrape, clean the free-text columns, and save a cleaned copy.
df = pd.read_csv("foxnews_articles.csv")
df = df.assign(
    **{
        column: df[column].apply(fix_encoding)
        for column in ("title", "author", "articleBody")
    }
)
df.to_csv("foxnews_articless.csv", index=False)
