In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_nation():
    base_url = "https://nation.africa/kenya"
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " 
                  "AppleWebKit/537.36 (KHTML, like Gecko) " 
                  "Chrome/124.0.0.0 Safari/537.36"
}
    response = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    articles = []

    # Step 1: Get all article links + titles from homepage
    for tag in soup.select("a"):
        title = tag.text.strip()
        url = tag.get("href")

        # Clean and validate
        if (not url or not url.startswith("http") or 
            "/videos/" in url or 
            "/video/" in url):
            continue

        if not title or len(title) < 15:
            continue

        # Step 2: Scrape each article
        try:
            article_response = requests.get(url, headers= headers)
            article_soup = BeautifulSoup(article_response.content, "html.parser")

            # Date extraction
            date_tag = article_soup.select_one("time.date")
            date = date_tag.text.strip() if date_tag else ""

            # Content extraction
            paragraphs = article_soup.select("div.text-block.blk-txt div.paragraph-wrapper p")
            content = "\n".join(p.text.strip() for p in paragraphs if p.text.strip())

            # Final check — avoid saving empty articles
            if not content:
                continue

            # Append to list
            articles.append({
                "title": title,
                "url": url,
                "date": date,
                "content": content,
                "label": "real",
                "source": "Nation"
            })

            time.sleep(0.5)

        except Exception as e:
            print(f"Error scraping {url}: {e}")
            continue

    df = pd.DataFrame(articles)
    return df


In [6]:
df = scrape_nation()
df.head()

In [7]:
df.shape

(0, 0)

In [None]:
print(soup.prettify()[:2000])  # show only first 2,000 characters


In [8]:
base_url = "https://nation.africa/kenya"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " 
                  "AppleWebKit/537.36 (KHTML, like Gecko) " 
                  "Chrome/124.0.0.0 Safari/537.36"
}
response = requests.get(base_url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify()[:2000])  # show only first 2,000 characters

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-US">
 <!--<![endif]-->
 <head>
  <title>
   Attention Required! | Cloudflare
  </title>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="noindex, nofollow" name="robots"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <link href="/cdn-cgi/styles/cf.errors.css" id="cf_styles-css" rel="stylesheet"/>
  <!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->
  <style>
   body{margin:0;padding:0}
  </style>
  <!--[if gte IE 10]><!-->
  <script>
   if (!navigator.cookieEnabled) {
    window.addEven