In [1]:
import requests
import io
import pandas as pd
import sqlite3
import time
import json
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.io as pio
from datetime import datetime

# --- Database setup ---------------------------------------------------------
# SQLite file that holds the scraped announcements.
DB_NAME = "icann_announcements.db"

# Table definition. `link` is declared UNIQUE, so inserting the same link
# twice violates the constraint rather than creating a duplicate row.
CREATE_ANNOUNCEMENTS_SQL = '''CREATE TABLE IF NOT EXISTS announcements
             (id INTEGER PRIMARY KEY AUTOINCREMENT,
              title TEXT,
              date TEXT,
              link TEXT UNIQUE,
              content TEXT)'''

# Module-level connection and cursor; the table is created on first run only
# (IF NOT EXISTS) and the change is committed immediately.
conn = sqlite3.connect(DB_NAME)
c = conn.cursor()
c.execute(CREATE_ANNOUNCEMENTS_SQL)
conn.commit()

In [2]:
# Listing page URL; a `?page=N` query string is appended by the scraping loop.
BASE_URL = "https://www.icann.org/en/announcements"

# Prefix joined with an announcement's slug to form its detail-page link.
DETAILS_URL_BASE = "https://www.icann.org/en/announcements/details/"

# Browser-style User-Agent string sent with each request.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
HEADERS = {"User-Agent": USER_AGENT}

def extract_data_from_html(html_content, page_num):
    """Extract announcement records from the state JSON embedded in a page.

    The listing page carries its data as JSON inside
    ``<script id="ng-state">``. The first top-level key that starts with
    ``searchAnnouncementsWithSuggestions-`` is used, and the list found at
    ``data -> announcementOperations -> announcements -> results`` under it
    is converted into tuples.

    Args:
        html_content: HTML of the listing page, as passed to BeautifulSoup.
        page_num: Page number; used only in the diagnostic messages.

    Returns:
        A list of ``(title, date_str, link)`` tuples, one for every result
        that has a non-empty string title and slug. An empty list is
        returned, after printing a diagnostic, whenever the script tag, the
        JSON, the target key or the nested structure is missing or malformed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    script_tag = soup.find('script', id='ng-state')

    if not script_tag:
        print(f"No ng-state script found on page {page_num}")
        return []

    # `.string` is None for an empty tag, and json.loads(None) raises
    # TypeError rather than JSONDecodeError, so both are handled here.
    try:
        json_data = json.loads(script_tag.string)
    except (json.JSONDecodeError, TypeError):
        print(f"Failed to parse JSON on page {page_num}")
        return []

    # Find the key containing 'searchAnnouncementsWithSuggestions'.
    # It usually looks like searchAnnouncementsWithSuggestions-{"from":...}
    # The first match is taken; the "from" offset embedded in the key is not
    # checked against the requested page.
    target_key = None
    if isinstance(json_data, dict):
        target_key = next(
            (key for key in json_data
             if key.startswith("searchAnnouncementsWithSuggestions-")),
            None,
        )

    if target_key is None:
        print(f"Target JSON key not found on page {page_num}")
        return []

    try:
        results_container = json_data[target_key]['data']['announcementOperations']['announcements']['results']
    except KeyError as e:
        print(f"Key error parsing JSON on page {page_num}: {e}")
        return []
    except TypeError as e:
        # An intermediate level was null or not an object.
        print(f"Unexpected JSON structure on page {page_num}: {e}")
        return []

    if not isinstance(results_container, list):
        print(f"Unexpected JSON structure on page {page_num}: 'results' is not a list")
        return []

    articles = []
    for item in results_container:
        if not isinstance(item, dict):
            continue

        title = item.get('pageTitle')
        slug = item.get('slug')
        # A key that is present with a null (or otherwise non-string) value
        # would break .strip() / string concatenation, so skip such entries.
        if not isinstance(title, str) or not isinstance(slug, str):
            continue

        title = title.strip()
        date_str = item.get('pageDate', '')

        if title and slug:
            articles.append((title, date_str, DETAILS_URL_BASE + slug))
    return articles


In [3]:
MAX_PAGES = 211  # As requested
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block indefinitely
REQUEST_DELAY = 1  # seconds to wait between page requests (be polite)

# Use a fresh connection for this run. This cell closes its connection when
# it finishes, so reusing the one opened during setup made a second execution
# of the cell fail on every insert. close() on an already-closed connection
# is a no-op, so this is safe on both the first run and on re-runs.
conn.close()
conn = sqlite3.connect(DB_NAME)
c = conn.cursor()

try:
    for i in range(1, MAX_PAGES + 1):
        url = f"{BASE_URL}?page={i}"
        print(f"Scraping page {i}...")

        try:
            r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()

            articles = extract_data_from_html(r.content, i)

            if not articles:
                print(f"No articles found on page {i}. Stopping.")
                break

            new_count = 0
            for title, date, link in articles:
                try:
                    # INSERT OR IGNORE skips rows whose link is already
                    # stored; rowcount tells us whether a row was added.
                    c.execute("INSERT OR IGNORE INTO announcements (title, date, link) VALUES (?, ?, ?)",
                              (title, date, link))
                    if c.rowcount > 0:
                        new_count += 1
                except sqlite3.Error as e:
                    print(f"Database error: {e}")

            conn.commit()
            print(f"  Added {new_count} new articles.")
            # A page consisting only of duplicates does not stop the loop:
            # scraping continues up to MAX_PAGES.

        except requests.RequestException as e:
            print(f"Request failed for page {i}: {e}")
        except Exception as e:
            # Best effort: a page that cannot be processed is reported and
            # skipped so that the remaining pages are still attempted.
            print(f"Error while processing page {i}: {e}")

        # Pause after every page, including failed ones, so that errors do
        # not turn into back-to-back requests.
        time.sleep(REQUEST_DELAY)
finally:
    # Release the database handle even if the loop is interrupted.
    conn.close()

print("Scraping complete.")

Scraping page 1...
  Added 20 new articles.
Scraping page 2...
  Added 20 new articles.
Scraping page 3...
  Added 20 new articles.
Scraping page 4...
  Added 20 new articles.
Scraping page 5...
  Added 20 new articles.
Scraping page 6...
  Added 20 new articles.
Scraping page 7...
  Added 20 new articles.
Scraping page 8...
  Added 20 new articles.
Scraping page 9...
  Added 20 new articles.
Scraping page 10...
  Added 20 new articles.
Scraping page 11...
  Added 20 new articles.
Scraping page 12...
  Added 20 new articles.
Scraping page 13...
  Added 20 new articles.
Scraping page 14...
  Added 20 new articles.
Scraping page 15...
  Added 20 new articles.
Scraping page 16...
  Added 20 new articles.
Scraping page 17...
  Added 20 new articles.
Scraping page 18...
  Added 20 new articles.
Scraping page 19...
  Added 20 new articles.
Scraping page 20...
  Added 20 new articles.
Scraping page 21...
  Added 20 new articles.
Scraping page 22...
  Added 20 new articles.
Scraping page 23...

In [4]:
# Verification & Visualization
conn = sqlite3.connect(DB_NAME)
df = pd.read_sql_query("SELECT * FROM announcements", conn)
conn.close()

print(f"Total articles in database: {len(df)}")
print(df.head())

Total articles in database: 4220
   id                                              title        date  \
0   1  ICANN Publishes Fiscal Year 2026 Second Quarte...  2026-02-12   
1   2       ICANN Board Seeks Input on Bylaws Amendments  2026-02-11   
2   3                     ICANN85 Schedule Now Available  2026-02-09   
3   4  ICANN and UNESCO Announce Schedule of Events f...  2026-02-06   
4   5  ICANN Announces Recipients of the ICANN86 Fell...  2026-02-05   

                                                link content  
0  https://www.icann.org/en/announcements/details...    None  
1  https://www.icann.org/en/announcements/details...    None  
2  https://www.icann.org/en/announcements/details...    None  
3  https://www.icann.org/en/announcements/details...    None  
4  https://www.icann.org/en/announcements/details...    None  


In [5]:
# Convert date to datetime
df['date_obj'] = pd.to_datetime(df['date'])
df['year_month'] = df['date_obj'].dt.to_period('M').astype(str)

# Timeline Chart
timeline_counts = df.groupby('year_month').size().reset_index(name='count')
fig = px.bar(timeline_counts, x='year_month', y='count', title='ICANN Announcements Over Time')
fig.show()


In [6]:
# Word Cloud (Treemap approximation)
# Count how often each word appears across all announcement titles and show
# the most frequent ones as a treemap.
from collections import Counter
import re

# Words excluded from the count.
stop_words = {'icann', 'the', 'to', 'of', 'and', 'in', 'for', 'on', 'a', 'with', 'is', 'be', 'as', 'by', 'an', 'at', 'from'}

# Lower-case all titles as one string and split it into word tokens.
all_text = ' '.join(df['title'].tolist())
words = re.findall(r'\w+', all_text.lower())

# Keep tokens longer than two characters that are not stop words.
filtered_words = list(
    filter(lambda token: len(token) > 2 and token not in stop_words, words)
)
word_counts = Counter(filtered_words)

# Top 50 words
top_words = pd.DataFrame(
    data=word_counts.most_common(50),
    columns=['word', 'count'],
)
fig_wc = px.treemap(
    data_frame=top_words,
    path=['word'],
    values='count',
    title='Top Words in ICANN Announcements',
)
fig_wc.show()