In [1]:
import re
import os
import glob
import time
import json
import random
import requests
import pandas as pd
from datetime import datetime, timezone, timedelta
from newspaper import Article, Config
from bs4 import BeautifulSoup

In [2]:
def no_tz(dt):
    """Strip timezone information from ``dt``.

    A datetime that carries a tzinfo is first converted to UTC and then
    returned without tzinfo; a datetime without tzinfo is returned as-is.
    """
    if not dt.tzinfo:
        return dt
    utc_value = dt.astimezone(timezone.utc)
    return utc_value.replace(tzinfo=None)

In [3]:
# Pool of desktop browser User-Agent strings; get_config() picks one at random.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two agents into one invalid header.
USER_AGENTS = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/110.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
)


def get_config():
    """Create a fresh newspaper ``Config`` for one article download.

    Sets a 10 second request timeout, enables following meta-refresh
    redirects, disables article memoization and image fetching, and assigns
    a User-Agent chosen at random from ``USER_AGENTS``.

    Returns
    -------
    Config
        A newly constructed configuration object (a new one on every call,
        so each call may carry a different User-Agent).
    """
    config = Config()
    config.request_timeout = 10
    config.follow_meta_refresh = True
    config.memoize_articles = False
    config.fetch_images = False
    config.browser_user_agent = random.choice(USER_AGENTS)
    return config

In [4]:
class CustomArticle(Article):
    """``Article`` with fallback extraction for authors, body text and dates.

    ``build()`` fills ``authors`` and ``text`` from the raw HTML when the
    parent class left them empty, and the ``datetime`` property derives a
    best-effort timestamp from the page metadata or, failing that, from
    date-like strings in the HTML.
    """

    def build(self):
        """Run the parent ``build()`` and back-fill missing authors/text.

        Authors fall back to the text of links whose ``href`` contains
        "author" (empty and repeated names are dropped). Text falls back to
        the concatenated text of all ``<p>``, ``<ul>`` and ``<div>`` elements.
        """
        super().build()
        # `or ''` guards against html being None — presumably it is always a
        # string once build() succeeded, but BeautifulSoup(None) would raise.
        soup = BeautifulSoup(self.html or '', 'html.parser')

        if not self.authors:
            link_texts = (a.get_text().strip() for a in soup.select('a[href*=author]'))
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # blank link texts (e.g. image-only links) are skipped.
            self.authors.extend(dict.fromkeys(name for name in link_texts if name))

        if not self.text:
            paragraphs = soup.find_all('p')
            lists = soup.find_all('ul')
            divs = soup.find_all('div')

            para_text = "\n".join([p.get_text().strip() for p in paragraphs])
            list_text = "\n".join([ul.get_text().strip() for ul in lists])
            # NOTE(review): get_text() on a <div> includes the text of every
            # element nested inside it, so nested content is repeated here.
            # Kept as-is to preserve the existing fallback output.
            divs_text = "\n".join([d.get_text().strip() for d in divs])

            self.text = para_text + "\n" + list_text + "\n" + divs_text
            self.text = self.text.strip()

    @property
    def datetime(self):
        """Best-effort article timestamp as an ISO-8601 string, or ``None``.

        Strategy:

        1. Collect every ISO-8601 timestamp found in the JSON dump of
           ``self.meta_data``, normalise each with ``no_tz`` and return the
           median one. Candidates that cannot be parsed are skipped.
        2. Otherwise scan ``self.html`` for a handful of human-readable date
           formats and return the first one that parses.

        Returns
        -------
        str or None
            ``isoformat()`` of the chosen datetime, ``None`` if nothing matched.
        """
        date_pattern = r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?'
        json_str = json.dumps(self.meta_data)

        parsed = []
        for raw in re.findall(date_pattern, json_str):
            # fromisoformat() only accepts a trailing "Z" on Python 3.11+.
            candidate = raw[:-1] + '+00:00' if raw.endswith('Z') else raw
            try:
                parsed.append(no_tz(datetime.fromisoformat(candidate)))
            except (ValueError, OverflowError):
                # Regex matched something that is not a real timestamp
                # (e.g. month 13) or a form this Python cannot parse.
                continue
        if parsed:
            # Sort the normalised datetimes rather than the raw strings so the
            # median is chronological even when the offsets differ.
            parsed.sort()
            return parsed[len(parsed) // 2].isoformat()

        # (strptime format, regex locating a candidate substring)
        date_patterns = [
            ("%B %d, %Y, %H:%M", r"\b\w+ \d{1,2}, \d{4}, \d{2}:\d{2}"),
            ("%b %d, %Y, %H:%M", r"\b\w{3} \d{1,2}, \d{4}, \d{2}:\d{2}"),
            ("%b %d, %Y %H:%M", r"\b\w{3} \d{1,2}, \d{4} \d{2}:\d{2}"),
            ("%d %b %I:%M %p", r"\b\d{1,2} \w{3} \d{1,2}:\d{2} (?:am|pm)"),
            ("%H:%M (IST) %d %b %Y", r"\d{2}:\d{2} \(IST\) \d{1,2} \w{3} \d{4}"),
        ]

        html = self.html or ''
        for fmt, rgx in date_patterns:
            for match in re.finditer(rgx, html, re.IGNORECASE):
                substring = match.group(0)
                try:
                    parsed_date = datetime.strptime(substring, fmt)
                    # Formats without a year parse as 1900; assume the
                    # current year for those.
                    if parsed_date.year < 2000:
                        parsed_date = parsed_date.replace(year=datetime.now().year)
                    return parsed_date.isoformat()
                except ValueError:
                    continue

        return None

In [5]:
def scrape_article(obj):
    """Download and parse one search-result row into a flat record.

    Parameters
    ----------
    obj : mapping-like row (pandas Series or dict)
        Keys read: ``'link'`` (article URL), ``'time'`` and ``'target'``
        (ISO-8601 strings; an empty/falsy value yields ``None`` in the output).

    Returns
    -------
    dict
        On success, a dict with the scraped fields (it always contains the
        key ``'source_url'``).
    same type as ``obj``
        On any failure the input row is returned unchanged so the caller can
        queue it for a retry; the error is printed, not raised.
    """
    try:
        # Look the URL up before sleeping so a malformed row fails immediately.
        link = obj['link']
        # Random 1-2 s pause between requests.
        time.sleep(random.uniform(1, 2))
        article = CustomArticle(link, config=get_config())
        article.build()
        return {
            "url": link,
            "source_url": article.source_url,
            "title": article.title,
            "text": article.title+"\n"+article.text,
            "metadata": article.meta_data,
            "authors": article.authors,
            "description": article.meta_description,
            "date_metadata": article.datetime,
            "date_published": no_tz(article.publish_date).isoformat() if article.publish_date else None,
            "date_google": no_tz(datetime.fromisoformat(obj['time'])).isoformat() if obj['time'] else None,
            "date_target": no_tz(datetime.fromisoformat(obj['target'])).isoformat() if obj['target'] else None,
        }
    except Exception as e:
        # .get() instead of indexing: the handler itself must never raise,
        # even when the failure was a missing 'link' or 'target' key.
        print(f"[{obj.get('target')}] Failed to scrape {obj.get('link')}: {e}")
        return obj

In [None]:
# %%script false --no-raise-error
SAMPLE_SIZE = 10  # maximum number of links scraped from each results file

# The output folders may not exist yet (e.g. on a fresh checkout).
os.makedirs('./data', exist_ok=True)
os.makedirs('./redo', exist_ok=True)

for filename in sorted(glob.glob('../news_links/data/search_results_*.csv')):
    # The "<seconds>.<fraction>" stamp in the file name keys the output files.
    ts_match = re.search(r'_(\d+\.\d+)\.', filename)
    if ts_match is None:
        print(f"Skipping (no timestamp in file name): {filename}")
        continue
    TS = ts_match.group(1)
    # Already scraped on a previous run.
    if os.path.exists(f'./data/newsdata_{TS}.json'):
        continue

    print(f"Processing: {filename}")
    results_df = pd.read_csv(filename, sep='|').fillna('')
    # sample() raises when asked for more rows than exist, so cap the request.
    results_df = results_df.sample(min(SAMPLE_SIZE, len(results_df)))

    # Explicit loop instead of DataFrame.apply(axis=1): apply chooses its
    # result container from the first returned value, so a failed first row
    # (which comes back as a Series) would yield a DataFrame with no .tolist().
    raw = [scrape_article(row) for _, row in results_df.iterrows()]

    # Successful scrapes are dicts carrying 'source_url'; failures are the
    # untouched input rows and get written out for a later retry.
    data = [item for item in raw if 'source_url' in item.keys()]
    redo = [item for item in raw if 'source_url' not in item.keys()]
    with open(f'./data/newsdata_{TS}.json', 'w') as f:
        json.dump(data, f, indent=4, sort_keys=True)
    pd.DataFrame(redo).to_csv(f'./redo/search_results_{TS}.csv', sep='|', index=False)
    # Pause between files.
    time.sleep(10)

Processing: ../news_links/data/search_results_1732512728.458377.csv
