## Environment Setup

In [1]:
import re
import os
import time
import json
import random
import requests
import pandas as pd
import local_settings as S
from datetime import datetime, timedelta
from newspaper import Article, Config
from bs4 import BeautifulSoup



In [2]:
# CONSTANTS
# Credentials for the Google Custom Search JSON API. The key and the engine id
# are read from the same position of their respective lists in `local_settings`.
_GCSJ_SLOT = 4
GCSJ_API_KEY = S.GCSJ_API_KEYS[_GCSJ_SLOT]
GCSJ_ENGINE_ID = S.GCSJ_ENGINE_IDS[_GCSJ_SLOT]

## Data Locator

In [3]:
def fetch_google_results(query, pages=5, timeout=10):
    """Query the Google Custom Search JSON API and collect the result items.

    Consecutive pages of 10 results are requested (start offsets 1, 11, 21, ...)
    and the ``items`` arrays of the responses are concatenated.

    Args:
        query: Search string sent as the ``q`` parameter.
        pages: Maximum number of pages to request. The default of 5 yields the
            start offsets 1, 11, 21, 31 and 41.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        list: The raw ``items`` dicts of every fetched page, in request order.

    Raises:
        requests.HTTPError: If a page answers with an error status
            (raised by ``raise_for_status``).
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    page_size = 10
    results = []
    for page in range(pages):
        params = {
            "key": GCSJ_API_KEY,
            "cx": GCSJ_ENGINE_ID,
            "start": 1 + page * page_size,
            "lr": 'lang_en',
            "gl": 'us',
            "num": page_size,
            "q": query,
        }
        # Without a timeout a stalled connection would block the notebook forever.
        res = requests.get(search_url, params=params, timeout=timeout)
        res.raise_for_status()
        items = res.json().get("items", [])
        if not items:
            # An empty page means there is nothing further to fetch; asking for
            # the remaining offsets would only consume API quota.
            break
        results.extend(items)
    return results

In [4]:
def get_historical_news(count=20):
    """Search for election news on the days following the last completed date.

    The last non-blank line of ``./meta/completed_dates.csv`` is taken as the
    most recently processed day. For each of the next ``count`` days one
    date-restricted search is run, the day is appended to that file, and one
    record per hit is collected. The collected records are appended to
    ``./meta/search_results.csv`` (pipe-separated, no header, no index).

    If a search fails part-way, the records gathered so far are still written
    before the exception propagates, so the two files stay consistent: every
    day listed as completed has its hits on disk.

    Args:
        count: Number of consecutive days to process.

    Returns:
        list[dict]: Records with the keys ``target`` (searched day,
        ``YYYY-MM-DD``), ``time`` (median ISO timestamp found anywhere in the
        hit, or ``None``) and ``link``.

    Raises:
        IndexError: If the completed-dates file contains no date.
        ValueError: If its last non-blank line is not ``YYYY-MM-DD``.
    """
    forbidden = ['youtube.com', 'twitter.com', 'facebook.com', 'instagram.com', 'reddit.com', ]
    exclusions = ' '.join(f'-site:{site}' for site in forbidden)
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}')
    results = []

    try:
        with open("./meta/completed_dates.csv", "r+") as f:
            content = f.read()
            completed = [line.strip() for line in content.splitlines() if line.strip()]
            from_date = datetime.strptime(completed[-1], '%Y-%m-%d')
            if content and not content.endswith('\n'):
                # Otherwise the first appended date would be glued to the last line.
                f.write('\n')

            for ctr in range(1, count + 1):
                dt = from_date + timedelta(days=ctr)
                today = dt.strftime('%Y-%m-%d')
                tomorrow = (dt + timedelta(days=1)).strftime('%Y-%m-%d')
                query = f'US presidential election 2024 "news" {exclusions} after:{today} before:{tomorrow}'
                items = fetch_google_results(query)
                # Only reached when the search succeeded, so a failed day is retried next run.
                f.write(today + '\n')

                for item in items:
                    # The pattern also matches impossible dates such as
                    # 0000-00-00T00:00:00; those are skipped instead of aborting the run.
                    stamps = []
                    for candidate in date_pattern.findall(json.dumps(item)):
                        try:
                            stamps.append(datetime.fromisoformat(candidate))
                        except ValueError:
                            continue
                    stamps.sort()
                    # Median of the timestamps found; falls back to any `time` the hit already carries.
                    when = stamps[len(stamps) // 2].isoformat() if stamps else item.get('time', None)
                    results.append({
                        'target': today,
                        'time': when,
                        'link': item.get('link', None),
                    })
    finally:
        if results:
            results_df = pd.DataFrame(results, columns=['target', 'time', 'link'])
            results_df.to_csv("./meta/search_results.csv", sep='|', index=False, mode='a', header=False)
    return results

In [5]:
%%script false --no-raise-error
# Cell is disabled by the `%%script false` magic on its first line; delete that line to run the search.
get_historical_news()

In [6]:
def get_todays_news():
    """Search for today's hits and overwrite ``./meta/search_results.csv`` with them.

    The file is rewritten from scratch with a header row and the two columns
    ``time`` and ``link`` (pipe-separated, no index). This differs from
    ``get_historical_news``, which appends header-less ``target|time|link`` rows
    to the same path.

    Returns:
        list[dict]: One record per hit with the keys ``time`` (latest ISO
        timestamp found anywhere in the hit, or ``None``) and ``link``.
        An empty list when the search returns nothing.
    """
    dt = datetime.now()
    today = dt.strftime('%Y-%m-%d')
    tomorrow = (dt + timedelta(days=1)).strftime('%Y-%m-%d')
    # NOTE(review): this query targets Indian stock-market news although the rest
    # of the notebook collects US election coverage; confirm that is intended.
    query = f'stock market summary today india "nifty" "sensex" -site:youtube.com after:{today} before:{tomorrow}'
    items = fetch_google_results(query)
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}')
    results = []

    for item in items:
        # The pattern also matches impossible dates (e.g. 0000-00-00T00:00:00);
        # skip those rather than letting one bad value abort the whole run.
        stamps = []
        for candidate in date_pattern.findall(json.dumps(item)):
            try:
                stamps.append(datetime.fromisoformat(candidate))
            except ValueError:
                continue
        when = max(stamps).isoformat() if stamps else item.get('time', None)
        results.append({
            'time': when,
            'link': item.get('link', None),
        })

    # Explicit columns keep the `time` column present even when there are no hits;
    # without them the conversion below raises KeyError on an empty frame.
    results_df = pd.DataFrame(results, columns=['time', 'link'])
    results_df['time'] = pd.to_datetime(results_df['time'])
    results_df.to_csv("./meta/search_results.csv", sep='|', index=False)
    return results

In [7]:
%%script false --no-raise-error
# Cell is disabled by the `%%script false` magic on its first line; delete that line to run the search.
get_todays_news()

## Data Scraper

In [None]:
# Pool of browser user-agent strings; one is picked at random per scraper config.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two agents into one invalid value.
USER_AGENTS = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/110.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
)


def get_config():
    """Build a fresh newspaper ``Config`` for one article download.

    Sets ``request_timeout`` to 10, enables ``follow_meta_refresh``, disables
    ``memoize_articles`` and ``fetch_images``, and assigns a user agent drawn
    at random from ``USER_AGENTS``.

    Returns:
        Config: A newly created, fully populated configuration object.
    """
    config = Config()
    config.request_timeout = 10
    config.follow_meta_refresh = True
    config.memoize_articles = False
    config.fetch_images = False
    config.browser_user_agent = random.choice(USER_AGENTS)
    return config

In [9]:
class CustomArticle(Article):
    """``Article`` subclass with fallbacks for authors, body text and publish time."""

    def build(self):
        """Run the inherited ``build()`` and fill in whatever it left empty.

        * Authors: when none were found, the text of every ``<a>`` whose
          ``href`` contains ``author`` is used. Blank entries and repeats are
          dropped, keeping first-seen order.
        * Text: when empty, the text of all ``<p>``, then all ``<ul>``, then
          all ``<div>`` elements is joined with newlines. Nested ``<div>``s
          each contribute their full text, so passages can appear repeatedly.
        """
        super().build()
        # `html` may be unset if nothing was downloaded; parse an empty document then.
        soup = BeautifulSoup(self.html or '', 'html.parser')

        if not self.authors:
            names = (anchor.get_text().strip() for anchor in soup.select('a[href*=author]'))
            # dict.fromkeys removes duplicates while preserving order.
            self.authors.extend(dict.fromkeys(name for name in names if name))

        if not self.text:
            sections = [
                "\n".join(node.get_text().strip() for node in soup.find_all(tag))
                for tag in ('p', 'ul', 'div')
            ]
            self.text = "\n".join(sections).strip()

    @property
    def datetime(self):
        """Best-effort publication time as an ISO-8601 string, or ``None``.

        First choice is the latest valid ``YYYY-MM-DDTHH:MM:SS`` timestamp found
        anywhere in ``meta_data``. Failing that, the page HTML is scanned for a
        handful of textual date formats and the first one that parses wins.

        Inside this method the name ``datetime`` refers to the module-level
        ``datetime`` class, not to this property.
        """
        date_pattern = r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'
        json_str = json.dumps(self.meta_data, default=str)
        stamps = []
        for candidate in re.findall(date_pattern, json_str):
            # The pattern also matches impossible values such as
            # 0000-00-00T00:00:00; ignore them instead of raising.
            try:
                stamps.append(datetime.fromisoformat(candidate))
            except ValueError:
                continue
        if stamps:
            return max(stamps).isoformat()

        # (strptime format, regex locating a candidate substring), tried in order.
        date_patterns = [
            ("%B %d, %Y, %H:%M", r"\b\w+ \d{1,2}, \d{4}, \d{2}:\d{2}"),
            ("%b %d, %Y, %H:%M", r"\b\w{3} \d{1,2}, \d{4}, \d{2}:\d{2}"),
            ("%b %d, %Y %H:%M", r"\b\w{3} \d{1,2}, \d{4} \d{2}:\d{2}"),
            ("%d %b %I:%M %p", r"\b\d{1,2} \w{3} \d{1,2}:\d{2} (?:am|pm)"),
            ("%H:%M (IST) %d %b %Y", r"\d{2}:\d{2} \(IST\) \d{1,2} \w{3} \d{4}"),
        ]

        html = self.html or ''
        for fmt, rgx in date_patterns:
            for match in re.finditer(rgx, html, re.IGNORECASE):
                substring = match.group(0)
                try:
                    parsed_date = datetime.strptime(substring, fmt)
                    # Formats without a year parse as 1900; assume the current year.
                    if parsed_date.year < 2000:
                        parsed_date = parsed_date.replace(year=datetime.now().year)
                    return parsed_date.isoformat()
                except ValueError:
                    continue

        return None

In [10]:
def scrape_article(obj):
    """Download and parse one search hit into a JSON-serialisable record.

    Args:
        obj: Mapping-like row (``dict`` or pandas ``Series``) with a ``link``
            key and optionally ``time`` and ``target``.

    Returns:
        dict | None: Record with the keys ``url``, ``source_url``, ``title``,
        ``text`` (title, newline, body), ``metadata``, ``datetime``,
        ``authors`` and ``description``; ``None`` if anything failed, in which
        case the failure is printed.

        ``datetime`` is the row's ``time`` rendered as an ISO string, or the
        article's own ``datetime`` when the row has no usable value
        (missing, ``None``, NaN/NaT or a blank string).
    """
    try:
        # Random pause between requests.
        time.sleep(random.uniform(1, 3))
        article = CustomArticle(obj['link'], config=get_config())
        article.build()

        stamp = obj.get('time')
        blank = isinstance(stamp, str) and not stamp.strip()
        if stamp is None or blank or pd.isna(stamp):
            stamp = article.datetime
        elif hasattr(stamp, 'isoformat'):
            # pandas Timestamps / datetimes are not JSON-serialisable as-is.
            stamp = stamp.isoformat()

        return {
            "url": obj['link'],
            "source_url": article.source_url,
            "title": article.title,
            "text": article.title+"\n"+article.text,
            "metadata": article.meta_data,
            "datetime": stamp,
            "authors": article.authors,
            "description": article.meta_description,
        }
    except Exception as e:
        # Best-effort scraping: report and move on. `.get` keeps the handler
        # itself from raising when the row has no `target` or `link`.
        print(f"[{obj.get('target', '-')}] Failed to scrape {obj.get('link')}: {e}")
        return None

In [11]:
# %%script false --no-raise-error
# Read the pipe-separated search hits, parsing `time` as datetimes, then apply
# fillna('') across the whole frame before previewing the first rows.
search_hits = pd.read_csv(
    './meta/search_results.csv',
    sep='|',
    parse_dates=['time'],
)
results_df = search_hits.fillna('')
results_df.head()

Unnamed: 0,target,time,link
0,2024-01-01,NaT,https://www.reddit.com/r/podcasts/comments/18v...
1,2024-01-01,NaT,https://apnews.com/article/biden-trump-electio...
2,2024-01-01,2024-01-01 17:04:34,https://www.theguardian.com/us-news/2024/jan/0...
3,2024-01-01,2024-01-01 18:10:41,https://www.france24.com/en/europe/20240101-a-...
4,2024-01-01,2024-01-01 19:03:48,https://www.theguardian.com/us-news/2024/jan/0...


In [12]:
# %%script false --no-raise-error
# Scrape every search hit and dump the successful records to a timestamped JSON file.
data = []
try:
    for row_id, row in results_df.iterrows():
        scraped = scrape_article(row)
        if scraped is not None:
            data.append(scraped)
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so a long scraping session is never lost with nothing written to disk.
    os.makedirs('./data', exist_ok=True)
    with open(f'./data/rawdata_{datetime.now().timestamp()}.json', 'w') as f:
        # default=str is a fallback for values json cannot encode natively
        # (for example pandas Timestamps) so that the dump cannot fail half-way.
        json.dump(data, f, indent=4, sort_keys=True, default=str)

[2024-01-01] Failed to scrape https://www.france24.com/en/europe/20240101-a-lookahead-for-2024-us-elections-paris-olympics-cop-29-and-more: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.france24.com/en/europe/20240101-a-lookahead-for-2024-us-elections-paris-olympics-cop-29-and-more on URL https://www.france24.com/en/europe/20240101-a-lookahead-for-2024-us-elections-paris-olympics-cop-29-and-more
[2024-01-01] Failed to scrape https://www.politico.com/news/2024/01/01/what-to-watch-global-elections-2024-00133027: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.politico.com/news/2024/01/01/what-to-watch-global-elections-2024-00133027 on URL https://www.politico.com/news/2024/01/01/what-to-watch-global-elections-2024-00133027
[2024-01-01] Failed to scrape https://pbswisconsin.org/news-item/wisconsin-braces-for-another-consequential-election-cycle-in-2024/: Article `download()` failed with 403 Client Error: Forbidden for u

KeyboardInterrupt: 