In [None]:
import re
import os
import time
import json
import random
import requests
import pandas as pd
import local_settings as S
from datetime import datetime, timezone, timedelta
from newspaper import Article, Config
from bs4 import BeautifulSoup



In [None]:
# CONSTANTS
# Both credentials are read from the same position of the sequences defined in
# local_settings; keeping the position in one name prevents the key and the
# engine ID from being switched to different slots by accident.
# NOTE(review): presumably entries at the same index belong together - confirm.
GCSJ_CREDENTIAL_INDEX = 4
GCSJ_API_KEY = S.GCSJ_API_KEYS[GCSJ_CREDENTIAL_INDEX]
GCSJ_ENGINE_ID = S.GCSJ_ENGINE_IDS[GCSJ_CREDENTIAL_INDEX]

In [3]:
def fetch_google_results(query, pages=5, timeout=30):
    """Collect Google Custom Search results for ``query`` across several pages.

    Sends one GET request per page to the Custom Search JSON endpoint, asking
    for 10 results at a time (start offsets 1, 11, 21, ...) with
    ``lr='lang_en'`` and ``gl='us'``, and concatenates the ``items`` list of
    every response.

    Args:
        query: Search string sent as the ``q`` parameter.
        pages: Maximum number of pages to request. The default of 5 reproduces
            the original start offsets 1, 11, 21, 31 and 41.
        timeout: Seconds to wait on each HTTP request before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        list: The raw ``items`` entries of all fetched pages, in request order.
        Empty when the first page has no ``items``.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        requests.Timeout: If a page does not respond within ``timeout``.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    page_size = 10
    results = []
    for page in range(pages):
        params = {
            "key": GCSJ_API_KEY,
            "cx": GCSJ_ENGINE_ID,
            "start": 1 + page * page_size,
            "lr": 'lang_en',
            "gl": 'us',
            "num": page_size,
            "q": query,
        }
        res = requests.get(search_url, params=params, timeout=timeout)
        res.raise_for_status()
        page_items = res.json().get("items", [])
        if not page_items:
            # A page without items means there is nothing further to collect;
            # stop here instead of spending more requests on later offsets.
            break
        results.extend(page_items)
    return results

In [None]:
def get_historical_news(count=20):
    """Fetch election-news search results for the next ``count`` unprocessed days.

    Reads the last non-blank line of ``./data/completed_dates.csv`` as the most
    recently completed date (``YYYY-MM-DD``), then for each of the following
    ``count`` days runs a date-restricted search via ``fetch_google_results``,
    appends that day to the completed-dates file, and records one row per
    returned item. All rows are written to a pipe-separated
    ``./data/search_results_<timestamp>.csv``.

    If a search fails part-way through, the rows gathered so far are still
    written to the CSV before the exception propagates, so the days already
    marked as completed do not lose their results.

    Args:
        count: Number of consecutive days to process after the last completed
            date.

    Returns:
        list[dict]: One dict per search item with keys ``target`` (the day the
        query was restricted to), ``time`` (median of the ISO-8601 timestamps
        found anywhere in the item, or ``None``) and ``link``.

    Raises:
        ValueError: If the completed-dates file contains no date, or its last
            non-blank line is not in ``YYYY-MM-DD`` format.
        requests.HTTPError: Propagated from ``fetch_google_results``.
    """
    # Loop-invariant pieces of the query and the timestamp matcher.
    forbidden = ['youtube.com', 'twitter.com', 'facebook.com', 'instagram.com', 'reddit.com', ]
    forbidden = ' '.join([f'-site:{site}' for site in forbidden])
    date_pattern = re.compile(
        r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?'
    )

    results = []
    finished = False
    try:
        with open("./data/completed_dates.csv", "r+") as f:
            lines = f.readlines()
            dated_lines = [line.strip() for line in lines if line.strip()]
            if not dated_lines:
                raise ValueError("./data/completed_dates.csv contains no completed date")
            from_date = datetime.strptime(dated_lines[-1], '%Y-%m-%d')
            if not lines[-1].endswith('\n'):
                # Without this the first appended date would be glued onto the
                # file's last existing line.
                f.write('\n')

            for ctr in range(1, count + 1):
                dt = from_date + timedelta(days=ctr)
                today = dt.strftime('%Y-%m-%d')
                tomorrow = (dt + timedelta(days=1)).strftime('%Y-%m-%d')
                query = f'US presidential election 2024 "news" {forbidden} after:{today} before:{tomorrow}'
                items = fetch_google_results(query)
                # Only reached when the fetch succeeded, so a failed day is
                # retried on the next run.
                f.write(today + '\n')

                for item in items:
                    matches = date_pattern.findall(json.dumps(item))
                    if matches:
                        # Median of the lexicographically sorted timestamps.
                        item_time = sorted(matches)[len(matches) // 2]
                    else:
                        item_time = item.get('time', None)
                    results.append({
                        'target': today,
                        'time': item_time,
                        'link': item.get('link', None),
                    })
        finished = True
    finally:
        # On success always write the CSV (as before); on failure write it only
        # if there is something to salvage.
        if finished or results:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"./data/search_results_{datetime.now().timestamp()}.csv", sep='|', index=False)
    return results

In [5]:
# %%script false --no-raise-error
# Uncommenting the magic line above turns this cell into a no-op. When the cell
# does run, the call below performs live search requests, appends the processed
# days to ./data/completed_dates.csv and writes a new search_results CSV.
get_historical_news()

[{'target': '2024-03-21',
  'time': '2024-09-13T13:26:30',
  'link': 'https://xk.usembassy.gov/our-relationship/uselection2024/'},
 {'target': '2024-03-21',
  'time': '2024-03-22T10:29:03',
  'link': 'https://osce.usmission.gov/invitation-to-observe-november-5-general-elections-in-the-united-states/'},
 {'target': '2024-03-21',
  'time': '2024-03-21T16:57:08',
  'link': 'https://www.pewresearch.org/2024/03/21/emotions-news-and-knowledge-about-the-israel-hamas-war/'},
 {'target': '2024-03-21',
  'time': None,
  'link': 'https://sos.ga.gov/news/georgia-audit-confirms-trump-victory'},
 {'target': '2024-03-21',
  'time': '2024-03-21T16:38:32',
  'link': 'https://osce.usmission.gov/on-the-russian-presidential-elections-and-russias-violations-of-osce-principles-and-commitments/'},
 {'target': '2024-03-21',
  'time': '2024-03-20T17:16:45',
  'link': 'https://www.washingtonpost.com/opinions/2024/03/21/mexico-immigration-pressure-biden-elections/'},
 {'target': '2024-03-21',
  'time': '2024-03-

In [None]:
def get_todays_news():
    """Fetch today's Indian stock-market summary search results.

    Builds a query restricted to the current local date (``after:`` today,
    ``before:`` tomorrow), runs it through ``fetch_google_results`` and
    records, for every returned item, its link and the latest
    ``YYYY-MM-DDTHH:MM:SS`` timestamp found anywhere in the item. The rows are
    written to a pipe-separated ``./data/search_results_<timestamp>.csv`` with
    the ``time`` column converted to datetimes.

    Returns:
        list[dict]: One dict per search item with keys ``time`` (ISO-format
        string or ``None``) and ``link``. Empty when the search returns
        nothing; the CSV then contains only the header row.

    Raises:
        requests.HTTPError: Propagated from ``fetch_google_results``.
    """
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}')

    dt = datetime.now()
    today = dt.strftime('%Y-%m-%d')
    tomorrow = (dt + timedelta(days=1)).strftime('%Y-%m-%d')
    query = f'stock market summary today india "nifty" "sensex" -site:youtube.com after:{today} before:{tomorrow}'
    items = fetch_google_results(query)
    results = []

    for item in items:
        matches = date_pattern.findall(json.dumps(item))
        if matches:
            # All matches share one fixed-width format, so the string maximum
            # is also the latest timestamp.
            item['time'] = datetime.fromisoformat(max(matches)).isoformat()
        results.append({
            'time': item.get('time', None),
            'link': item.get('link', None),
        })

    # Naming the columns keeps 'time' present even when there are no results;
    # otherwise the conversion below raises KeyError on an empty frame.
    results_df = pd.DataFrame(results, columns=['time', 'link'])
    results_df['time'] = pd.to_datetime(results_df['time'])
    results_df.to_csv(f"./data/search_results_{datetime.now().timestamp()}.csv", sep='|', index=False)
    return results

In [7]:
%%script false --no-raise-error
# The cell magic above hands this cell's text to the `false` command instead of
# the Python kernel, so the call below is not executed while the magic is present.
get_todays_news()