In [16]:
import re
import json
import requests
import pandas as pd
import settings as S
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

In [17]:
# CONSTANTS
# Position of the credential pair to use. The same index is applied to both
# settings lists below, so the API key and engine ID always come from one slot.
KEY_IDX = 4
# Sent as the "key" and "cx" request parameters by fetch_google_results.
GCSJ_API_KEY = S.GCSJ_API_KEYS[KEY_IDX]
GCSJ_ENGINE_ID = S.GCSJ_ENGINE_IDS[KEY_IDX]

In [18]:
def fetch_google_results(query, timeout=30):
    """Fetch up to 50 results for ``query`` from the Google Custom Search JSON API.

    Pages of 10 items are requested at start offsets 1, 11, 21, 31 and 41,
    restricted to English-language results (``lr``) geolocated to the US
    (``gl``). Paging stops as soon as a page comes back without ``items``, so
    no further API calls are spent on a query that has run out of results.

    Args:
        query: Search string sent as the ``q`` parameter.
        timeout: Seconds to wait for each HTTP response. Without a timeout,
            ``requests`` can block forever on a stalled connection.

    Returns:
        list: The ``items`` entries of every fetched page, concatenated in
        page order. Empty when the first page has no items.

    Raises:
        requests.HTTPError: If any page responds with a 4xx/5xx status.
        requests.Timeout: If a page does not respond within ``timeout``.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    page_size = 10
    results = []
    for offset in [1, 11, 21, 31, 41]:
        params = {
            "key": GCSJ_API_KEY,
            "cx": GCSJ_ENGINE_ID,
            "start": offset,
            "lr": 'lang_en',
            "gl": 'us',
            "num": page_size,
            "q": query,
        }
        res = requests.get(search_url, params=params, timeout=timeout)
        res.raise_for_status()
        page_items = res.json().get("items", [])
        if not page_items:
            # An empty page means the result set is exhausted; requesting the
            # remaining offsets would only burn API quota.
            break
        results.extend(page_items)
    return results

In [19]:
def _save_search_results(results):
    """Write ``results`` to a new timestamped, pipe-separated CSV under ./data."""
    results_df = pd.DataFrame(results)
    results_df.to_csv(f"./data/search_results_{datetime.now().timestamp()}.csv", sep='|', index=False)


def get_historical_news(count=20):
    """Search news for the ``count`` days following the last completed date.

    The last non-blank line of ``./data/completed_dates.csv`` is parsed as the
    most recent processed day (``YYYY-MM-DD``). For each of the next ``count``
    days a date-bounded query is sent through ``fetch_google_results``, the
    day is appended to the completed-dates file, and one record per returned
    item is collected.

    Collected records are written to a timestamped CSV even when a later
    request fails or the run is interrupted, so that days already marked as
    completed never lose their results. The original exception still
    propagates afterwards.

    Args:
        count: Number of consecutive days to process.

    Returns:
        list[dict]: One dict per search item with keys ``target`` (the queried
        day), ``time`` (a timestamp extracted from the item, or ``None``) and
        ``link`` (the item's ``link`` value, or ``None``).

    Raises:
        ValueError: If the completed-dates file holds no date, or its last
            non-blank line is not in ``YYYY-MM-DD`` format.
    """
    # Loop-invariant pieces of the query and the timestamp matcher.
    forbidden = ['youtube.com', 'twitter.com', 'facebook.com', 'instagram.com', 'reddit.com', ]
    forbidden = ' '.join([f'-site:{site}' for site in forbidden])
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?')

    results = []
    finished = False
    try:
        with open("./data/completed_dates.csv", "r+") as f:
            content = f.read()
            # Ignore blank lines so a trailing empty line does not break parsing.
            known_dates = [line.strip() for line in content.splitlines() if line.strip()]
            if not known_dates:
                raise ValueError("./data/completed_dates.csv contains no date to resume from")
            from_date = datetime.strptime(known_dates[-1], '%Y-%m-%d')

            # Switch from reading to appending. If the file does not end with a
            # newline, add one so the first new date is not glued to the last.
            f.seek(0, 2)
            if not content.endswith('\n'):
                f.write('\n')

            for ctr in tqdm(range(1, count+1)):
                dt = from_date + timedelta(days=ctr)
                today = dt.strftime('%Y-%m-%d')
                tomorrow = (dt + timedelta(days=1)).strftime('%Y-%m-%d')
                query = f'US presidential election 2024 "news" {forbidden} after:{today} before:{tomorrow}'
                items = fetch_google_results(query)
                # Mark the day as done only after its fetch succeeded, and flush
                # so the progress marker survives a crash.
                f.write(today + '\n')
                f.flush()

                for item in items:
                    # Heuristic: take the lexicographic median of all ISO-8601-like
                    # timestamps found anywhere in the item's JSON.
                    matches = date_pattern.findall(json.dumps(item))
                    if matches:
                        time_value = sorted(matches)[len(matches) // 2]
                    else:
                        time_value = item.get('time', None)
                    results.append({
                        'target': today,
                        'time': time_value,
                        'link': item.get('link', None),
                    })
        finished = True
    finally:
        # A normal run always writes a CSV (as before). A failed run writes one
        # only if there is something to save.
        if finished or results:
            _save_search_results(results)
    return results

In [None]:
# %%script false --no-raise-error
# The commented-out magic above looks like a skip switch: presumably uncommenting
# it (it has to remain the first line of the cell) makes IPython skip this cell
# instead of running it — TODO confirm.
# Running the cell queries the search API for the default 20 days (up to 100
# requests), appends each processed day to ./data/completed_dates.csv and writes
# a new search_results_*.csv file.
new_links = get_historical_news()

  0%|          | 0/20 [00:00<?, ?it/s]

[{'target': '2024-10-07',
  'time': None,
  'link': 'https://www.bbc.com/mediacentre/2024/how-to-follow-the-2024-us-presidential-election'},
 {'target': '2024-10-07',
  'time': None,
  'link': 'https://investor.lifestance.com/news-releases/news-release-details/lifestance-survey-finds-79-americans-are-experiencing-anxiety/'},
 {'target': '2024-10-07',
  'time': '2024-10-08T15:37:57+00:00',
  'link': 'https://www.opensecrets.org/news/2024/10/total-2024-election-spending-projected-to-exceed-previous-record/'},
 {'target': '2024-10-07',
  'time': '2024-10-21T19:08:58+00:00',
  'link': 'https://www.kff.org/compare-2024-candidates-health-care-policy/'},
 {'target': '2024-10-07',
  'time': '2024-10-08T09:00:00-04:00',
  'link': 'https://www.prnewswire.com/news-releases/lifestance-survey-finds-79-of-americans-are-experiencing-anxiety-over-the-2024-us-presidential-election-302269533.html'},
 {'target': '2024-10-07',
  'time': '2024-10-07T17:54:16.623',
  'link': 'https://www.wgbh.org/news/polit

In [None]:
# `json` is already imported in the first cell and is not used in this cell.
import json
# NOTE(review): a relative import like this normally fails inside a notebook
# kernel ("attempted relative import with no known parent package"), and this
# cell shows no execution count — confirm how it is meant to be run.
from ..kafka.scripts.kafka_producer import create_producer, send_message

# Send every collected search result through send_message, one call per result
# dict; 'news' is presumably the topic name — verify against kafka_producer.
# `new_links` is the list returned by the get_historical_news() cell.
# NOTE(review): the producer is never flushed or closed here — check whether
# create_producer / send_message take care of delivery themselves.
producer = create_producer()
for obj in new_links:
    send_message(producer, 'news', obj)