# Data Preparation

## Environment Setup

In [1]:
%pip install -r '../requirements.txt'

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import os
import time
import json
import random
import requests
import pandas as pd
from datetime import datetime, timedelta
from newspaper import Article, Config
from bs4 import BeautifulSoup



In [3]:
# CONSTANTS
# Credentials for the Google Custom Search JSON API are read from the environment
# (never hard-coded); each value is None when its variable is not set.
GCSJ_API_KEY = os.getenv('GCSJ_API_KEY')
GCSJ_ENGINE_ID = os.getenv('GCSJ_ENGINE_ID')

## Data Locator

In [4]:
def generate_weekdays(num_days, offset=0):
    """Yield the most recent weekdays (Monday-Friday), newest first.

    Walks backwards one day at a time, starting at ``offset`` days before the
    current local date/time (that day itself is a candidate), and yields each
    day that is not a Saturday or Sunday until ``num_days`` have been produced.

    Args:
        num_days: Number of weekdays to yield; nothing is yielded when <= 0.
        offset: How many days before today the walk starts.

    Yields:
        datetime: One value per weekday, carrying the current time of day.
    """
    one_day = timedelta(days=1)
    # Start one day *after* the first candidate because the loop steps back
    # before it tests the day.
    cursor = datetime.today() - timedelta(days=offset - 1)
    remaining = num_days
    while remaining > 0:
        cursor = cursor - one_day
        if cursor.weekday() >= 5:
            # Saturday (5) or Sunday (6): skip without consuming the budget.
            continue
        remaining -= 1
        yield cursor

In [5]:
def fetch_google_results(query):
    """Fetch up to 30 Google Custom Search results for ``query``.

    Requests three pages of 10 results each (``start`` = 1, 11, 21) from the
    Custom Search JSON API using ``GCSJ_API_KEY`` / ``GCSJ_ENGINE_ID``, with
    ``dateRestrict='d1'``, ``lr='lang_en'`` and ``gl='in'``.

    Args:
        query: Search string sent as the ``q`` parameter.

    Returns:
        list: The ``items`` entries of all pages, concatenated in page order.
        A page whose response has no ``items`` key contributes nothing.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        requests.Timeout: If a page does not respond within the timeout.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    results = []
    for offset in [1, 11, 21]:
        params = {
            "key": GCSJ_API_KEY,
            "cx": GCSJ_ENGINE_ID,
            "start": offset,
            "dateRestrict": 'd1',
            "lr": 'lang_en',
            "gl": 'in',
            "num": 10,
            "q": query,
        }
        # Without a timeout, requests can block forever on a stalled connection.
        res = requests.get(search_url, params=params, timeout=10)
        res.raise_for_status()
        results.extend(res.json().get("items", []))
    # Return the accumulated list; returning only the last page's items would
    # silently drop the earlier pages.
    return results

In [6]:
def get_todays_news():
    """Search for today's market-summary articles and save them to CSV.

    Builds a query limited to today's date (``after:<today> before:<tomorrow>``),
    runs it through ``fetch_google_results`` and, for every hit, records its
    link together with the latest ``YYYY-MM-DDTHH:MM:SS`` timestamp found
    anywhere in the hit's JSON (``None`` when there is none). The rows are
    written, pipe-separated, to ``../Dataset/scraper/raw/search_results.csv``.

    Returns:
        list[dict]: One ``{'time': str | None, 'link': str | None}`` per hit.

    Raises:
        Whatever ``fetch_google_results`` raises for a failed request.
    """
    dt = datetime.now()
    today = dt.strftime('%Y-%m-%d')
    tomorrow = (dt + timedelta(days=1)).strftime('%Y-%m-%d')
    query = f'stock market summary today india "nifty" "sensex" -site:youtube.com after:{today} before:{tomorrow}'
    items = fetch_google_results(query)

    # Loop-invariant, so defined once rather than per item.
    date_pattern = r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'
    results = []
    for item in items:
        # ISO timestamps sort lexicographically, so max() picks the latest one.
        if matches := re.findall(date_pattern, json.dumps(item)):
            item['time'] = datetime.fromisoformat(max(matches)).isoformat()
        results.append({
            'time': item.get('time', None),
            'link': item.get('link', None),
        })

    # Explicit columns keep the frame (and the CSV header) well-formed when the
    # search returns nothing; otherwise results_df['time'] raises KeyError.
    results_df = pd.DataFrame(results, columns=['time', 'link'])
    results_df['time'] = pd.to_datetime(results_df['time'])
    results_df.to_csv("../Dataset/scraper/raw/search_results.csv", sep='|', index=False)
    return results

In [None]:
# %%script false --no-raise-error
# The line above is a disabled cell magic; presumably it is uncommented to skip
# this cell on a full re-run, since the call below issues live search requests
# and overwrites ../Dataset/scraper/raw/search_results.csv.
get_todays_news()

## Data Scraper

In [8]:
def get_config():
    """Return a fresh ``Config`` for article downloads.

    The config uses a 10-second request timeout, disables article memoization
    and image fetching, and carries a browser user-agent string picked at
    random from a fixed pool on every call.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    )

    cfg = Config()
    cfg.request_timeout = 10
    cfg.memoize_articles = False
    cfg.fetch_images = False
    cfg.browser_user_agent = random.choice(user_agents)
    return cfg

In [9]:
class CustomArticle(Article):
    """``Article`` with raw-HTML fallbacks for authors, body text and timestamp."""

    def build(self):
        """Run the parent ``build()``, then backfill empty fields from the HTML.

        * Authors: when none were extracted, use the text of every ``<a>``
          whose ``href`` contains ``author``.
        * Text: when none was extracted, concatenate the stripped text of all
          ``<p>``, then all ``<ul>``, then all ``<div>`` elements.
        """
        super().build()
        soup = BeautifulSoup(self.html, 'html.parser')

        if not self.authors:
            author_links = soup.select('a[href*=author]')
            self.authors.extend([link.get_text().strip() for link in author_links])

        if self.text:
            return

        # One newline-joined section per tag type, in the order p, ul, div.
        sections = []
        for tag_name in ('p', 'ul', 'div'):
            pieces = [node.get_text().strip() for node in soup.find_all(tag_name)]
            sections.append("\n".join(pieces))
        self.text = "\n".join(sections).strip()

    @property
    def datetime(self):
        """Best-effort timestamp for the article as an ISO-8601 string.

        First looks for ``YYYY-MM-DDTHH:MM:SS`` values in the JSON-serialised
        ``meta_data`` and returns the latest one. Failing that, scans the raw
        HTML for a handful of textual date formats and returns the first one
        that parses. Returns ``None`` when nothing is found.
        """
        iso_stamps = re.findall(
            r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}',
            json.dumps(self.meta_data),
        )
        if iso_stamps:
            # ISO timestamps sort lexicographically, so max() is the latest.
            return datetime.fromisoformat(max(iso_stamps)).isoformat()

        # (strptime format, regex that locates a candidate) pairs, tried in order.
        fallback_formats = [
            ("%B %d, %Y, %H:%M", r"\b\w+ \d{1,2}, \d{4}, \d{2}:\d{2}"),
            ("%b %d, %Y, %H:%M", r"\b\w{3} \d{1,2}, \d{4}, \d{2}:\d{2}"),
            ("%b %d, %Y %H:%M", r"\b\w{3} \d{1,2}, \d{4} \d{2}:\d{2}"),
            ("%d %b %I:%M %p", r"\b\d{1,2} \w{3} \d{1,2}:\d{2} (?:am|pm)"),
            ("%H:%M (IST) %d %b %Y", r"\d{2}:\d{2} \(IST\) \d{1,2} \w{3} \d{4}"),
        ]

        for fmt, rgx in fallback_formats:
            for hit in re.finditer(rgx, self.html, re.IGNORECASE):
                try:
                    parsed = datetime.strptime(hit.group(0), fmt)
                    # Year-less formats parse as 1900; any pre-2000 year is
                    # replaced with the current year.
                    if parsed.year < 2000:
                        parsed = parsed.replace(year=datetime.now().year)
                    return parsed.isoformat()
                except ValueError:
                    # Regex matched but the text is not a valid date; keep looking.
                    continue

        return None

In [10]:
def scrape_article(obj):
    """Download and parse one search hit into a JSON-serialisable record.

    Sleeps a random 0-2 seconds, builds a ``CustomArticle`` for ``obj['link']``
    and returns its fields. The record's ``datetime`` is the search-result time
    when one is present, otherwise the timestamp the article itself exposes.

    Args:
        obj: Mapping or row with a ``'link'`` URL and a ``'time'`` value
            (timestamp, ISO string, or a missing marker such as NaT/NaN/None/'').

    Returns:
        dict | None: The scraped record, or ``None`` if anything failed
        (the failure is printed, not raised).
    """
    try:
        time.sleep(random.uniform(0, 2))
        article = CustomArticle(obj['link'], config=get_config())
        article.build()

        search_time = obj['time']
        if isinstance(search_time, str):
            search_time = search_time.strip()
        # An empty string (e.g. a CSV loaded with fillna('')) counts as missing,
        # just like NaN/NaT/None; pd.isna('') alone would report it as present.
        if search_time == '' or pd.isna(search_time):
            published = article.datetime
        elif hasattr(search_time, 'isoformat'):
            # Timestamp/datetime objects are not JSON-serialisable; store the
            # same ISO-8601 string form that article.datetime produces.
            published = search_time.isoformat()
        else:
            published = search_time

        return {
            "url": obj['link'],
            "source_url": article.source_url,
            "title": article.title,
            "text": article.title+"\n"+article.text,
            "metadata": article.meta_data,
            "datetime": published,
            "authors": article.authors,
            "description": article.meta_description,
        }
    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the whole run.
        print(f"Failed to scrape {obj['link']}: {e}")
        return None

In [None]:
# %%script false --no-raise-error
# Reload the saved search hits (pipe-separated, 'time' parsed as datetimes),
# then replace every missing value with an empty string.
results_df = pd.read_csv(
    '../Dataset/scraper/raw/search_results.csv',
    sep='|',
    parse_dates=['time'],
)
results_df = results_df.fillna('')
results_df.head()

In [None]:
# %%script false --no-raise-error
# Scrape every search hit and keep only the rows that succeeded (failures are None).
# Rows are iterated explicitly rather than via DataFrame.apply(axis=1): on an empty
# frame apply() returns a DataFrame, which has no .tolist(), so a day without any
# search results would crash this cell.
data = [
    item
    for item in (scrape_article(row) for _, row in results_df.iterrows())
    if item is not None
]
# Each run writes a new file, named with the current Unix timestamp.
with open(f'../Dataset/scraper/raw/rawdata_{datetime.now().timestamp()}.json', 'w') as f:
    json.dump(data, f, indent=4, sort_keys=True)