# Scrape Financial Times News Feed

My `scraper/ft.py` file has approximately the same code for collecting article links from https://www.ft.com/news-feed and then scraping them into a `List[Document]`. But because I want to mass-download a large batch, I figured it's best to download the raw HTML into a directory first - this way, if I mess up the `Document` conversion, I don't have to redo any of the scraping. Plus, I'm basically building a database.

## Scraping Links

In [4]:
import asyncio
import httpx
import json
from datetime import datetime
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Request headers are kept in a JSON file outside the notebook and loaded here.
# JSON is UTF-8 by specification, so decode it explicitly rather than relying on the platform's locale default.
with open("../../.ft-headers.json", encoding="utf-8") as f:
    headers = json.load(f)

# Shared connection-pool limits; `max_connections` also doubles as the batch size when scraping articles.
HTTPX_CONNECTION_LIMITS = httpx.Limits(max_keepalive_connections=50, max_connections=400)

In [2]:
# modified `fetch_news_feed` that fetches 400 pages of news feed
async def fetch_news_feed(headers, max_pages = 400):
    """Collect article links from pages 1..max_pages of the FT news feed.

    All pages are requested concurrently. A page that fails (HTTP error,
    timeout, parse error) is reported with `print` and contributes an empty
    list instead of aborting the whole run.

    Args:
        headers: Request headers passed to the `httpx.AsyncClient`.
        max_pages: Number of news-feed pages to request, starting at page 1.

    Returns:
        A list with one entry per page, in page order; each entry is the list
        of absolute article URLs found on that page (callers flatten it).
    """
    from urllib.parse import urljoin

    base_url = "https://www.ft.com"
    url = "https://www.ft.com/news-feed"

    async def send_requests_for_links(page):
        """Return the article URLs found on one news-feed page, or [] on failure."""
        try:
            response = await client.get(url, params={"page": page})
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "lxml")

            # `href=True` skips anchors without an href; previously one such anchor
            # raised a TypeError that discarded every link on the page.
            a_tags = soup.find_all("a", class_="js-teaser-heading-link", href=True)
            # urljoin resolves site-relative hrefs and leaves absolute ones untouched.
            return [urljoin(base_url, a_tag["href"]) for a_tag in a_tags]
        except Exception as e:
            # Best-effort: one bad page must not cancel the other requests.
            print(f"Error fetching news feed at page {page}: {e}")
            return []

    # Use the shared pool limits (same as the article scraper) so the concurrent
    # page requests are not queued behind httpx's smaller default pool.
    # `async with` guarantees the client is closed even if gathering fails.
    async with httpx.AsyncClient(timeout=10.0, headers=headers, limits=HTTPX_CONNECTION_LIMITS) as client:
        return await tqdm.gather(*(send_requests_for_links(page) for page in range(1, max_pages + 1)))

In [None]:
links = await fetch_news_feed(headers)
links = [link for sublist in links for link in sublist] # flatten list
print(f"Fetched {len(links)} links from Financial Times news feed")

## Scrape HTML and store them
Yeah, I have no idea if I missed a few articles, but I think I got everything.
Got around `9986` articles.

Note to self: I also moved where I store the FT data - it's now on a hard drive rather than in the directory written down below.

In [2]:
async def async_scrape_link(url, client, retries = 3, backoff = 0.5):
    """Fetch `url` and return the response body, retrying with exponential backoff.

    Args:
        url: Address to fetch.
        client: Object exposing an awaitable `get(url)` whose result has
            `raise_for_status()` and `text` (e.g. an `httpx.AsyncClient`).
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; the wait after attempt `n` (0-based)
            is `backoff * 2 ** n`.

    Returns:
        The response text, or None if every attempt failed (the failure is
        reported with `print`; no exception is raised).
    """
    last_error = None
    for attempt in range(retries):
        try:
            r = await client.get(url)
            r.raise_for_status()
            return r.text
        except Exception as e:
            # Best-effort scraping: remember the cause and retry instead of raising,
            # so one bad link cannot abort a whole gathered batch.
            last_error = e
        # Only wait when another attempt follows; sleeping after the final
        # failure would just delay the batch for no benefit.
        if attempt < retries - 1:
            await asyncio.sleep(backoff * 2 ** attempt)
    print(f"Failed to fetch {url} after {retries} attempts: {last_error!r}")
    return None

In [None]:
directory = "/Users/danielliu/Workspace/fin-rag/data/ft" # location moved

client = httpx.AsyncClient(timeout=10.0, headers=headers, limits=HTTPX_CONNECTION_LIMITS)
f_meta = open(f"{directory}/ft_scrape_info.txt", "w")
try:
    f_meta.write("id\turl\tscraped_at\n")
    for i in tqdm_notebook(range(0, len(links), HTTPX_CONNECTION_LIMITS.max_connections), desc="Async scraping links"):
        link_subset = links[i:i+HTTPX_CONNECTION_LIMITS.max_connections]
        tasks = [async_scrape_link(link, client) for link in link_subset]
        htmls = await asyncio.gather(*tasks)

        scraped_at = datetime.now().isoformat()
        for link, html in zip(link_subset, htmls):
            if html is None:
                continue
            id = link.split("/")[-1]
            f_meta.write(f"{id}\t{link}\t{scraped_at}\n")
            with open(f"{directory}/{id}.html", "w") as f:
                f.write(html)
finally:
    f_meta.close()
    await client.aclose()