In [224]:
import requests, pandas as pd, json, re
from bs4 import BeautifulSoup

## Parse list of articles

In [317]:
url = "https://www.economicsobservatory.com/answers/"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
nrOfPages = int(
    soup.find("section", {"class": "answers__listing"})
    .find("div", {"class": "pagination"})
    .find("span")
    .text.split(" of ")[1]
    .split(" ")[0]
)

In [318]:
articles = []
# articles = json.loads(open("articles.json", "r").read())
articles = {i["name"]: i for i in articles}

In [319]:
for page in range(1, nrOfPages + 1):
    url = "https://www.economicsobservatory.com/answers/page/" + str(page)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    for k in (
        soup.find("section", {"class": "answers__listing"})
        .find("div", {"class": "answers__listing-left"})
        .find("ul")
        .findAll("li")
    ):
        paragraphs = k.find("div").findAll("div")
        para0 = paragraphs[0].text.split(" • ")
        category = para0[0].strip()
        date = str(pd.to_datetime(para0[1]))[:10]
        title = paragraphs[1].find("a").text
        link = paragraphs[1].find("a")["href"]
        name = link.split("/")[-1]
        if name not in articles:
            articles[name] = {
                "name": name,
                "category": category,
                "date": date,
                "title": title,
                "url": link,
            }
    print(page, " of ", nrOfPages)

1  of  9
2  of  9
3  of  9
4  of  9
5  of  9
6  of  9
7  of  9
8  of  9
9  of  9


Make local backup or article metadata

In [320]:
open("articles.json", "w").write(json.dumps(list(articles.values())))

118909

## Parse individual articles

In [321]:
authors = []
# authors = json.loads(open("authors.json", "r").read())
authors = {i["name"]: i for i in authors}
figures = []
# figures = json.loads(open("figures.json", "r").read())
figures = {i["name"]: i for i in figures}

In [334]:
# for a in articles:
for a in ['how-can-labour-market-policy-help-get-people-back-right-jobs']:
    article = articles[a]
#     if "text" not in article:
    if True:

        # Load article page
        url = article["url"]
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")

        # Extract content
        relatedCategories = [
            i.find("a").text
            for i in soup.find("ul", {"class": "article__sidebar-categories"}).findAll(
                "li"
            )
        ]
        # articleIntro = soup.find("div", {"class", "article__intro"}).find("h3").text
        if soup.find("div", {"class", "article__intro"}):
            articleIntro = re.sub(
                "\xa0",
                " ",
                re.sub("\n", "", soup.find("div", {"class", "article__intro"}).text),
            ).strip()
        articleText = "\n".join(
            [i.text for i in soup.find("div", {"class", "article__body"}).findAll("p")]
        )

        # Extract authors
        articleAuthors = []
        if soup.find("ul", {"class": "article__authors-list"}):
            for authorData in soup.find(
                "ul", {"class": "article__authors-list"}
            ).findAll("li"):
                author = authorData.find("div", {"class": "title"}).text
                articleAuthors.append(author)
                if author not in authors:
                    affiliation = authorData.find("div", {"class": "aff"}).text
                    profile = authorData.find("a", {"class": "link"})["href"]
                    image = (
                        authorData.find("div", {"class", "image"})["style"]
                        .split("url")[1]
                        .split("'")[1]
                    )
                    authors[author] = {
                        "name": author,
                        "affiliation": affiliation,
                        "profile": profile,
                        "image": image,
                        "articles": [],
                        "expert": [],
                        "categories": [],
                        "related": [],
                    }
                if article["name"] not in authors[author]["articles"]:
                    authors[author]["articles"].append(article["name"])
                if article["category"] not in authors[author]["categories"]:
                    authors[author]["categories"].append(article["category"])
                for relatedCategory in relatedCategories:
                    if relatedCategory not in authors[author]["related"]:
                        authors[author]["related"].append(relatedCategory)

        # Extract figures
        fig0 = "Figure 1: "
        abc = "abcdefghijklmnopqrstuv"
        counter = 0
        articleFigures = []
        for fig in [i.text for i in soup.findAll("h4")]:
            if fig.strip()[:5] != "Table":
                if ":" not in fig:
                    fig = fig0.split(":")[0] + abc[counter] + ": "
                    counter += 1
                else:
                    fig0 = fig
                figId = "fig" + fig.split(":")[0].split(" ")[1]
                figName = article["name"] + "_" + figId
                articleFigures.append(figName)
                if figName not in figures:
                    figTitle = fig.split(":")[1].strip()
                    figures[figName] = {
                        "name": figName,
                        "title": figTitle,
                        "articles": [article["name"]],
                    }
                elif article["name"] not in figures[figName]["articles"]:
                    figures[figName]["articles"].append(article["name"])

        # Extract experts
        articleExperts = []
        if soup.find("ul", {"class": "article__sidebar-experts"}):
            for authorData in soup.find(
                "ul", {"class": "article__sidebar-experts"}
            ).findAll("li"):
                author = authorData.find("div", {"class": "title"}).text
                articleExperts.append(author)
                if author not in authors:
                    affiliation = authorData.find("div", {"class": "aff"}).text
                    profile = authorData.find("a", {"class": "link"})["href"]
                    image = (
                        authorData.find("div", {"class", "image"})["style"]
                        .split("url")[1]
                        .split("'")[1]
                    )
                    authors[author] = {
                        "name": author,
                        "affiliation": affiliation,
                        "profile": profile,
                        "image": image,
                        "articles": [],
                        "expert": [],
                        "categories": [],
                        "related": [],
                    }
                if article["name"] not in authors[author]["expert"]:
                    authors[author]["expert"].append(article["name"])
                if article["category"] not in authors[author]["categories"]:
                    authors[author]["categories"].append(article["category"])
                for relatedCategory in relatedCategories:
                    if relatedCategory not in authors[author]["related"]:
                        authors[author]["related"].append(relatedCategory)

        # Augment article data
        article["authors"] = articleAuthors
        article["figures"] = articleFigures
        article["experts"] = articleExperts
        article["related"] = relatedCategories
        article["intro"] = articleIntro
        article["text"] = articleText

        print(article["name"])

how-can-labour-market-policy-help-get-people-back-right-jobs


In [336]:
for fig in [i.text for i in soup.findAll("h4")]:
    print(fig)
    #     if fig.strip()[:5] != "Table":
#         if ":" not in fig:
#             fig = fig0.split(":")[0] + abc[counter] + ": "
#             counter += 1
#         else:
#             fig0 = fig
#         figId = "fig" + fig.split(":")[0].split(" ")[1]
#         figName = article["name"] + "_" + figId
#         articleFigures.append(figName)
#         if figName not in figures:
#             figTitle = fig.split(":")[1].strip()
#             figures[figName] = {
#                 "name": figName,
#                 "title": figTitle,
#                 "articles": [article["name"]],
#             }
#         elif article["name"] not in figures[figName]["articles"]:
#             figures[figName]["articles"].append(article["name"])


Figure 1: UK claimant count (thousands)
Figure 2: Change in vacancy postings, 2019-2020 (%)


In [337]:
soup.findAll("div",{})

[<h4>Figure 1: UK claimant count (thousands)</h4>,
 <h4>Figure 2: Change in vacancy postings, 2019-2020 (%)</h4>]

Update data on disk

In [323]:
open("articles.json", "w").write(json.dumps(list(articles.values())))

4154880

In [324]:
for author in authors:
    authors[author]["observatory"] = "https://www.economicsobservatory.com/" + re.sub(
        " ", "-", authors[author]["name"].lower()
    )
open("authors.json", "w").write(json.dumps(list(authors.values())))

430887

In [325]:
open("figures.json", "w").write(json.dumps(list(figures.values())))

137858