In [224]:
import requests, pandas as pd, json, re
from bs4 import BeautifulSoup

## Parse list of articles

In [565]:
url = "https://www.economicsobservatory.com/answers/"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
nrOfPages = int(
    soup.find("section", {"class": "answers__listing"})
    .find("div", {"class": "pagination"})
    .find("span")
    .text.split(" of ")[1]
    .split(" ")[0]
)

In [566]:
articles = []
# articles = json.loads(open("articles.json", "r").read())
articles = {i["name"]: i for i in articles}

In [567]:
for page in range(1, nrOfPages + 1):
    url = "https://www.economicsobservatory.com/answers/page/" + str(page)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    for k in (
        soup.find("section", {"class": "answers__listing"})
        .find("div", {"class": "answers__listing-left"})
        .find("ul")
        .findAll("li")
    ):
        paragraphs = k.find("div").findAll("div")
        para0 = paragraphs[0].text.split(" • ")
        category = para0[0].strip()
        date = str(pd.to_datetime(para0[1]))[:10]
        title = paragraphs[1].find("a").text
        link = paragraphs[1].find("a")["href"]
        name = link.split("/")[-1]
        if name not in articles:
            articles[name] = {
                "name": name,
                "category": category,
                "date": date,
                "title": title,
                "url": link,
            }
    print(page, " of ", nrOfPages)

1  of  9
2  of  9
3  of  9
4  of  9
5  of  9
6  of  9
7  of  9
8  of  9
9  of  9


Make local backup or article metadata

In [568]:
open("articles.json", "w").write(json.dumps(list(articles.values())))

118909

## Parse individual articles

In [569]:
authors = []
# authors = json.loads(open("authors.json", "r").read())
authors = {i["name"]: i for i in authors}
figures = []
# figures = json.loads(open("figures.json", "r").read())
figures = {i["name"]: i for i in figures}

In [None]:
for a in articles:
    # for a in [
    #     "how-has-covid-19-affected-part-time-jobs",
    #     "what-is-the-likely-impact-of-advertising-restrictions-on-obesity",
    # ]:
    article = articles[a]
    if "text" not in article:
        # if True:

        # Load article page
        url = article["url"]
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")

        # Extract content
        relatedCategories = [
            i.find("a").text
            for i in soup.find("ul", {"class": "article__sidebar-categories"}).findAll(
                "li"
            )
        ]
        # articleIntro = soup.find("div", {"class", "article__intro"}).find("h3").text
        if soup.find("div", {"class", "article__intro"}):
            articleIntro = re.sub(
                "\xa0",
                " ",
                re.sub("\n", "", soup.find("div", {"class", "article__intro"}).text),
            ).strip()
        articleText = "\n".join(
            [i.text for i in soup.find("div", {"class", "article__body"}).findAll("p")]
        )

        # Extract authors
        articleAuthors = []
        if soup.find("ul", {"class": "article__authors-list"}):
            for authorData in soup.find(
                "ul", {"class": "article__authors-list"}
            ).findAll("li"):
                author = authorData.find("div", {"class": "title"}).text
                articleAuthors.append(author)
                if author not in authors:
                    affiliation = authorData.find("div", {"class": "aff"}).text
                    profile = authorData.find("a", {"class": "link"})["href"]
                    image = (
                        authorData.find("div", {"class", "image"})["style"]
                        .split("url")[1]
                        .split("'")[1]
                    )
                    authors[author] = {
                        "name": author,
                        "affiliation": affiliation,
                        "profile": profile,
                        "image": image,
                        "articles": [],
                        "expert": [],
                        "categories": [],
                        "related": [],
                    }
                if article["name"] not in authors[author]["articles"]:
                    authors[author]["articles"].append(article["name"])
                if article["category"] not in authors[author]["categories"]:
                    authors[author]["categories"].append(article["category"])
                for relatedCategory in relatedCategories:
                    if relatedCategory not in authors[author]["related"]:
                        authors[author]["related"].append(relatedCategory)

        # Extract figures
        fig0 = "Figure 1: "
        abc = "abcdefghijklmnopqrstuv"
        counter = 0
        articleFigures = []
        paras = [i for i in soup.find("div", {"class", "article__body"}).findAll(True)]
        # Check the next nk elements after the heading is found
        nk = 3
        for p in range(0, len(paras) - nk):
            if paras[p].name == "h4":
                fig = paras[p].text
                figFound = False
                # Check the next nk elements after the heading is found
                for k in range(1, nk + 1):
                    # Static images found
                    if paras[p + k].find("img"):
                        figSource = paras[p + k].find("img")["src"]
                        figEmbed = "img"
                        figType = "image"
                        figFound = True
                        break
                    # Interactive embeds found
                    if paras[p + k].name in ["section", "figure"]:
                        if not paras[p + k].find("table"):
                            if "wp-block-table" not in paras[p + k]["class"]:
                                if "blocks__html" in paras[p + k]["class"]:
                                    if paras[p + k].find("iframe"):
                                        figSource = paras[p + k].find("iframe")["src"]
                                        figEmbed = "iframe"
                                        figType = "d3plus"
                                    else:
                                        scriptText = str(
                                            paras[p + k].find("body").find("script")
                                        )
                                        figSource = scriptText[
                                            scriptText.find("http") : scriptText.find(
                                                "json"
                                            )
                                            + 4
                                        ]
                                        figEmbed = "iframe"
                                        figType = "vega-lite"
                                    figFound = True
                                    break
                                elif "wp-block-embed-youtube" in paras[p + k]["class"]:
                                    figSource = paras[p + k].find("iframe")["src"]
                                    figEmbed = "youtube-plugin"
                                    figType = "video"
                                    figFound = True
                                    break
                                elif (
                                    "blocks__chart-svg"
                                    in paras[p + k].find(True)["class"]
                                ):
                                    scriptText = str(paras[p + k].script)
                                    if "Plotly" in scriptText:
                                        figSource = scriptText[
                                            scriptText.find(">")
                                            + 1 : scriptText.find("</script")
                                        ].strip()
                                        figEmbed = "plotly-plugin"
                                        figType = "plotly"
                                    elif "var spec" in scriptText:
                                        figSource = json.loads(
                                            scriptText[
                                                scriptText.find("var spec")
                                                + 10 : scriptText.find("var view")
                                            ].strip()[:-1]
                                        )
                                        figEmbed = "vega-lite-plugin"
                                        figType = "vega-lite"
                                    figFound = True
                                    break
                if figFound:
                    if ":" not in fig:
                        fig = fig0.split(":")[0] + abc[counter] + ": " + fig
                        counter += 1
                    else:
                        fig0 = fig
                    figId = "fig" + fig.split(":")[0].split(" ")[1]
                    figName = article["name"] + "_" + figId
                    articleFigures.append(figName)
                    if figName not in figures:
                        figTitle = fig.split(":")[1].strip()
                        figures[figName] = {
                            "name": figName,
                            "title": figTitle,
                            "type": figType,
                            "source": figSource,
                            "embed": figEmbed,
                            "articles": [article["name"]],
                        }
                    elif article["name"] not in figures[figName]["articles"]:
                        figures[figName]["articles"].append(article["name"])

        # Extract experts
        articleExperts = []
        if soup.find("ul", {"class": "article__sidebar-experts"}):
            for authorData in soup.find(
                "ul", {"class": "article__sidebar-experts"}
            ).findAll("li"):
                author = authorData.find("div", {"class": "title"}).text
                articleExperts.append(author)
                if author not in authors:
                    affiliation = authorData.find("div", {"class": "aff"}).text
                    profile = authorData.find("a", {"class": "link"})["href"]
                    image = (
                        authorData.find("div", {"class", "image"})["style"]
                        .split("url")[1]
                        .split("'")[1]
                    )
                    authors[author] = {
                        "name": author,
                        "affiliation": affiliation,
                        "profile": profile,
                        "image": image,
                        "articles": [],
                        "expert": [],
                        "categories": [],
                        "related": [],
                    }
                if article["name"] not in authors[author]["expert"]:
                    authors[author]["expert"].append(article["name"])
                if article["category"] not in authors[author]["categories"]:
                    authors[author]["categories"].append(article["category"])
                for relatedCategory in relatedCategories:
                    if relatedCategory not in authors[author]["related"]:
                        authors[author]["related"].append(relatedCategory)

        # Augment article data
        article["authors"] = articleAuthors
        article["figures"] = articleFigures
        article["experts"] = articleExperts
        article["related"] = relatedCategories
        article["intro"] = articleIntro
        article["text"] = articleText

        print(article["name"])

Update data on disk

In [714]:
for a in articles:
    articles[a]['wordCount']=len(articles[a]['intro'].split(' '))+len(articles[a]['text'].split(' '))
    articles[a]['figureCount']=len(articles[a]['figures'])
    articles[a]['authorCount']=len(articles[a]['authors'])
    articles[a]['expertCount']=len(articles[a]['experts'])
open("articles.json", "w").write(json.dumps(list(articles.values())))

4198013

In [713]:
open("figures.json", "w").write(json.dumps(list(figures.values())))

1465824

Clean up author affiliations

In [654]:
unis = []
# uni = json.loads(open("unis.json", "r").read())
unis = {i["name"]: i for i in unis}

In [655]:
# v.append(process.extractOne(s, list(df.columns))[0])

In [656]:
# !pip install fuzzywuzzy

In [657]:
import numpy as np

In [706]:
def cleanAffiliation(name, aff, unis):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process

    uni = []
    dept = []
    aff = (
        aff.strip()
        .replace("&", ",")
        .replace(" and ", ",")
        .replace("/", ",")
        .replace("’", "'")
        .replace("“", "")
        .replace("”", "")
        .replace(', Fullerton',' Fullerton')
        .replace('Cambridge Judge','Cambridge, Judge')
    )
    for a in aff.lower().split(","):
        a = a.strip().title()
        if len(a) < 5:
            a = a.upper()
        if a == "LSE":
            a = "London School of Economics"
        if a == "WEF":
            a = "World Economic Forum"
        if a=='CGD':
            a='Center For Global Development'
        elif "University" in a:
            if a=='Create Fellow In Cultural Economics (University Of Glasgow)':
                a='University Of Glasgow'
            uni.append(a)
            if a not in unis:
                unis[a] = {"name": a}
        elif a == "Oxford":
            uni.append("University of Oxford")
        elif a == "Cambridge":
            uni.append("University of Cambridge")
        elif "College" in a:
            if a == "Boston College":
                uni.append(a)
            else:
                dept.append(a)
        elif "School" in a:
            if a == "London School of Economics":
                uni.append(a)
            else:
                dept.append(a)
        elif "Centre" in a:
            dept.append(a)
        elif "Center" in a:
            dept.append(a)
        elif "Institut" in a:
            dept.append(a)
        elif "Department" in a:
            dept.append(a)
        elif a in ["World Economic Forum","Bank Of England",'Center For Global Development']:
            uni.append(a)
    return uni, dept, unis

In [708]:
# for i in np.sort(list(set([authors[a]["affiliation"] for a in authors]))):
#     uni, dept, unis = cleanAffiliation("", i, unis)
#     print(uni, dept,i )

In [711]:
for author in authors:
    authors[author]["observatory"] = "https://www.economicsobservatory.com/" + re.sub(
        " ", "-", authors[author]["name"].lower()
    )
    (
        authors[author]["institution"],
        authors[author]["department"],
        unis,
    ) = cleanAffiliation(authors[author]["name"], authors[author]["affiliation"], unis)
    authors[author]['articleCount']=len(authors[author]['articles'])
open("authors.json", "w").write(json.dumps(list(authors.values())))

515321