<a href="https://colab.research.google.com/github/Firenze11/finance_lm/blob/main/llama_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import os
import re
import time
from typing import List, Dict
from base64 import b64decode
from urllib.parse import urlencode, urlparse
import requests
from requests import Response
from bs4 import BeautifulSoup
from google.colab import drive

LOGIN_URL = 'https://marquee.gs.com/tokenLogin'
HOME_URL = 'https://marquee.gs.com/content/themes/equity---americas.html'
BASE_URL = 'https://marquee.gs.com/'
QUERY_URL = 'https://marquee.gs.com/research/search/reports/query-components'
QUERY_URL_TAB = 'https://marquee.gs.com/research/search/reports/advanced-search'

drive.mount('/content/drive')
DATA_DIR = '/content/drive/My Drive/dgm_data/scraping_gs/'
DATA_DIR_EVAL = DATA_DIR + 'eval/'

# Scraping Process:
# √ Step 0: authenticate to sign in.
# √ Step 1: go to the HOME_URL page and scrape for main page urls, each of them corresponding to a content category such as "americas equity", "macro economy" etc.
# √ Step 2: find items with queries from each of the main pages.
#       these are "windows" on the webpage, usually marked as "react-query-manager", which individually sends API calls to get their contents.
# √ Step 3: base64 decode the queries and create query param items.
#       these will be sent to the QUERY_URL api as params, each will hopefully return one or more corresponding content pages
# √ Step 4: extract page urls and page titles from the contents returned above.
# √ Step 5: go to contents and read text

# TODO: add data-tab-queries (in https://marquee.gs.com/content/themes/equity---americas.html, most-popular-side-panel)

Mounted at /content/drive


In [2]:
def authenticate():
    # Open inspector and click "Network" tab before logging in to Marquee;
    # Log into Marquee;
    # Find the network record "tokenLogin", click its "Payload" tab;
    # Click "view source" and copy that data here. This token is good for 1 day.
    auth_payload = 'access_token=xxx'
    p = s.post(LOGIN_URL, data=auth_payload)
    return p

def get_top_level_page_links_GS(home_page):
    """Step 1"""
    soup = BeautifulSoup(home_page.content, "html.parser")
    res = soup.find('div', class_="hamburger__wrapper").find_all('ul')  # ["Research", "Macro", "Equity", "Themes", "More"]

    useful_uls = res[2:]
    top_level_pages = set()
    for ul in useful_uls:
        links = ul.select('li > a', href=True)
        links = [lk['href'] for lk in links]
        top_level_pages = top_level_pages.union(set(links))

    return top_level_pages

In [3]:
def get_queries_from_page(page: Response):
    """Step 2"""
    soup = BeautifulSoup(page.content, "html.parser")
    # elements = soup.find_all(True, {'class': ['query-research-hero-full', 'query-panel', 'query-panel-x2', 'query-list']})
    query_managers = soup.find_all('react-query-manager')
    print('# of query managers', len(query_managers))

    qids = set()
    queries = []
    for qm in query_managers:
        if qm['data-id'] in qids:
            continue
        query_manager_warnings(qm)
        query = qm.get('data-query-request') or qm.get('data-query-request-tabs')

        query = json.loads(b64decode(query).decode('UTF-8'))
        if isinstance(query, list):
            queries += query
        else:
            queries.append(query)
        qids.add(qm['data-id'])

    for q in queries:
        if 'tabName' in q:
            del q['tabName']
        if 'includeFocus' in q:
            del q['includeFocus']

    return json.dumps(queries, separators=(',', ':'))

def get_report_links_headline_from_page(report_link_queries: str) -> List[Dict]:
    """Step 3, return a list of (article url, article title) pairs
    Params:
    - report_link_queries: stringified version of query params gotten from a top-level page
    """
    # report_link_queries = '[{"id":"9c338746-6f30-4701-8ef6-5cd8b453580b","query":"https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%2C%22ja%22%5D&page=1&sort=time&limitTo=%5B%22%22%5D&filter=(publications%20EQ%20%24%7B(%2272d93e79-adc9-4ab9-9897-7df7eef9dc11%22)%20OR%20(%2285f33ec3-153e-44bc-a675-1b881c975807%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)","rank":0,"minNotability":0,"filterType":"none","filterTags":["research:curated/70d7db8c-74fd-4c7c-a279-09893dbaabab"],"requiredProperties":["publicationDateTime","path","distributionHeadline","totalPages","discipline","authors","source","icon","restrictionDetails","coverFlag","videoReport","podcastReport","channel","sourceDisplayName","media","authorsId","reportTypes"],"sortOrder":"publicationDateTime","maxAgeHours":3,"maxReports":1,"reportsToFetchFromSOLR":0,"disableViewMoreLink":true},{"id":"c8c9c3ee-4d64-46ff-b9ef-00fbf20d9b93","query":"https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%5D&page=1&sort=time&limitTo=%5B%22%22%5D&filter=(publications%20EQ%20%24%7B(%22d036cb0f-b058-490d-9e12-e7b8862ebdfd%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)","rank":0,"minNotability":0,"filterType":"none","filterTags":["research:curated/70d7db8c-74fd-4c7c-a279-09893dbaabab"],"requiredProperties":["path","distributionHeadline","totalPages","publicationDateTime","authors","source","discipline","synopsis","coverFlag","restrictionDetails","reportTypes","videoReport","podcastReport","channel","sourceDisplayName","media","authorsId","featuredExhibit"],"sortOrder":"publicationDateTime","maxAgeHours":3,"maxReports":1,"reportsToFetchFromSOLR":0,"disableViewMoreLink":false}]'

    contents = s.post(
        QUERY_URL,
        data={'q': report_link_queries},
        headers={
          "accept": "application/json;charset=UTF-8",
          "content-type": "application/x-www-form-urlencoded;charset=UTF-8"
        })

    query_results = contents.json()['results']
    print('# of query results:', len(query_results))
    return query_results

def get_report_links_headline_from_query_result(query_results):
    """Step 4"""
    # report_links_headline = get_report_links_headline_from_query_result(query_results)
    # print('report_links_headline', report_links_headline)
    # return report_links_headline

    links = set()
    links_and_titles = []
    for r in query_results:
        reports = r['reports']
        l_and_t = [(rr['path'], rr['distributionHeadline']) for rr in reports]
        links_and_titles += dedup_add(l_and_t, existing_keys=links)

    return links_and_titles

def dedup_add(items, existing_keys=None):
    if existing_keys is None:
        existing_keys = set()
    res = []
    for item in items:
        item_key = item[0]
        if item_key not in existing_keys:
            res.append(item)
            existing_keys.add(item_key)
    return res

def query_manager_warnings(qm):
    if qm['data-search-base-url'] != 'https://publishing.gs.com':
        print("Id:", qm['data-id'], 'search-base-url =', qm['data-search-base-url'])
    if qm['data-query-component-service-path'] != '/research/search/reports/query-components':
        print("Id:", qm['data-id'], 'query-component-service-path =', qm['data-query-component-service-path'])


In [4]:
link_template = r'^\/content\/(.+\/en\/.+)\.html$'

def get_page_content(page, link, headline):
    """Step 5"""
    soup = BeautifulSoup(page.content, "html.parser")
    paragraphs = [headline]

    sections = soup.find_all('div', class_='chapter parsys section')
    if len(sections):
        # when page is using explicit html
        for section in sections:
            # remove charts
            for child in section.find_all("div", class_='exhibit'):
                child.decompose()
            paragraphs.append(section.get_text())
    else:
        # when page is using dynamic html
        section = soup.find('mq-gsp-article') or soup.find('react-dynamic-report-body')
        if not section:
            print('Warning: missing section from', link)
        else:
            soup2 = BeautifulSoup(section.attrs['data-contents'], "html.parser")
            paragraphs.append(soup2.get_text())

    content = '\n'.join(paragraphs)
    # replace multiple newlines with one newline
    content = re.sub(r'\n+', '\n', content)

    m = re.search(link_template, link)
    filename = m.group(1) if m else link
    filename = filename.replace('/', '_')
    return content, filename

# get_page_content(requests, '/content/research/en/reports/2023/10/17/05adc95d-0e4b-4982-bfed-6309d24999fe.html', 'fake_title')

In [12]:
report_links_train = set(['/content/' + fn.replace('_', '/') + '.html' for fn in os.listdir(DATA_DIR)])
report_links_eval = set(['/content/' + fn.replace('_', '/') + '.html' for fn in os.listdir(DATA_DIR_EVAL)])
report_links_all = report_links_train.union(report_links_eval)
len(report_links_all) #, list(report_links_all)[0]

594

In [13]:
def scrape_page(page, existing_links=None, top_link=None, save_dir=DATA_DIR):
    print('Scraping page:', top_link)
    # step 2
    report_link_queries = get_queries_from_page(page)
    # step 3
    query_results = get_report_links_headline_from_page(report_link_queries)
    # step 4
    report_links_headline = get_report_links_headline_from_query_result(query_results)
    report_links_headline_deduped = dedup_add(report_links_headline, existing_keys=existing_links)
    print('# of new links:', len(report_links_headline_deduped))

    for link, headline in report_links_headline_deduped:
        page = s.get(BASE_URL + link)
        content, filename = get_page_content(page, link, headline)

        with open(save_dir + filename, 'w') as f:
            f.write(content)
        time.sleep(3)


# Use 'with' to ensure the session context is closed after use.
def scrape(save_dir=DATA_DIR):
    with requests.Session() as s:
        authenticate()

        # The following requests are authorized
        home_page = s.get(HOME_URL)

        top_level_pages_links = get_top_level_page_links_GS(home_page)
        top_level_pages_links = list(set(top_level_pages_links))
        for p in top_level_pages_links:
            print('-', p)
        scrape_page(home_page, existing_links=report_links_all, top_link=HOME_URL, save_dir=save_dir)

        for top_link in top_level_pages_links:
            page = s.get(BASE_URL + top_link)
            scrape_page(page, existing_links=report_links_all, top_link=top_link, save_dir=save_dir)


scrape(save_dir=DATA_DIR_EVAL)   # or DATA_DIR

- /content/research/themes/theme-index.html
- /content/research/themes/equity---americas.html
- /content/research/themes/author-list.html
- /content/subjects/4b43603f-75c3-40db-84ff-a4e4403105ea.html
- /content/research/themes/equity---europe.html
- /content/research/girDisciplines/093b6e28-d6fe-11df-a204-00118563711b.html
- /content/research/themes/stock-screener.html
- /content/subjects/3372a191-6092-437e-9d7d-832d376eddde.html
- /content/themes/gs-proprietary-indicators.html
- /content/research/girDisciplines/7c8f0740-d6fe-11df-a204-00118563711b.html
- /content/research/themes/sector-industry.html
- /content/research/girDisciplines/394de802-d6fe-11df-a204-00118563711b.html
- /content/research/subjects/5beda2f0-7557-41c4-ace3-c513e980d94e.html
- /content/research/themes/faq.html
- /content/themes/themes-tracker.html
- /content/research/themes/publications.html
- /content/girDisciplines/f851f0e6-d6fd-11df-a204-00118563711b.html
- /content/themes/conviction-list.html
- /content/researc

In [None]:
# section = soup.find('mq-gsp-article') or soup.find('react-dynamic-report-body')
# s = BeautifulSoup(section.attrs['data-contents'], "html.parser")
# s.get_text()

paragraphs = []

sections = soup.find_all('div', class_='chapter parsys section')
sections
# if len(sections):
#     # when page is using explicit html
#     for section in sections:
#         # remove charts
#         for child in section.find_all("div", class_='exhibit'):
#             child.decompose()
#         paragraphs.append(section.get_text())
# content

In [None]:
import os

for filename in os.listdir(DATA_DIR):
        # /content/research/en/reports/2023/10/17/05adc95d-0e4b-4982-bfed-6309d24999fe.html
    if not filename.startswith('research'):
        continue
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'r') as f:
        s = f.read()
        s = re.sub(r'\n+', '\n', s)
    with open(filepath, 'w') as f:
        f.write(s)

        # filepath_new = os.path.join(DATA_DIR, filename.replace('research_en_reports', 'research_en_reports_'))
        # os.rename(filepath, filepath_new)

'Europe Technology: Hardware: Reiterate our view of a robust semis content opportunity in Automated Driving (ADAS)\nFollowing a deep-dive report by our global colleagues, in which they highlight their forecasts for dollar content of software in cars ranging from US$202 (L0) to US$4,957 (L4) in 2030E, we reiterate our view that Automated Driving systems (ADAS) that incorporate intelligence into the car will drive a robust multi-year market opportunity for semiconductors, given the role of ADAS tech as a key Digital Enabler. Therefore, while we continue to see a solid backdrop for semis in 1H22, we believe automotive digitalisation, as encapsulated in the trend of accelerating adoption of ADAS, will provide a structural tailwind for players such as Infineon (Buy) and STMicro (Neutral).\nOur forecasts for successive increases in the value of semis $ content per car for each of the various levels of automation are, broadly speaking, driven by factors such as: (1) the greater prevalence of 

In [18]:
def move_new_files():
    separate_ts = 1702080000  # 2023-12-09
    for fname in os.listdir(DATA_DIR):
        fpath = os.path.join(DATA_DIR, fname)
        if not os.path.isfile(fpath):
            continue
        modtime = os.stat(fpath).st_mtime
        if modtime > separate_ts:
            os.rename(fpath, os.path.join(DATA_DIR_EVAL, fname))
            print('Moved:', fname)

def move_files_to_train():
    for fname in os.listdir(DATA_DIR):
        fpath = os.path.join(DATA_DIR, fname)
        if not os.path.isfile(fpath):
            continue
        os.rename(fpath, os.path.join(DATA_DIR, 'train', fname))

# move_new_files()
move_files_to_train()

'\nIn the last several weeks, we have seen a notable deterioration in investor sentiment around several retail names due to concerns around the overall health of the consumer as student loan payments resume and as gas prices tick higher. However, the GS house view is still constructive based on a still-strong labor market, which we illustrate in our most recent discretionary cash flow model here.\nAs we look into the end of the year and into FY24, we are repositioning our ratings to highlight names that have:\nExposure to value offerings \nEasier compares on a multi-year stack\nValuations that are at historic lows\n        Summary of ratings changes\nUpgrade to Buy: DLTR, OLLI\nDowngrade to Neutral: BJ, EYE\n        Why we maintain our Buy ratings on Dollar General and Target\nDollar General: We think issues are more cyclical than secular – DG is the most discussed stock we cover currently. Questions continue to center on if DG\'s issues are fixable or more secular. While we are loweri

In [None]:
def get_report_links_headline_from_page2(report_link_queries: str) -> List[Dict]:
    """Step 3, return a list of (article url, article title) pairs
    Params:
    - report_link_queries: stringified version of query params gotten from a top-level page
    """
    # report_link_queries = '[{"id":"9c338746-6f30-4701-8ef6-5cd8b453580b","query":"https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%2C%22ja%22%5D&page=1&sort=time&limitTo=%5B%22%22%5D&filter=(publications%20EQ%20%24%7B(%2272d93e79-adc9-4ab9-9897-7df7eef9dc11%22)%20OR%20(%2285f33ec3-153e-44bc-a675-1b881c975807%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)","rank":0,"minNotability":0,"filterType":"none","filterTags":["research:curated/70d7db8c-74fd-4c7c-a279-09893dbaabab"],"requiredProperties":["publicationDateTime","path","distributionHeadline","totalPages","discipline","authors","source","icon","restrictionDetails","coverFlag","videoReport","podcastReport","channel","sourceDisplayName","media","authorsId","reportTypes"],"sortOrder":"publicationDateTime","maxAgeHours":3,"maxReports":1,"reportsToFetchFromSOLR":0,"disableViewMoreLink":true},{"id":"c8c9c3ee-4d64-46ff-b9ef-00fbf20d9b93","query":"https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%5D&page=1&sort=time&limitTo=%5B%22%22%5D&filter=(publications%20EQ%20%24%7B(%22d036cb0f-b058-490d-9e12-e7b8862ebdfd%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)","rank":0,"minNotability":0,"filterType":"none","filterTags":["research:curated/70d7db8c-74fd-4c7c-a279-09893dbaabab"],"requiredProperties":["path","distributionHeadline","totalPages","publicationDateTime","authors","source","discipline","synopsis","coverFlag","restrictionDetails","reportTypes","videoReport","podcastReport","channel","sourceDisplayName","media","authorsId","featuredExhibit"],"sortOrder":"publicationDateTime","maxAgeHours":3,"maxReports":1,"reportsToFetchFromSOLR":0,"disableViewMoreLink":false}]'

    contents = s.post(
        QUERY_URL_TAB,
        data={'q': report_link_queries},
        headers={
          "accept": "application/json;charset=UTF-8",
          "content-type": "application/prs.gir-search-service.v3+json;charset=UTF-8"
        })

    query_results = contents.json()#['results']
    print('query_results', query_results)
    # print('# of query results:', len(query_results))
    return query_results

In [None]:
query_managers = soup.find_all('react-most-popular')
# query_managers = soup.find_all('react-query-manager')
print('# of query managers', len(query_managers))

qids = set()
queries = []
for qm in query_managers:
    # if qm['data-id'] in qids:
    #     continue
    # query_manager_warnings(qm)

    # query = qm.get('data-query-request') or qm.get('data-query-request-tabs')
    query = qm.get('data-tab-queries')

    query = json.loads(b64decode(query).decode('UTF-8'))
    if isinstance(query, list):
        queries += query
    else:
        queries.append(query)
    # qids.add(qm['data-id'])

for q in queries:
    if 'tabName' in q:
        del q['tabName']
    if 'includeFocus' in q:
        del q['includeFocus']
    if 'timePeriod' in q:
        del q['timePeriod']
    if 'threshold' in q:
        del q['threshold']
    if 'searchQuery' in q:
        q['query'] = q['searchQuery']
        del q['searchQuery']

qs = json.dumps(queries, separators=(',', ':'))

res = get_report_links_headline_from_page2(qs)

# of query managers 2
query_results {'timestamp': '2023-11-15T17:49:59.932+00:00', 'status': 406, 'error': 'Not Acceptable', 'path': '/search/reports/advanced-search'}


In [None]:
qs

'[{"id":"2eae8da7-ef3d-4079-920a-1149be3a5e9b","query":"https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%2C%22ja%22%5D&page=1&sort=time&limitTo=%5B%22%22%5D&filter=(publications%20EQ%20%24%7B(%2272d93e79-adc9-4ab9-9897-7df7eef9dc11%22)%20OR%20(%2285f33ec3-153e-44bc-a675-1b881c975807%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)","rank":0,"minNotability":0,"filterType":"none","filterTags":["research:curated/70d7db8c-74fd-4c7c-a279-09893dbaabab"],"requiredProperties":["publicationDateTime","path","distributionHeadline","totalPages","discipline","authors","source","icon","restrictionDetails","coverFlag","videoReport","podcastReport","channel","sourceDisplayName","media","authorsId","reportTypes"],"sortOrder":"publicationDateTime","maxAgeHours":3,"maxReports":1,"reportsToFetchFromSOLR":0,"disableViewMoreLink":true},{"id":"f3edd000-e7b0-438c-8a6f-e71fa4451d34","query":"https://publishing.gs.com/content/research/site/search.html?facets=()&lang

In [None]:
st = "W3sidGltZVBlcmlvZCI6IjRoIiwic2VhcmNoUXVlcnkiOiJodHRwczovL3B1Ymxpc2hpbmcuZ3MuY29tL2NvbnRlbnQvcmVzZWFyY2gvc2l0ZS9zZWFyY2guaHRtbD9mYWNldHM9KCkmbGFuZ3VhZ2U9JTVCJTIyZW4lMjIlNUQmcGFnZT0xJnNvcnQ9dGltZSZsaW1pdFRvPSU1QiUyMm1vZGVsJTIyJTVEJmZpbHRlcj0oZGlzY2lwbGluZXNfYW5kX2Fzc2V0cyUyMEVRJTIwJTI0JTdCKCUyMjdmYzczOTU2LWQ2ZmQtMTFkZi1hMjA0LTAwMTE4NTYzNzExYiUyMiklN0QlMjQlMjBBTkQlMjBzb3VyY2VzJTIwRVElMjAlMjQlN0IoJTIyZTNjM2FkNzctZmQ5OS00ZjMwLTg2MTQtNjZlODg3MDg3N2M5JTIyKSU3RCUyNCUyMEFORCUyMHJlZ2lvbnNfYW5kX2NvdW50cmllcyUyMEVRJTIwJTI0JTdCKCUyMjU1ZjIwODdlLTgyNWUtNDlkNC1hNjg4LTlkZmI0MjExM2Q2MiUyMiklN0QlMjQlMjBBTkQlMjB0b3RhbFBhZ2VzJTIwSU4lMjAlNUIxJTJDOTk5OTklNUQpIiwidGhyZXNob2xkIjowfSx7InRpbWVQZXJpb2QiOiIxZCIsInNlYXJjaFF1ZXJ5IjoiaHR0cHM6Ly9wdWJsaXNoaW5nLmdzLmNvbS9jb250ZW50L3Jlc2VhcmNoL3NpdGUvc2VhcmNoLmh0bWw/ZmFjZXRzPSgpJmxhbmd1YWdlPSU1QiUyMmVuJTIyJTVEJnBhZ2U9MSZzb3J0PXRpbWUmbGltaXRUbz0lNUIlMjJtb2RlbCUyMiU1RCZmaWx0ZXI9KGRpc2NpcGxpbmVzX2FuZF9hc3NldHMlMjBFUSUyMCUyNCU3QiglMjI3ZmM3Mzk1Ni1kNmZkLTExZGYtYTIwNC0wMDExODU2MzcxMWIlMjIpJTdEJTI0JTIwQU5EJTIwc291cmNlcyUyMEVRJTIwJTI0JTdCKCUyMmUzYzNhZDc3LWZkOTktNGYzMC04NjE0LTY2ZTg4NzA4NzdjOSUyMiklN0QlMjQlMjBBTkQlMjByZWdpb25zX2FuZF9jb3VudHJpZXMlMjBFUSUyMCUyNCU3QiglMjI1NWYyMDg3ZS04MjVlLTQ5ZDQtYTY4OC05ZGZiNDIxMTNkNjIlMjIpJTdEJTI0JTIwQU5EJTIwdG90YWxQYWdlcyUyMElOJTIwJTVCMSUyQzk5OTk5JTVEKSIsInRocmVzaG9sZCI6MH0seyJ0aW1lUGVyaW9kIjoiMXciLCJzZWFyY2hRdWVyeSI6Imh0dHBzOi8vcHVibGlzaGluZy5ncy5jb20vY29udGVudC9yZXNlYXJjaC9zaXRlL3NlYXJjaC5odG1sP2ZhY2V0cz0oKSZsYW5ndWFnZT0lNUIlMjJlbiUyMiU1RCZwYWdlPTEmc29ydD10aW1lJmxpbWl0VG89JTVCJTIyJTIyJTVEJmZpbHRlcj0oZGlzY2lwbGluZXNfYW5kX2Fzc2V0cyUyMEVRJTIwJTI0JTdCKCUyMjdmYzczOTU2LWQ2ZmQtMTFkZi1hMjA0LTAwMTE4NTYzNzExYiUyMiklN0QlMjQlMjBBTkQlMjBzb3VyY2VzJTIwRVElMjAlMjQlN0IoJTIyZTNjM2FkNzctZmQ5OS00ZjMwLTg2MTQtNjZlODg3MDg3N2M5JTIyKSU3RCUyNCUyMEFORCUyMHJlZ2lvbnNfYW5kX2NvdW50cmllcyUyMEVRJTIwJTI0JTdCKCUyMjU1ZjIwODdlLTgyNWUtNDlkNC1hNjg4LTlkZmI0MjExM2Q2MiUyMiklN0QlMjQlMjBBTkQlMjB0b3RhbFBhZ2VzJTIwSU4lMjAlNUIxJTJDOTk5OTklNUQpIiwidGhyZXNob2xkIjowfSx7InRpbWVQZXJpb2QiOiIxbSIsInNlYXJjaFF1ZXJ5IjoiaHR0cHM6Ly9wdWJsaXNoaW5nLmdzLmNvbS9jb250ZW50L3Jlc2VhcmNoL3NpdGUvc2VhcmNoLmh0bWw/ZmFjZXRzPSgpJmxhbmd1YWdlPSU1QiUyMmVuJTIyJTVEJnBhZ2U9MSZzb3J0PXRpbWUmbGltaXRUbz0lNUIlMjJtb2RlbCUyMiU1RCZmaWx0ZXI9KGRpc2NpcGxpbmVzX2FuZF9hc3NldHMlMjBFUSUyMCUyNCU3QiglMjI3ZmM3Mzk1Ni1kNmZkLTExZGYtYTIwNC0wMDExODU2MzcxMWIlMjIpJTdEJTI0JTIwQU5EJTIwc291cmNlcyUyMEVRJTIwJTI0JTdCKCUyMmUzYzNhZDc3LWZkOTktNGYzMC04NjE0LTY2ZTg4NzA4NzdjOSUyMiklN0QlMjQlMjBBTkQlMjByZWdpb25zX2FuZF9jb3VudHJpZXMlMjBFUSUyMCUyNCU3QiglMjI1NWYyMDg3ZS04MjVlLTQ5ZDQtYTY4OC05ZGZiNDIxMTNkNjIlMjIpJTdEJTI0JTIwQU5EJTIwdG90YWxQYWdlcyUyMElOJTIwJTVCMSUyQzk5OTk5JTVEKSIsInRocmVzaG9sZCI6MH0seyJ0aW1lUGVyaW9kIjoiMXkiLCJzZWFyY2hRdWVyeSI6Imh0dHBzOi8vcHVibGlzaGluZy5ncy5jb20vY29udGVudC9yZXNlYXJjaC9zaXRlL3NlYXJjaC5odG1sP2ZhY2V0cz0oKSZsYW5ndWFnZT0lNUIlMjJlbiUyMiU1RCZwYWdlPTEmc29ydD10aW1lJmxpbWl0VG89JTVCJTIyJTIyJTVEJmZpbHRlcj0oZGlzY2lwbGluZXNfYW5kX2Fzc2V0cyUyMEVRJTIwJTI0JTdCKCUyMjdmYzczOTU2LWQ2ZmQtMTFkZi1hMjA0LTAwMTE4NTYzNzExYiUyMiklN0QlMjQlMjBBTkQlMjBzb3VyY2VzJTIwRVElMjAlMjQlN0IoJTIyZTNjM2FkNzctZmQ5OS00ZjMwLTg2MTQtNjZlODg3MDg3N2M5JTIyKSU3RCUyNCUyMEFORCUyMHJlZ2lvbnNfYW5kX2NvdW50cmllcyUyMEVRJTIwJTI0JTdCKCUyMjU1ZjIwODdlLTgyNWUtNDlkNC1hNjg4LTlkZmI0MjExM2Q2MiUyMiklN0QlMjQlMjBBTkQlMjB0b3RhbFBhZ2VzJTIwSU4lMjAlNUIxJTJDOTk5OTklNUQpIiwidGhyZXNob2xkIjowfV0="


In [None]:
json.loads(b64decode(st).decode('UTF-8'))

[{'timePeriod': '4h',
  'searchQuery': 'https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%5D&page=1&sort=time&limitTo=%5B%22model%22%5D&filter=(disciplines_and_assets%20EQ%20%24%7B(%227fc73956-d6fd-11df-a204-00118563711b%22)%7D%24%20AND%20sources%20EQ%20%24%7B(%22e3c3ad77-fd99-4f30-8614-66e8870877c9%22)%7D%24%20AND%20regions_and_countries%20EQ%20%24%7B(%2255f2087e-825e-49d4-a688-9dfb42113d62%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)',
  'threshold': 0},
 {'timePeriod': '1d',
  'searchQuery': 'https://publishing.gs.com/content/research/site/search.html?facets=()&language=%5B%22en%22%5D&page=1&sort=time&limitTo=%5B%22model%22%5D&filter=(disciplines_and_assets%20EQ%20%24%7B(%227fc73956-d6fd-11df-a204-00118563711b%22)%7D%24%20AND%20sources%20EQ%20%24%7B(%22e3c3ad77-fd99-4f30-8614-66e8870877c9%22)%7D%24%20AND%20regions_and_countries%20EQ%20%24%7B(%2255f2087e-825e-49d4-a688-9dfb42113d62%22)%7D%24%20AND%20totalPages%20IN%20%5B1%2C99999%5D)',