### This notebook collects the ids of all FRED time series so that the series themselves can be loaded later

In [2]:
from time import sleep
import json
from selenium import webdriver
from urllib.parse import urlencode
from selenium.webdriver.chrome.options import Options
from functools import wraps
from copy import deepcopy
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')

## Additionally, one needs to install a webdriver for the browser.

## We used Google Chrome and downloaded the driver here: https://chromedriver.chromium.org/downloads

## The downloaded driver needs to be put in /usr/bin (on Linux) so that the webdriver can be found when the browser is started.

In [3]:
# Top-level keys of a FRED JSON response under which the payload may be stored.
response_data = [
    'categories',
    'seriess',
    'tags',
    'releases',
    'release_dates',
    'sources',
    'vintage_dates',
    'observations',
]

# Column names that are parsed into UTC datetimes when a response is preprocessed.
dates = [
    'realtime_start',
    'realtime_end',
    'date',
    'vintage_dates',
    'last_updated',
    'observation_start',
    'observation_end',
    'created',
]

# Base url; every request path is appended to it.
url_root = 'https://api.stlouisfed.org/fred'

### First of all, in order to work with FRED, one needs to request several API keys from the FRED site (https://research.stlouisfed.org/useraccount/apikey)

In [10]:
# FRED API keys used by the request helpers; one of them is picked at random for every request.
# Replace the placeholder below with your own key(s),
# e.g. api_keys = ['abcdefghijklmnopqrstuvwxyz123456'] (just an example, not a real key)
api_keys = ['YOUR-API-KEY']

In [11]:
# these functions are based on https://github.com/avelkoski/FRB

# function to preprocesss pandas dataframe
def _convert(frame):
    frame = frame.apply(pd.to_numeric, errors='ignore')
    for column in frame:
        if column in dates:
            frame[column] = pd.to_datetime(frame[column], utc=True)
    return frame

# function to convert response to pandas dataframe
def _data_frame(content):
    response = json.loads(content)
    key = [x for x in response.keys() if x in response_data][0]
    frame = pd.DataFrame(response[key])
    final_frame = _convert(frame)
    return final_frame

# function to request page by url
def _fetch(url):
    """Open ``url`` in a headless Chrome session and return the trimmed page source.

    Parameters
    ----------
    url : str
        Address to load.

    Returns
    -------
    str
        ``driver.page_source`` without its first 84 and last 20 characters.
    """
    # Number of characters cut from the start / end of the page source in order
    # to parse the data -- presumably the HTML markup Chrome wraps around a raw
    # JSON response; TODO confirm against the Chrome version in use.
    leading_markup = 84
    trailing_markup = 20

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        content = driver.page_source[leading_markup:-trailing_markup]
    finally:
        # always shut the browser down, also when loading the page fails;
        # otherwise every failed request leaves a Chrome process behind
        driver.quit()
    return content

# function
def _url_builder(url_root, path, params):
    # api key can be easily requested on the fred site
    # random choice over different keys is used in order to avoid firewalls
    api_key = np.random.choice(api_keys)
    params['api_key'] = api_key
    url_end = urlencode(params)
    url = "%s%s%s" % (url_root,path,url_end)
    return url

# function to get request
def _get_request(url_root, path, params):
    url = _url_builder(url_root,path,params)
    content = _fetch(url)
    response = _data_frame(content)
    return response

# function to get subcategories of a category
def children(category_id=None, params=None):
    """Request the direct subcategories of a category.

    Parameters
    ----------
    category_id : int, optional
        Id of the parent category.
    params : dict, optional
        Extra query parameters. The dict is copied, so neither a caller's
        dict nor a shared default is modified between calls.

    Returns
    -------
    Whatever ``_get_request`` returns for the ``/category/children?`` path.
    """
    path = '/category/children?'
    # a fresh dict per call: a mutable default would keep the values of
    # earlier calls alive
    query = dict(params or {})
    query['category_id'] = category_id
    query['file_type'] = 'json'
    return _get_request(url_root, path, query)

# function to get time series of a category
def series(category_id=None, params=None):
    """Request the time series that belong to a category.

    Parameters
    ----------
    category_id : int, optional
        Id of the category.
    params : dict, optional
        Extra query parameters. The dict is copied, so neither a caller's
        dict nor a shared default is modified between calls.

    Returns
    -------
    Whatever ``_get_request`` returns for the ``/category/series?`` path.
    """
    path = '/category/series?'
    # a fresh dict per call: a mutable default would keep the values of
    # earlier calls alive
    query = dict(params or {})
    query['category_id'] = category_id
    query['file_type'] = 'json'
    return _get_request(url_root, path, query)

In [12]:
# function to get ids of subcategories
def get_subcategories_id(category):
    """Return the ``id`` column of the subcategories of ``category``.

    An empty Series is returned when ``children`` yields an empty frame.
    """
    subcategory_frame = children(category)
    if subcategory_frame.empty:
        return pd.Series()
    return subcategory_frame['id']


# get ids of the time series of a category
def get_category_series(category):
    """Return the ``id`` and ``frequency_short`` columns of a category's series.

    An empty DataFrame is returned when ``series`` yields an empty frame.
    """
    series_frame = series(category)
    if series_frame.empty:
        return pd.DataFrame()
    return series_frame[['id', 'frequency_short']]

In [20]:
# Ids of the main categories, obtained manually from
# https://fred.stlouisfed.org/categories
# (e.g. https://fred.stlouisfed.org/categories/32991)
main_categories = [
    32991, 10, 32992, 1,
    32455, 32263, 3008, 33060,
]

# Frequency codes: annually, quarterly, monthly, weekly, daily
valid_freq = {'A', 'Q', 'M', 'W', 'D'}

In [21]:
# Crawl state, created once and then extended by the collection loop.
seriess = pd.DataFrame()   # collected series rows
loaded_cat_series = []     # ids of the categories whose series were already requested
categ_series = {}          # category id -> list of collected series

In [28]:
# Loop over all categories and subcategories.
# The goal is to obtain the series ids in order to load the time series later.
#
# The traversal works like a breadth-first search over the category tree:
# we iterate over the main categories and collect all subcategories level by level.
#
# `seriess` and `loaded_cat_series` are created in an earlier cell and are only
# extended here, so re-running this cell after a failure does not request the
# series of already handled categories again.
#
# NOTE(review): `seriess` is never reset inside this loop, so the list written
# for a category also contains the series collected for earlier categories --
# confirm that this is intended.

# 33843 and 33845 are too old. datetime for them doesn't work
skipped_categories = {33843, 33845}

for category in main_categories:  # first categories are done, they are empty
    categ_series = {}
    print(f'Start over new category: {category}')
    if category not in loaded_cat_series:
        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
        seriess = pd.concat([seriess, get_category_series(category)])
        loaded_cat_series.append(category)
        sleep(5)
    print(category)
    subcategories = get_subcategories_id(category)
    i = 0
    while not subcategories.empty:
        new_sub_cat = pd.Series()
        for subcat in subcategories:
            if subcat not in loaded_cat_series and subcat not in skipped_categories:
                print(subcat)
                seriess = pd.concat([seriess, get_category_series(subcat)])
                # wait at least 5 seconds, with random jitter on top
                sleep(max(5, 5 + np.random.normal()))
                loaded_cat_series.append(subcat)
                print("new seriess loaded")
            # children are requested for every subcategory, also for already
            # loaded ones, so that the traversal still reaches the next level
            # (Series.append was removed in pandas 2.0 as well)
            new_sub_cat = pd.concat([new_sub_cat, get_subcategories_id(subcat)])
            sleep(max(5, 5 + np.random.normal()))
        print(f"{i} level of category {category} is done")
        print(f"Number of subcategories {len(subcategories)}")
        print(f"Number of time series {len(seriess)}")
        print()
        subcategories = new_sub_cat.copy()
        i += 1
    sleep(5)

    print('Number of seriess', len(seriess))
    print()
    if not seriess.empty:
        categ_series[category] = list(zip(seriess['id'], seriess['frequency_short']))

        with open(f'categ_series_{category}.json', 'w') as fp:
            json.dump(categ_series, fp)

Start over new category: 10
10
11
new seriess loaded
32250


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

#### If you get the error "JSONDecodeError: Expecting value: line 1 column 1 (char 0)", that is absolutely fine: it is caused by firewalls that block the loading, even though we use Selenium (with BS4 or urllib it is totally impossible to collect data from FRED).

#### You just need to rerun the code many times, as we did.

#### In the end, we collected the ids of all time series available on FRED.

#### You can also add new API keys to `api_keys`.

#### It took us 5 days to load the ids of all time series on FRED.