In [79]:
import datetime
import os
import time
import warnings
import xml.etree.ElementTree as ET
from datetime import datetime as extra_datetime
from os import path

import apimoex
import pandas as pd
import pandas_market_calendars as mcal
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')

# The FMP key is read from the FMP_API_KEY environment variable when it is set.
# The literal is kept only as a fallback so existing runs keep working; it is a
# credential committed to the notebook and should be rotated and removed.
FMP_API_KEY = os.environ.get("FMP_API_KEY", "uILltAaGY2ms0reL0RVtgtALlh2BbYH5")
# Raw string: the backslashes are path separators, not escape sequences.
PATH = r"C:\Program Files (x86)\chromedriver.exe" # driver path
parent_dir = r"C:\Users\Никита\Андан\Project\data" # root for data 

## Stock quotes parser
API MOEX and US markets (through FMP API)

In [8]:
# Company universe, read from the notebook's working directory.
# Expected layout: |sector|country|ticker|...| (these three columns are the ones used by the cells below).
companies = pd.read_csv('companies_list.csv')

In [None]:
# Subset of companies whose country code is 'RU'; the cell displays its (rows, columns) size.
companies_ru = companies.loc[companies['country'] == 'RU']
companies_ru.shape

In [None]:
def parse_tickers(companies, fmp_api_keys):
    '''
    Download historical quotes for every company in `companies` and save one CSV per ticker.

    companies: pandas.DataFrame with at least the columns `sector`, `country` and `ticker`
    fmp_api_keys: iterable of str - FMP API keys, tried in the given order for every US ticker

    Russian tickers (country == 'RU') are requested from MOEX through apimoex, US tickers
    (country == 'USA') from the FMP 4-hour historical-chart endpoint. Files are written to
    <parent_dir>/<sector folder>/<ticker>_<RU|USA>.csv. Rows whose sector has no folder
    mapping are downloaded but not saved, and tickers with no data are skipped.
    Processing of US tickers stops as soon as none of the keys returns data.
    '''
    # Sets the trading board mode as 'TQBR' which is intended for highly liquid and capitalized shares.
    board = 'TQBR'
    today = str(datetime.date.today())

    # Sector name as it appears in the companies table -> sub-folder of parent_dir.
    sector_dirs = {
        'Renewable Energy': 'renewable_energy',
        'Healthcare': 'healthcare_services',
        'Financial Services': 'fintech',
        'Industrials': 'industrial_goods',
    }

    def save_quotes(df, row, market):
        # Writes the quotes into the folder of the company's sector; unknown sectors are not saved.
        folder = sector_dirs.get(row.sector)
        if folder is None:
            return
        df.to_csv(path.join(parent_dir, folder, f'{row.ticker}_{market}.csv'))

    # Russian companies are filtered from a given dataset and subsequently code iterates through them
    companies_ru = companies[companies.country == 'RU']
    with requests.Session() as session:
        for _, row in tqdm(companies_ru.iterrows(), desc = 'Processing russian stock', total = companies_ru.shape[0]):

            # Trading history of the security on the chosen board.
            # Result: a list of dictionaries that can be directly converted into a pandas.DataFrame
            data = apimoex.get_board_history(session, row.ticker, board=board)

            if not data:
                # Nothing to store: skip instead of writing an empty CSV.
                print(f'Empty set for {row.ticker}')
                continue
            save_quotes(pd.DataFrame(data), row, 'RU')

    # The function separates American companies from the dataset
    companies_usa = companies[companies.country == 'USA']

    def get_data(ticker, today):
        # Fetches historical stock data from the FMP API for a given ticker,
        # moving on to the next API key whenever the current one is rejected.
        url = f"https://financialmodelingprep.com/api/v3/historical-chart/4hour/{ticker}"
        for key in fmp_api_keys:
            masked = f"...{key[-4:]}"  # never print the full credential
            try:
                response = requests.get(url, params={'to': today, 'apikey': key}, timeout=30)
                # Without this call a 4xx answer (limit reached, invalid key) would pass silently.
                response.raise_for_status()
                payload = response.json()
            except requests.RequestException as e:
                print(f"API key {masked} failed ({e}), switching keys.")
                continue
            if isinstance(payload, list):
                # Successful answers are a list with one record per candle.
                return pd.DataFrame(payload)
            # Anything else is treated as an error payload for this key.
            print(f"API key {masked} returned no data ({payload}), switching keys.")
        raise ValueError(f"No FMP API key returned data for {ticker}; stopping.")

    for _, row in tqdm(companies_usa.iterrows(), desc='Processing USA stock', total=companies_usa.shape[0]):
        try:
            df = get_data(row.ticker, today)
        except ValueError as e:
            # All keys are exhausted: further requests would fail the same way.
            print(e)
            break

        if df.empty:
            print(f'Empty set for {row.ticker}')
            continue
        save_quotes(df, row, 'USA')
            


In [None]:
# Keys are tried in list order for every US ticker. Only FMP_API_KEY is defined in this
# notebook (config cell), so the list is built from it; append further keys here if available.
FMP_API_KEYS = [FMP_API_KEY]
parse_tickers(companies, FMP_API_KEYS)

Processing russian stock:   0%|          | 0/9 [00:00<?, ?it/s]

Processing USA stock:   0%|          | 0/340 [00:00<?, ?it/s]

Stopped for 25 hours


## USD/RUB parser
CBR XML

In [None]:
def parse_usdrub():
    '''
    This function is needed for parsing USD/RUB exchange rate for the studied period of time.

    Requests the Bank of Russia XML feed for currency R01235 from 01/01/2010 up to today and
    saves the rates, sorted by date, to <parent_dir>/usdrub_rates/data_usdrub.csv
    (date index, single column `usdrub`).

    raises
    requests.HTTPError if the server answers with an error status
    '''
    # Dates in DD/MM/YYYY format
    start_date = datetime.date(2010, 1, 1).strftime('%d/%m/%Y')
    end_date = datetime.date.today().strftime('%d/%m/%Y')

    # URL creating for the further request
    url = f'https://www.cbr.ru/scripts/XML_dynamic.asp?date_req1={start_date}&date_req2={end_date}&VAL_NM_RQ=R01235'
    response_usd = requests.get(url, timeout=30)
    response_usd.raise_for_status()

    # The payload is XML, so it is parsed with the XML parser; passing bytes lets the
    # parser honour the encoding declared in the document itself.
    root = ET.fromstring(response_usd.content)

    dates = []
    usd_rates = []

    # Forms two lists with dates and exchange rates, that will be used for final dataframe
    for record in root.iter('Record'):
        dates.append(record.get('Date'))
        # Rates use a decimal comma
        usd_rates.append(float(record.findtext('Value').replace(',', '.')))

    # Dates arrive as DD.MM.YYYY. The explicit format prevents pandas from reading
    # values such as 02.03.2010 month-first.
    index = pd.to_datetime(dates, format='%d.%m.%Y')
    usdrub = pd.DataFrame(data=usd_rates, index=index, columns=['usdrub'])
    usdrub_final = usdrub.sort_index()

    # Code saves the result to CSV file
    usdrub_final.to_csv(path.join(parent_dir, 'usdrub_rates', 'data_usdrub.csv'))

In [None]:
# Run the USD/RUB download; the result is written to <parent_dir>/usdrub_rates/data_usdrub.csv
parse_usdrub()

## Trading calendar parsing

Below is an example of parsing the trading calendar for the NYSE stock market. Later it will be implemented in code.

In [None]:
# NOTE(review): pandas_market_calendars is already imported in the first cell, so the package
# has to be installed before that cell runs; `%pip install` targets the kernel's own environment.
!pip install pandas_market_calendars

In [22]:
# Date window for the calendar queries: from the start of 2014 up to the current day (ISO format)
start_date = '2014-01-01'
end_date = datetime.date.today().isoformat()

# Calendar object for the 'NYSE' exchange
nyse = mcal.get_calendar('NYSE')

In [37]:
# Schedule of NYSE sessions between start_date and end_date;
# the output below lists market_open and market_close for every session
nyse.schedule(start_date, end_date)

Unnamed: 0,market_open,market_close
2014-01-02,2014-01-02 14:30:00+00:00,2014-01-02 21:00:00+00:00
2014-01-03,2014-01-03 14:30:00+00:00,2014-01-03 21:00:00+00:00
2014-01-06,2014-01-06 14:30:00+00:00,2014-01-06 21:00:00+00:00
2014-01-07,2014-01-07 14:30:00+00:00,2014-01-07 21:00:00+00:00
2014-01-08,2014-01-08 14:30:00+00:00,2014-01-08 21:00:00+00:00
...,...,...
2024-04-30,2024-04-30 13:30:00+00:00,2024-04-30 20:00:00+00:00
2024-05-01,2024-05-01 13:30:00+00:00,2024-05-01 20:00:00+00:00
2024-05-02,2024-05-02 13:30:00+00:00,2024-05-02 20:00:00+00:00
2024-05-03,2024-05-03 13:30:00+00:00,2024-05-03 20:00:00+00:00


In [105]:
# Store the NYSE schedule under the data root; the index is not written to the file
nyse.schedule(start_date, end_date).to_csv(
    path.join(parent_dir, 'trading_calendat_NYSE.csv'),
    index=False,
    encoding='utf-8',
)

In [28]:
# getting only holidays (excluding weekends); the last five entries are shown
nyse.holidays().holidays[-5:]

(numpy.datetime64('2200-06-19'),
 numpy.datetime64('2200-07-04'),
 numpy.datetime64('2200-09-01'),
 numpy.datetime64('2200-11-27'),
 numpy.datetime64('2200-12-25'))

In [34]:
# All trading days between start_date and end_date (the output below is a UTC DatetimeIndex)
nyse.valid_days(start_date, end_date)

DatetimeIndex(['2014-01-02 00:00:00+00:00', '2014-01-03 00:00:00+00:00',
               '2014-01-06 00:00:00+00:00', '2014-01-07 00:00:00+00:00',
               '2014-01-08 00:00:00+00:00', '2014-01-09 00:00:00+00:00',
               '2014-01-10 00:00:00+00:00', '2014-01-13 00:00:00+00:00',
               '2014-01-14 00:00:00+00:00', '2014-01-15 00:00:00+00:00',
               ...
               '2024-04-23 00:00:00+00:00', '2024-04-24 00:00:00+00:00',
               '2024-04-25 00:00:00+00:00', '2024-04-26 00:00:00+00:00',
               '2024-04-29 00:00:00+00:00', '2024-04-30 00:00:00+00:00',
               '2024-05-01 00:00:00+00:00', '2024-05-02 00:00:00+00:00',
               '2024-05-03 00:00:00+00:00', '2024-05-06 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=2603, freq=None)

## S&P500 and US GDP

In [None]:
# Installs the package that provides alpha_vantage.timeseries, imported a few cells below.
# NOTE(review): `%pip install` would target the kernel's own environment.
!pip install alpha_vantage

In [None]:
# limitation: 25 requests per day
# The key is taken from the ALPHAVANTAGE_API_KEY environment variable when it is set.
# The literal is only a fallback so existing runs keep working; it is a credential
# committed to the notebook and should be rotated and removed.
key = os.environ.get('ALPHAVANTAGE_API_KEY', 'LYET8836ZF7IKTSV')

In [73]:
from alpha_vantage.timeseries import TimeSeries

def USA_index_parsing(save_path, key):
    '''
    Download the full daily history of SPY (used here for S&P500) and store it as a csv file.

    save_path: str or path-object - directory where 'S&P500.csv' is written
    key: str - api_key for AlphaVantageAPI

    returns
    None - the file is written as a side effect and a confirmation is printed
    '''
    client = TimeSeries(key)  # Alpha Vantage time-series client

    # All available daily records; the accompanying metadata is not used
    raw_prices, _ = client.get_daily(symbol='SPY', outputsize='full')

    # One row per date, with the date moved from the index into a regular column
    prices = pd.DataFrame(raw_prices).T.reset_index().rename(columns={'index': 'date'})

    # Columns starting with a digit lose their first three characters ('1. open' -> 'open')
    stripped = {name: name[3:] for name in prices.columns if name[0].isdigit()}
    prices = prices.rename(columns=stripped)

    target = path.join(save_path, 'S&P500.csv')
    prices.to_csv(target, index=False, encoding='utf-8')
    print('Parsing is done!')

In [74]:
def parse_usa_gdp(save_path, key):
    '''
    save_path: str or path-object - path to save parsed file
    key: str - api_key for AlphaVantageAPI

    returns
    saves real US GDP historical values (quarterly data) as a csv file 'GDP_USA.csv'

    raises
    requests.HTTPError if the server answers with an error status
    ValueError if the answer carries no 'data' field (for example a request-limit notice)
    '''
    # Getting real US GDP through link alpha vantage api
    url = f'https://www.alphavantage.co/query?function=REAL_GDP&interval=quarterly&apikey={key}'
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    data = r.json()

    # Answers without observations would otherwise fail inside pandas with an
    # unrelated message, so the server's own text is surfaced instead.
    if not isinstance(data, dict) or 'data' not in data:
        raise ValueError(f'Alpha Vantage returned no GDP data: {data}')

    # Series-level fields are repeated on every row; each entry of 'data' is one observation
    df = pd.DataFrame(data)
    observations = list(df['data'])
    gdp = df.assign(
        date=[obs['date'] for obs in observations],
        value=[obs['value'] for obs in observations],
    ).drop(columns='data')

    gdp.to_csv(path.join(save_path, 'GDP_USA.csv'), index=False, encoding='utf-8')
    print('Parsing is done!')

In [75]:
# example of usage
save_path = r'C:\Users\Никита\Андан\Project\data\macro'
USA_index_parsing(save_path, key)
parse_usa_gdp(save_path, key)

Parsing is done!
Parsing is done!


## EFFR

Federal Funds Effective Rate. [More about it](https://www.newyorkfed.org/markets/reference-rates/effr)

In [102]:
def parse_effr(driver_path, save_path):
    '''
    driver_path: str or path-object - path to webdriver for Selenium
    save_path: str or path-object - path to save parsed file

    returns 
    saves the FRED FEDFUNDS series (EFFR) as a csv file 'EFFR.csv'

    raises
    IndexError if the CSV download link is not present on the page
    requests.HTTPError if the CSV download answers with an error status
    Any Selenium error raised while loading the page is propagated unchanged.
    '''

    # extracting link from FRED website
    # NOTE(review): recent Selenium 4 releases expect the driver path through a Service
    # object rather than positionally - confirm against the installed version.
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get('https://fred.stlouisfed.org/series/FEDFUNDS')
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "download-button"))
        )
        element.click()

        # give the download menu time to render before the page source is read
        time.sleep(5)
        html_content = driver.page_source
    finally:
        # The browser is always closed; a failure above is no longer swallowed,
        # so the real cause is reported instead of a later NameError.
        driver.quit()

    anchors = BeautifulSoup(html_content, 'html.parser').find_all(
        'a',
        {'class': 'dropdown-item fg-download-csv-chart-gtm fg-download-gtm',
         'id': "download-data-csv"},
    )
    if not anchors:
        raise IndexError('CSV download link was not found on the FEDFUNDS page')
    link = anchors[0].get('href')

    # parsing interest rates
    response = requests.get(r'https://fred.stlouisfed.org' + link, timeout=30)
    # Do not store an error page as if it were the data file
    response.raise_for_status()
    with open(path.join(save_path, 'EFFR.csv'), 'wb') as file:
        file.write(response.content)

    print('Parsing is done!')

In [103]:
# Download the EFFR series with the webdriver located at PATH; 'EFFR.csv' is written into
# save_path, which is defined in the S&P500 / GDP usage cell above.
parse_effr(PATH, save_path)

Parsing is done!
