In [80]:
import os
import io
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fetching Data

In [62]:
project_dir = os.path.split(os.getcwd())[0]
path_to_data = os.path.join(project_dir, 'data')

In [63]:
# Loading API keys
path_to_keys = os.path.join(project_dir, 'api_keys.json')
with open(path_to_keys, 'r') as f:
    api_keys = json.load(f)

In [64]:
ticker = 'INTC'
CIK = '0000050863'

## Fetching stock price data

In [65]:
# Stock price url and params
# url = 'https://www.quandl.com/api/v3/datasets/EOD/INTC.csv'
aa_url = 'https://www.alphavantage.co/query'
aa_params = {'function': 'TIME_SERIES_DAILY_ADJUSTED', 'symbol': ticker, 'datatype': 'csv',
             'apikey': api_keys['alphavantage'], 'outputsize': 'full'}


In [66]:
# Accessing data
aa_response = requests.get(aa_url, params=aa_params)
with open(os.path.join(path_to_data, 'raw_TS_prices.csv'), 'w') as f:
    f.write(aa_response.text)

In [67]:
# inspecting data
price_df = pd.read_csv(os.path.join(path_to_data, 'raw_TS_prices.csv'), parse_dates=['timestamp'])
price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5032 entries, 0 to 5031
Data columns (total 9 columns):
timestamp            5032 non-null datetime64[ns]
open                 5032 non-null float64
high                 5032 non-null float64
low                  5032 non-null float64
close                5032 non-null float64
adjusted_close       5032 non-null float64
volume               5032 non-null int64
dividend_amount      5032 non-null float64
split_coefficient    5032 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 353.9 KB


In [68]:
price_df.head()

Unnamed: 0,timestamp,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient
0,2019-09-18,52.13,52.1,51.162,51.74,51.74,13034209,0.0,1.0
1,2019-09-17,52.05,52.14,51.35,51.95,51.95,19641100,0.0,1.0
2,2019-09-16,51.9,52.29,51.7,52.2,52.2,13354600,0.0,1.0
3,2019-09-13,52.76,53.0,52.23,52.54,52.54,18010800,0.0,1.0
4,2019-09-12,53.0,53.33,52.07,53.01,53.01,23308700,0.0,1.0


In [69]:
# data need from time series to fetch text data
start_date = price_df.timestamp.min()
print(start_date)

1999-09-20 00:00:00


## Fetching text data

In [70]:
import re
import datetime
import lxml
from bs4 import BeautifulSoup

In [71]:
edgar_url = 'https://www.sec.gov/cgi-bin/browse-edgar'
edgar_params = {'action': 'getcompany', 'CIK': CIK, 'type': '8-k',
                'owner': 'exclude', 'count': '100', 'output': 'atom', 'start': ''}

In [72]:
edgar_response = requests.get(edgar_url, params=edgar_params)
print(edgar_response.url)

https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000050863&type=8-k&owner=exclude&count=100&output=atom&start=


In [73]:
# Parsing initial response to build list of document urls and filing dates

soup = BeautifulSoup(edgar_response.text, 'lxml')

all_docs = []
# While the link to the next page existing is true
while True:
    # Find all document entries on the page
    entries = soup.find_all('entry')
    # For each entry scrape the filing-date and the url to the text document
    for entry in entries:
        filing_date = entry.find('filing-date').text
        doc_link = re.sub('-index.htm.*', '.txt', entry.find('link')['href'])
        doc_entry = (filing_date, doc_link)
        all_docs.append(doc_entry)
    # Break loop after scraping entries on the current page, but before requesting on the link to the next page which is potentially none existant
    if soup.find_all('link', {'rel': 'next'}) == []:
        break
    # Find link to the next page, request next page, and update soup object to consist of the next page
    nxt_pg_link = soup.find_all('link', {'rel': 'next'})[0]['href']
    nxt_pg = requests.get(nxt_pg_link)
    soup = BeautifulSoup(nxt_pg.text, 'lxml')

In [74]:
doc_df = pd.DataFrame(all_docs, columns=['filing_date', 'url'])
doc_df['filing_date'] = pd.to_datetime(doc_df['filing_date'])
doc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 2 columns):
filing_date    335 non-null datetime64[ns]
url            335 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 5.4+ KB


In [75]:
doc_df.head()

Unnamed: 0,filing_date,url
0,2019-09-13,https://www.sec.gov/Archives/edgar/data/50863/...
1,2019-07-25,https://www.sec.gov/Archives/edgar/data/50863/...
2,2019-05-22,https://www.sec.gov/Archives/edgar/data/50863/...
3,2019-05-09,https://www.sec.gov/Archives/edgar/data/50863/...
4,2019-04-25,https://www.sec.gov/Archives/edgar/data/50863/...


## Combining text data with stock price data

### Converting price data to log yields

In [84]:
price_df['daily_return'] = np.log(price_df['adjusted_close'] / price_df['adjusted_close'].shift(1))
price_df['weekly_return'] = np.log(price_df['adjusted_close'] / price_df['adjusted_close'].shift(5))
price_df.head(13)

Unnamed: 0,timestamp,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient,daily_return,weekly_return
0,2019-09-18,52.13,52.1,51.162,51.74,51.74,13034209,0.0,1.0,,
1,2019-09-17,52.05,52.14,51.35,51.95,51.95,19641100,0.0,1.0,0.004051,
2,2019-09-16,51.9,52.29,51.7,52.2,52.2,13354600,0.0,1.0,0.004801,
3,2019-09-13,52.76,53.0,52.23,52.54,52.54,18010800,0.0,1.0,0.006492,
4,2019-09-12,53.0,53.33,52.07,53.01,53.01,23308700,0.0,1.0,0.008906,
5,2019-09-11,51.6,52.79,51.38,52.79,52.79,18968900,0.0,1.0,-0.004159,0.020091
6,2019-09-10,51.33,51.84,50.83,51.82,51.82,18532000,0.0,1.0,-0.018546,-0.002506
7,2019-09-09,51.06,52.03,51.02,51.59,51.59,20749700,0.0,1.0,-0.004448,-0.011755
8,2019-09-06,50.25,50.99,50.07,50.92,50.92,17067900,0.0,1.0,-0.013072,-0.031319
9,2019-09-05,49.93,51.17,49.87,50.1,50.1,30014200,0.0,1.0,-0.016235,-0.05646


### Reshaping dataframe

In [77]:
# (String ticker, Foo foo) ----> {'Documents'}
def fetch_data(foo, foo):
    pass
    return None

SyntaxError: duplicate argument 'foo' in function definition (<ipython-input-77-e72b0c1a4cb5>, line 5)

In [None]:
## Extrainous but maybe usefull code
"""
# EDGAR url and params
edgar_url = 'https://www.sec.gov/Archives/edgar/full-index/'
edgar_start_yr = start_date.year
edgar_start_qtr = ((start_date.month - 1) // (12 // 4)) + 1
edgar_current_yr = datetime.datetime.now().year
edgar_current_qtr = ((datetime.datetime.now().month - 1) // (12 // 4)) + 1

for yr in range(edgar_start_yr, edgar_current_yr + 1):
    for qtr in range(1, 5):
        if not (((yr == edgar_start_yr) and (qtr < edgar_start_qtr))
                or ((yr == edgar_current_yr) and (qtr > edgar_current_qtr))):
            url = edgar_url + str(yr) + '/QTR' + str(qtr) + '/master.idx'


"""