In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm_notebook
import dill
from tqdm import tqdm_notebook
from alpha_vantage.timeseries import TimeSeries

After attempting to do things _correctly_ with SQL, I realized I would burn a lot of time teaching myself how to process relatively little data. So I've decided to collect all my data gathering into one notebook, and use Pandas instead of SQLite, because I can stand that up _adequately and quickly_.

First, let's get the historical PDUFA dates.

In [2]:
from urllib2 import urlopen
import ics
import re
# Pattern for a leading ticker: anchored at the very start of the string,
# 3-4 uppercase letters followed by exactly one non-word character.  The match
# text includes that trailing character.  Strings that start with fewer than 3
# or more than 4 consecutive capitals do not match.
tickerRe = re.compile(r"\A[A-Z]{3,4}\W")
# Local date/time at the moment this cell is executed.
today = datetime.today()

In [3]:
# Public iCal (.ics) feed of the Google Calendar that is downloaded and parsed
# into FdaCal.
FdaUrl = "https://calendar.google.com/calendar/ical/5dso8589486irtj53sdkr4h6ek%40group.calendar.google.com/public/basic.ics"

In [4]:
# Download the feed, decode the raw bytes as ISO-8859-1, and parse the text
# into an ics.Calendar.
# NOTE(review): the HTTP response object is never closed explicitly.
# NOTE(review): the feed's actual encoding is not checked here -- confirm that
# ISO-8859-1 is correct (a UTF-8 feed would decode without error but garble
# non-ASCII characters).
FdaCal = ics.Calendar(urlopen(FdaUrl).read().decode('iso-8859-1'))

In [5]:
# Display the parsed calendar's repr as the cell output (sanity check).
FdaCal

<Calendar with 544 events>

In [7]:
# Collect the distinct ticker symbols that appear at the start of the calendar
# event names.
past_pdufa_syms = set()
for event in tqdm_notebook(FdaCal.events):
    matches = re.findall(tickerRe, event.name)
    if len(matches) >= 1:
        # tickerRe matches the ticker letters plus exactly one trailing
        # non-word character.  Drop that character by position, whatever it
        # is, instead of stripping only whitespace and ".": otherwise a name
        # such as "MRK: ..." would yield the unusable symbol "MRK:".
        # Slicing before str() also keeps a non-ASCII trailing character from
        # reaching str(), which would raise on Python 2.
        eComp = str(matches[0][:-1])
        past_pdufa_syms.add(eComp)




That's all the ticker symbols in the past PDUFA list. Let's run the Alpha Vantage API.

In [8]:
# Build the Alpha Vantage client.  The API key is read from a local file so it
# never appears in the notebook; surrounding whitespace/newline is stripped.
# The `with` block guarantees the key file is closed even if reading the file
# or constructing the client raises.
with open("alphavantage.apikey", "r") as av_key_handle:
    ts = TimeSeries(key=av_key_handle.read().strip(), output_format='pandas')

In [9]:
# Downloaded price data, keyed by the symbol reported in the API metadata
# (filled in by the download loop).
dataframes = dict()

In [10]:
# Download the full daily price history for every ticker found in the calendar.
#   wins         -- symbols (as reported in the API metadata) that were fetched
#   fails        -- tickers whose request or response handling raised
#   fail_reasons -- ticker -> exception, so failures can be inspected afterwards
fails = set()
wins = set()
fail_reasons = dict()
for ticker in tqdm_notebook(past_pdufa_syms):
    try:
        df, meta = ts.get_daily(symbol=ticker, outputsize='full')
        dataframes[meta["2. Symbol"]] = df
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop this long-running loop
        # instead of being silently recorded as a failed ticker.
        fails.add(ticker)
        fail_reasons[ticker] = err
    else:
        wins.add(meta["2. Symbol"])




Now we'll run through our past FDA dates and join the FDA actions to each company's price dataframe.

In [13]:
# Symbols for which price data was successfully downloaded.
companies = dataframes.keys()

In [18]:
# Group the calendar events by the ticker at the start of each event name.
# Scanning the calendar once here avoids re-running the regex over every event
# for every company.
pdufa_dates_by_ticker = dict()
for event in FdaCal.events:
    matches = re.findall(tickerRe, event.name)
    if len(matches) >= 1:
        # tickerRe matches the ticker letters plus exactly one trailing
        # non-word character; drop that character to get the bare ticker.
        event_ticker = matches[0][:-1]
        pdufa_dates_by_ticker.setdefault(event_ticker, []).append(
            (event.begin.datetime.strftime("%Y-%m-%d"), "pdufa"))

# For each company, left-join its event dates onto its price history.  The
# result has the price columns plus an "event" column that is "pdufa" on event
# dates and NaN elsewhere.
price_and_fda = dict()
for company in tqdm_notebook(companies):
    # Exact ticker lookup.  A substring test (`company in match`) would also
    # attach the events of any 4-letter ticker that merely contains this
    # company's 3-letter symbol.
    company_events = pdufa_dates_by_ticker.get(company, [])
    price = dataframes[company]
    raw_dates = pd.DataFrame(company_events, columns=["date", "event"])
    dates = raw_dates.set_index("date")
    # NOTE(review): the join keys are "%Y-%m-%d" strings -- this assumes the
    # price index holds dates in the same string form; confirm.
    final = price.join(dates, rsuffix='_y')
    price_and_fda[company] = final
    
                




I know this code is seriously inelegant. However, since it is just for gathering training data (and I'm far more competent with pandas than SQL), I decided that programmer time is more valuable than computer time in this case.

In [20]:
# Spot-check one company's joined price/event frame.
price_and_fda['MRK']

Unnamed: 0,volume,close,high,open,low,event
2000-01-03,5978800.0,67.63,68.63,68.63,67.50,
2000-01-04,7533100.0,65.25,67.81,67.00,64.75,
2000-01-05,7598300.0,68.25,68.88,65.06,65.06,
2000-01-06,4760500.0,68.38,69.19,67.81,67.69,
2000-01-07,10373300.0,74.94,75.25,69.81,69.63,
2000-01-10,5726800.0,72.75,74.81,74.75,72.13,
2000-01-11,5023300.0,72.81,73.69,72.06,71.38,
2000-01-12,4990400.0,74.44,75.19,73.63,73.44,
2000-01-13,5353500.0,74.75,76.13,74.81,73.81,
2000-01-14,5540200.0,74.13,75.94,75.25,72.69,


In [21]:
# Persist the joined frames for later use.  Pickle data is binary, so the file
# must be opened in "wb" mode (text mode can corrupt it on Windows and fails
# outright on Python 3).  The `with` block ensures the file is flushed and
# closed.
with open("Prices_and_FDA_Dates.pkl", "wb") as pkl_handle:
    dill.dump(price_and_fda, pkl_handle)