In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm_notebook
import dill
from tqdm import tqdm_notebook
from alpha_vantage.timeseries import TimeSeries

After attempting to do things _correctly_ with SQL, I realized I would burn a lot of time teaching myself how to process relatively little data. So I've decided to collect all my data gathering into one notebook, and use Pandas instead of SQLite, because I can stand that up _adequately and quickly_.

First, let's get the historical PDUFA dates.

In [3]:
from urllib2 import urlopen
import ics
import re
# Ticker pattern: three or four capital letters anchored at the very start of
# the text, immediately followed by one non-word character. That trailing
# character is part of every match.
tickerRe = re.compile(
    r"\A[A-Z]{3,4}\W"
)
# Naive local timestamp taken when this cell runs.
today = datetime.now()

In [4]:
# Public Google Calendar iCal (.ics) feed; it is downloaded and parsed in the
# next cell to obtain the historical PDUFA events.
FdaUrl = (
    "https://calendar.google.com/calendar/ical/"
    "5dso8589486irtj53sdkr4h6ek%40group.calendar.google.com"
    "/public/basic.ics"
)

In [5]:
# Download the iCal feed and parse it into an ics.Calendar. The HTTP response
# is closed explicitly so the connection is released even if reading or
# parsing raises.
# NOTE(review): the payload is decoded as ISO-8859-1; iCalendar feeds are
# normally UTF-8, so non-ASCII event text may come out garbled -- confirm
# before relying on anything other than the ASCII ticker prefixes.
_fda_response = urlopen(FdaUrl)
try:
    FdaCal = ics.Calendar(_fda_response.read().decode('iso-8859-1'))
finally:
    _fda_response.close()

In [6]:
# Display the calendar's repr as a quick sanity check that the feed parsed.
FdaCal

<Calendar with 551 events>

In [7]:
# Collect the distinct ticker-like prefixes found at the start of the calendar
# event titles.
past_pdufa_syms = set()
for event in tqdm_notebook(FdaCal.events):
    # An event without a title cannot carry a ticker; treat it as empty text
    # instead of letting the regex call fail on None.
    matches = re.findall(tickerRe, event.name or "")
    if matches:
        # tickerRe always captures exactly one trailing non-word character
        # (space, ".", ":", "-", ...). Slice it off rather than stripping only
        # whitespace and periods, so no other delimiter ends up inside the
        # symbol that is later sent to the price API.
        eComp = str(matches[0][:-1])
        past_pdufa_syms.add(eComp)




That's all the ticker symbols in the past PDUFA list. Let's run them through the Alpha Vantage API.

In [8]:
# Read the Alpha Vantage API key from a local file and build the time-series
# client configured to return pandas objects. The `with` block closes the key
# file even if reading it or constructing the client raises.
with open("alphavantage.apikey", "r") as av_key_handle:
    ts = TimeSeries(key=av_key_handle.read().strip(), output_format='pandas')

In [9]:
# Downloaded daily price frames, keyed by the symbol reported in the API
# response metadata; filled in by the download loop.
dataframes = {}

In [10]:
# Download the full daily price history for every candidate symbol.
# `wins` collects the symbols that returned data, `fails` the ones that did not.
fails = set()
wins = set()
for ticker in tqdm_notebook(past_pdufa_syms):
    try:
        df, meta = ts.get_daily(symbol=ticker, outputsize='full')
        dataframes[meta["2. Symbol"]] = df
    except Exception:
        # Best-effort: a symbol the API cannot serve is recorded and skipped.
        # Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt / SystemExit still stop a long-running download.
        fails.add(ticker)
    else:
        wins.add(meta["2. Symbol"])




In [11]:
# Report how many symbols failed and how many succeeded. A single
# pre-formatted argument prints "<fails> <wins>" identically under Python 2
# and Python 3 (the bare print statement is a syntax error on Python 3).
print("%d %d" % (len(fails), len(wins)))

52 155


Now we'll run through the past FDA dates and join the FDA actions onto each company's price DataFrame.

In [12]:
# Symbols for which price data was downloaded (the keys of `dataframes`).
# NOTE(review): under Python 2 this is a list snapshot; under Python 3 it
# would be a live view of the dict's keys.
companies = dataframes.keys()

In [13]:
# Build, for every company with price data, a frame of daily prices plus a
# True/False "pdufa?" column marking the days that carry an FDA calendar event.

# Index the calendar once: ticker -> set of "YYYY-MM-DD" event dates. This
# avoids re-scanning (and re-matching) every event for every company.
pdufa_dates_by_ticker = dict()
for event in FdaCal.events:
    # An event without a title cannot carry a ticker; treat it as empty text.
    matches = re.findall(tickerRe, event.name or "")
    if matches:
        # Drop the single trailing non-word character that tickerRe captures
        # so symbols can be compared exactly. A substring test would attach
        # the events of a four-letter ticker to any three-letter ticker that
        # happens to be contained in it.
        event_ticker = matches[0][:-1]
        event_day = event.begin.datetime.strftime("%Y-%m-%d")
        # A set keeps one entry per day; repeated dates would otherwise
        # duplicate price rows in the join below.
        pdufa_dates_by_ticker.setdefault(event_ticker, set()).add(event_day)

price_and_fda = dict()
for company in tqdm_notebook(companies):
    company_events = [
        (event_day, True)
        for event_day in sorted(pdufa_dates_by_ticker.get(company, ()))
    ]
    price = dataframes[company]
    raw_dates = pd.DataFrame(company_events, columns = ["date", "pdufa?"])
    dates = raw_dates.set_index("date")
    # Left join on the price index: event dates that are absent from the price
    # index do not appear in the result.
    # NOTE(review): assumes the price frames are indexed by "YYYY-MM-DD"
    # strings so the keys line up -- confirm against the API output.
    final = price.join(dates,rsuffix='_y')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form modifies a temporary (and
    # leaves `final` unchanged) when pandas copy-on-write is active.
    final['pdufa?'] = final['pdufa?'].fillna(False)
    price_and_fda[company] = final
    
                




I know this code is seriously inelegant. However, since this is just for gathering training data (and I'm far more competent with pandas than with SQL), I decided that programmer time matters more than computer time in this case.

In [14]:
# Spot-check one entry: display the per-day "pdufa?" flag column for 'MRK'.
price_and_fda['MRK']['pdufa?']

2000-01-03    False
2000-01-04    False
2000-01-05    False
2000-01-06    False
2000-01-07    False
2000-01-10    False
2000-01-11    False
2000-01-12    False
2000-01-13    False
2000-01-14    False
2000-01-18    False
2000-01-19    False
2000-01-20    False
2000-01-21    False
2000-01-24    False
2000-01-25    False
2000-01-26    False
2000-01-27    False
2000-01-28    False
2000-01-31    False
2000-02-01    False
2000-02-02    False
2000-02-03    False
2000-02-04    False
2000-02-07    False
2000-02-08    False
2000-02-09    False
2000-02-10    False
2000-02-11    False
2000-02-14    False
              ...  
2017-09-25    False
2017-09-26    False
2017-09-27    False
2017-09-28    False
2017-09-29    False
2017-10-02    False
2017-10-03    False
2017-10-04    False
2017-10-05    False
2017-10-06    False
2017-10-09    False
2017-10-10    False
2017-10-11    False
2017-10-12    False
2017-10-13    False
2017-10-16    False
2017-10-17    False
2017-10-18    False
2017-10-19    False


In [15]:
# Persist the joined frames for later use. Pickle data is binary, so the file
# is opened in "wb" mode (text mode can corrupt it on some platforms and fails
# on Python 3), and the `with` block makes sure it is flushed and closed.
with open("Prices_and_FDA_Dates.pkl", "wb") as pkl_file:
    dill.dump(price_and_fda, pkl_file)