# ...So
We now have a dictionary containing dataframes of FDA action dates and stock price time series. Let's load it with dill and begin carving out our feature space.

In [2]:
import dill
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [3]:
# Load the dictionary of per-company DataFrames (price history plus the
# 'pdufa?' flag column). Pickle streams are binary, so the file is opened in
# 'rb' mode, and the context manager closes the handle once loading finishes.
# NOTE(review): dill/pickle can execute arbitrary code while loading; only open
# files produced by this project.
with open('Prices_and_FDA_Dates.pkl', 'rb') as prices_file:
    company_dataframes = dill.load(prices_file)
# The dictionary keys are what the later loops iterate over.
company_list = company_dataframes.keys()

In [4]:
# Number of keys (companies) in the loaded dictionary.
len(company_list)

186

Let's also load the closing-price index so that we can normalize our prices against it.

In [5]:
# Load the closing-price index series used later to normalize company prices.
# Opened in binary mode (pickle data is bytes) and closed by the context manager.
with open("close_price_series.pkl", 'rb') as index_file:
    closing_index = dill.load(index_file)

In [6]:
# Display the loaded index series (dates mapped to index closing values).
closing_index

2000-01-03    27.105697
2000-01-04    25.714626
2000-01-05    25.839977
2000-01-06    26.257429
2000-01-07    28.418530
2000-01-10    29.682323
2000-01-11    29.124288
2000-01-12    28.900803
2000-01-13    29.900409
2000-01-14    30.369091
2000-01-18    30.573311
2000-01-19    31.521000
2000-01-20    31.686902
2000-01-21    32.574053
2000-01-24    31.603583
2000-01-25    31.171758
2000-01-26    31.573591
2000-01-27    30.780606
2000-01-28    29.986258
2000-01-31    29.302308
2000-02-01    29.794932
2000-02-02    30.406386
2000-02-03    31.289833
2000-02-04    31.859924
2000-02-07    32.929098
2000-02-08    34.547371
2000-02-09    34.626833
2000-02-10    35.371856
2000-02-11    35.461826
2000-02-14    35.772818
                ...    
2017-08-25    34.822801
2017-08-28    35.664894
2017-08-29    35.663518
2017-08-30    35.989440
2017-08-31    36.788699
2017-09-01    36.804418
2017-09-05    36.731590
2017-09-06    36.976773
2017-09-07    36.967663
2017-09-08    36.660766
2017-09-11    36

In [4]:
# Show the rows of the 'NEW' frame whose 'pdufa?' flag is True.
company_dataframes['NEW'].loc[company_dataframes['NEW']['pdufa?'] == True]

Unnamed: 0,volume,close,high,open,low,pdufa?
2006-10-06,0.0,11.6,12.8,12.8,11.6,True


In [5]:
# Short name for experimenting with one company's frame. This binds the same
# object that lives in the dictionary; it is not a copy.
testdf = company_dataframes['NEW']

In [6]:
# Move the date index into an 'index' column so rows get integer labels.
# reset_index() returns a new frame. The earlier in-place form also rewrote
# company_dataframes['NEW'] (testdf was an alias of it), which made this cell
# non-idempotent and left that frame without its date index for later cells.
testdf = company_dataframes['NEW'].reset_index()

In [7]:
# Boolean-mask selection: rows where 'pdufa?' is True, now labelled by integer row number.
testdf.loc[testdf['pdufa?']]

Unnamed: 0,index,volume,close,high,open,low,pdufa?
1699,2006-10-06,0.0,11.6,12.8,12.8,11.6,True


In [8]:
# Fetch one full row by its integer label.
testdf.loc[1699]

index     2006-10-06
volume             0
close           11.6
high            12.8
open            12.8
low             11.6
pdufa?          True
Name: 1699, dtype: object

In [9]:
# Pull the closing price out of that row.
testdf.loc[1699]['close']

11.6

In [10]:
# Integer row labels of every row whose 'pdufa?' flag equals True.
testind = testdf.index[testdf['pdufa?'].eq(True)]

In [11]:
# Display the index object holding the flagged row labels.
testind

Int64Index([1699], dtype='int64')

In [12]:
# First flagged row label as a plain integer, usable for arithmetic on row positions.
testind[0]

1699

So general idea:
1. iterate through dictionary of dataframes
1. in each dataframe, find each row where `pdufa?` is True
1. collect the (preceding and following) 120 rows of close prices and volumes
1. return those as an array of close price and volume vectors with the company name and pdufa date as metadata
1. something to the effect of `("Ticker", pdufaDate, (120 preceding close prices and volumes), (120 following close prices and volumes))`

In [12]:
# Left-join the index series onto one company's date-indexed frame: every
# company row is kept and the index value arrives as 'Close_Price_Index'.
company_dataframes['ABBV'].join(closing_index, how='left')

Unnamed: 0,volume,close,high,open,low,pdufa?,Close_Price_Index
2013-01-02,13767900.0,35.1200,35.400,34.92,34.100,False,20.154024
2013-01-03,16739300.0,34.8300,35.000,35.00,34.160,False,20.209573
2013-01-04,21372100.0,34.3908,34.890,34.62,34.250,False,20.326711
2013-01-07,17897100.0,34.4600,35.450,34.15,34.150,False,20.410941
2013-01-08,17863300.0,33.7100,34.640,34.29,33.360,False,20.510795
2013-01-09,18800400.0,33.9000,33.950,33.61,33.610,False,20.738938
2013-01-10,15658100.0,34.0000,34.000,33.66,33.330,False,20.862919
2013-01-11,11191500.0,33.8500,33.900,33.59,33.350,False,20.805752
2013-01-14,11584900.0,34.0900,34.250,34.01,33.800,False,20.785771
2013-01-15,13040200.0,34.6000,34.720,33.71,33.710,False,20.861152


In [13]:
WINDOW = 120  # number of rows collected before each flagged row


def _rows_or_none(series, rows):
    """Return the values of ``series`` at the integer positions in ``rows``.

    Positions that fall before the first row or past the last row yield None,
    so the returned list always has ``len(rows)`` entries.
    """
    n_rows = len(series)
    return [series.iloc[i] if 0 <= i < n_rows else None for i in rows]


# Build one record per flagged ('pdufa?' == True) row:
#   (ticker, event date, (preceding closes, preceding volumes),
#                        (following closes, following volumes))
# Explicit bounds checks replace the previous bare `except:`, which hid every
# error (including KeyboardInterrupt) and could leave the close/volume lists
# with different lengths if only one of the two lookups failed.
data = []
for company in tqdm_notebook(company_list):
    # reset_index() moves the dates into an 'index' column and labels the rows
    # 0..n-1, so row labels and integer positions coincide.
    df = company_dataframes[company].reset_index()
    closes, volumes = df['close'], df['volume']
    for event_row in df.index[df['pdufa?']].tolist():
        # WINDOW rows strictly before the event row.
        pRange = range(event_row - WINDOW, event_row)
        # The event row itself plus the WINDOW rows after it (WINDOW + 1 values).
        fRange = range(event_row, event_row + WINDOW + 1)
        data.append((
            company,
            df['index'].iloc[event_row],
            (_rows_or_none(closes, pRange), _rows_or_none(volumes, pRange)),
            (_rows_or_none(closes, fRange), _rows_or_none(volumes, fRange)),
        ))




In [15]:
# Persist the slices. Pickle output is binary, so open with 'wb'; the context
# manager flushes and closes the file so the data is fully written to disk.
with open('stock_price_training_slices.pkl', 'wb') as slices_file:
    dill.dump(data, slices_file)

So there are our data points, stored as slices of the stock price/volume histories for the 120 days prior to an FDA event. `268*120*2 = 64320` data points in total. Time for some signal processing.

I know this could be done far more elegantly, but I need an adequate solution _yesterday_, not a perfect one next week.

# THE RETURN

Now I've made a closing price index, so let's normalize the prices to that. I'm also shortening the slices from [120 days prior to day of] to [120 days prior to 7 days prior] for the eventual web-facing app.

Obviously we no longer need following slices for plots, as the data has been annotated. 

In [15]:
# Peek at the frame left behind by the most recently run slicing loop.
# NOTE(review): the 'Close_Price_Index' column in the output suggests the
# normalization loop further down had already been run when this cell was
# executed — confirm cell order so the notebook survives Restart & Run All.
df

Unnamed: 0,index,volume,close,high,open,low,pdufa?,Close_Price_Index
0,2008-11-19,3000.0,1.55,1.55,1.55,1.30,False,10.555484
1,2008-11-20,0.0,1.55,1.55,1.55,1.55,False,9.951893
2,2008-11-21,0.0,1.55,1.55,1.55,1.55,False,9.954019
3,2008-11-24,0.0,1.55,1.55,1.55,1.55,False,10.335679
4,2008-11-25,0.0,1.55,1.55,1.55,1.55,False,10.455493
5,2008-11-26,0.0,1.55,1.55,1.55,1.55,False,10.741020
6,2008-11-28,0.0,1.55,1.55,1.55,1.55,False,10.937216
7,2008-12-01,0.0,1.55,1.55,1.55,1.55,False,10.094567
8,2008-12-02,1000.0,1.75,1.75,1.75,1.75,False,10.517216
9,2008-12-03,0.0,1.75,1.75,1.75,1.75,False,10.740484


In [19]:
LOOKBACK = 120  # the slice starts this many rows before the flagged row
CUTOFF = 7      # range() end offset; the last row kept is CUTOFF + 1 rows before the event

# Build one record per flagged ('pdufa?' == True) row:
#   (ticker, event date, (index-normalized closes, volumes))
# Each slice covers rows event_row-LOOKBACK .. event_row-CUTOFF-1, because the
# end of range() is exclusive (113 values with the defaults above).
# Explicit bounds checks replace the previous bare `except:`, which hid every
# error (including KeyboardInterrupt) and could leave the two lists with
# different lengths if only one of the lookups failed.
norm_data = []
for company in tqdm_notebook(company_list):
    # Attach the index value for each date, then switch to integer row labels
    # (0..n-1) so that labels and positions coincide.
    df = company_dataframes[company].join(closing_index, how='left').reset_index()
    # Normalize the whole column once instead of dividing row by row.
    normalized_closes = df['close'] / df['Close_Price_Index']
    volumes = df['volume']
    n_rows = len(df)
    for event_row in df.index[df['pdufa?']].tolist():
        pCloses, pVolumes = [], []
        for i in range(event_row - LOOKBACK, event_row - CUTOFF):
            # Rows outside the frame are padded with None to keep slice length fixed.
            in_frame = 0 <= i < n_rows
            pCloses.append(normalized_closes.iloc[i] if in_frame else None)
            pVolumes.append(volumes.iloc[i] if in_frame else None)
        norm_data.append((company, df['index'].iloc[event_row], (pCloses, pVolumes)))




In [21]:
# Collect just the normalized close-price list from every record.
[slice_closes for ticker, event_date, (slice_closes, slice_volumes) in norm_data]

[[0.7933874515206879,
  0.80414582271180723,
  0.79544506946215876,
  0.81811826478340077,
  0.83147766328755968,
  0.79881793273254265,
  0.77638174762663092,
  0.79818955029166427,
  0.86190859775345907,
  0.80798535806435734,
  0.80073457741944587,
  0.79977374169166449,
  0.79053887328307237,
  0.78693298837640702,
  0.7893359612890084,
  0.7936043362414702,
  0.84240135405082217,
  0.86033682082671825,
  0.86246373617594174,
  0.84759572197485988,
  0.88657506716664314,
  0.84604915190089713,
  0.85002229490363823,
  0.76934908587972461,
  0.84244542596052296,
  0.78489856813437853,
  0.81225248319883447,
  0.8545933614703296,
  0.86873036368586865,
  0.8752046211904021,
  0.88503842919582976,
  0.79754592141032932,
  0.81500445740611061,
  0.84000951474892127,
  0.83546453600648496,
  0.84444227601684219,
  0.85335410591674243,
  0.86443524292537599,
  0.87481875535556319,
  0.90979035870965097,
  0.91550158744194976,
  0.91322442032059681,
  0.9013445243132836,
  0.9348724502634

In [23]:
# Persist the normalized slices. Pickle output is binary, so open with 'wb';
# the context manager flushes and closes the file once the dump completes.
with open('normalized_stock_price_slices.pkl', 'wb') as norm_slices_file:
    dill.dump(norm_data, norm_slices_file)