# ...So
We now have a dictionary containing dataframes of FDA action dates and stock price time series. Let's load it with dill and begin cutting out our feature space.

In [25]:
import dill
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [26]:
# Load the dictionary of per-ticker DataFrames (prices plus the 'pdufa?' flag).
# Pickle streams are binary, so the file is opened in 'rb' mode; the `with`
# block closes the handle as soon as loading is done.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open('Prices_and_FDA_Dates.pkl', 'rb') as pkl_file:
    company_dataframes = dill.load(pkl_file)
company_list = company_dataframes.keys()

In [27]:
# Count the keys (tickers) of the loaded dictionary.
len(company_list)

155

Let's also load the closing-price index so that we can normalize our prices against it.

In [28]:
# Load the closing-price index frame (per-date CP_mean / CP_stdv columns).
# Opened in binary mode because pickle streams are binary, and inside `with`
# so the file handle is closed deterministically.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open("close_price_stats_frame.pkl", "rb") as pkl_file:
    closing_index = dill.load(pkl_file)

In [29]:
# Preview only the first rows instead of rendering the entire index frame.
closing_index.head(10)

Unnamed: 0,CP_mean,CP_stdv
2000-01-03,28.520755,29.266429
2000-01-04,27.095726,28.010693
2000-01-05,27.200238,28.108746
2000-01-06,27.628152,28.330073
2000-01-07,29.792918,31.265757
2000-01-10,31.145792,32.584798
2000-01-11,30.572311,31.646526
2000-01-12,30.334148,31.266784
2000-01-13,31.466492,33.578784
2000-01-14,31.957738,33.724492


In [30]:
# Select the rows of the 'NEW' frame whose 'pdufa?' flag is True.
company_dataframes['NEW'].loc[company_dataframes['NEW']['pdufa?'] == True]

Unnamed: 0,volume,close,high,open,low,pdufa?
2006-10-06,0.0,11.6,12.8,12.8,11.6,True


In [31]:
# Bind a short name to the 'NEW' frame for experimenting. This is a reference
# to the object stored in the dictionary, not a copy.
testdf = company_dataframes['NEW']

In [32]:
# Work on a re-indexed *copy* of the 'NEW' frame. An in-place reset would also
# modify the frame stored in company_dataframes (testdf was bound to that very
# object), replacing its date index with integers so that later date-based
# joins on that frame no longer align. Building the copy straight from the
# dictionary also keeps this cell idempotent when it is re-run.
testdf = company_dataframes['NEW'].reset_index()

In [33]:
# Boolean-mask selection: keep only the rows where 'pdufa?' is True.
testdf.loc[testdf['pdufa?']]

Unnamed: 0,index,volume,close,high,open,low,pdufa?
1699,2006-10-06,0.0,11.6,12.8,12.8,11.6,True


In [34]:
# Look up the row labelled 1699, the PDUFA row shown in the previous output.
testdf.loc[1699]

index     2006-10-06
volume             0
close           11.6
high            12.8
open            12.8
low             11.6
pdufa?          True
Name: 1699, dtype: object

In [35]:
# Read the close price of row 1699 with a single (row, column) lookup rather
# than building the whole row first and then indexing into it.
testdf.loc[1699, 'close']

11.6

In [36]:
# Index labels of the rows flagged as PDUFA dates: filter the frame first,
# then take the index of what is left.
testind = testdf[testdf['pdufa?'] == True].index

In [37]:
# Display the index object holding the PDUFA row labels.
testind

Int64Index([1699], dtype='int64')

In [38]:
# The first PDUFA row label as a plain integer, usable for row arithmetic.
testind[0]

1699

So general idea:
1. iterate through dictionary of dataframes
1. in each dataframe, find each row flagged in the `pdufa?` column
1. collect the (preceding and following) 120 rows of close prices and volumes
1. return those as an array of close price and volume vectors with the company name and pdufa date as metadata
1. something to the effect of `("Ticker", pdufaDate, (120 preceding close prices and volumes), (120 following close prices and volumes))`

In [39]:
# Sanity-check the left join of one company's prices with the closing-price
# index; preview only the first rows instead of rendering the whole frame.
company_dataframes['ABBV'].join(closing_index, how='left').head(10)

Unnamed: 0,volume,close,high,open,low,pdufa?,CP_mean,CP_stdv
2013-01-02,13767900.0,35.1200,35.400,34.92,34.100,False,21.130602,32.316146
2013-01-03,16739300.0,34.8300,35.000,35.00,34.160,False,21.049670,32.232472
2013-01-04,21372100.0,34.3908,34.890,34.62,34.250,False,21.178519,32.324112
2013-01-07,17897100.0,34.4600,35.450,34.15,34.150,False,21.244621,32.371023
2013-01-08,17863300.0,33.7100,34.640,34.29,33.360,False,21.484443,32.265225
2013-01-09,18800400.0,33.9000,33.950,33.61,33.610,False,21.562688,32.741690
2013-01-10,15658100.0,34.0000,34.000,33.66,33.330,False,21.683117,32.958141
2013-01-11,11191500.0,33.8500,33.900,33.59,33.350,False,21.594980,32.800412
2013-01-14,11584900.0,34.0900,34.250,34.01,33.800,False,21.743537,32.765215
2013-01-15,13040200.0,34.6000,34.720,33.71,33.710,False,21.778615,32.603972


In [40]:
def _padded_column_slice(frame, column, start, stop):
    """Return frame[column] for row labels start .. stop-1 as a list.

    `frame` must carry the default 0..n-1 integer index (as produced by
    reset_index()). Row labels that fall before the first row or past the
    last one yield None, so the result always has stop - start entries.
    """
    n_rows = len(frame)
    return [frame.at[row, column] if 0 <= row < n_rows else None
            for row in range(start, stop)]


# Build one record per PDUFA event:
#   (ticker, pdufa date, (preceding closes, preceding volumes),
#                        (following closes, following volumes))
data = []
for company in tqdm_notebook(company_list):
    # After reset_index() the dates live in the 'index' column and the row
    # labels are consecutive integers, which makes window arithmetic possible.
    df = company_dataframes[company].reset_index()
    pdufa_rows = df.index[df['pdufa?']].tolist()
    for row in pdufa_rows:
        # 120 rows strictly before the PDUFA row.
        preceding = (_padded_column_slice(df, 'close', row - 120, row),
                     _padded_column_slice(df, 'volume', row - 120, row))
        # The PDUFA row itself plus the 120 rows after it (121 entries).
        following = (_padded_column_slice(df, 'close', row, row + 121),
                     _padded_column_slice(df, 'volume', row, row + 121))
        data.append((company, df.at[row, 'index'], preceding, following))




In [17]:
# Serialize the extracted slices. The file is opened in binary mode because
# pickle streams are binary, and inside `with` so the data is flushed and the
# handle closed before the next cell runs.
with open('stock_price_training_slices.pkl', 'wb') as pkl_file:
    dill.dump(data, pkl_file)

So there are our data points, stored as slices of the stock price/volume histories for the 120 days prior to an FDA event: `268*120*2 = 64320` data points in total. Time for some signal processing.

I know this could be done far more elegantly, but I need an adequate solution _yesterday_, not a perfect one next week.

# Part Two

Now that I've made a closing price index, let's normalize the prices to it. I'm also shortening the slices from [120 days prior, day of] to [120 days prior, 7 days prior] for the eventual web-facing app.

Obviously we no longer need following slices for plots, as the data has been annotated. 

In [18]:
def _normalized_history(frame, start, stop):
    """Collect dated, index-normalized closes and dated volumes for a row window.

    `frame` must carry the default 0..n-1 integer index and the columns
    'index' (date), 'close', 'volume', 'CP_mean' and 'CP_stdv'.

    Returns (closes, volumes): two lists with stop - start entries each.
    An entry is (date, (close - CP_mean) / CP_stdv) or (date, volume); rows
    that fall outside the frame are represented by None in both lists.
    """
    n_rows = len(frame)
    closes, volumes = [], []
    for row in range(start, stop):
        if not 0 <= row < n_rows:
            # Window reaches past the available history: pad instead of failing.
            closes.append(None)
            volumes.append(None)
            continue
        day = frame.at[row, 'index']
        z_score = ((frame.at[row, 'close'] - frame.at[row, 'CP_mean'])
                   / frame.at[row, 'CP_stdv'])
        closes.append((day, z_score))
        volumes.append((day, frame.at[row, 'volume']))
    return closes, volumes


# Build one record per PDUFA event: (ticker, pdufa date, (closes, volumes)),
# covering the rows from 120 before the event up to (not including) 7 before.
norm_data = []
for company in tqdm_notebook(company_list):
    # Attach the index statistics by date, then switch to integer row labels.
    df = company_dataframes[company].join(closing_index, how='left').reset_index()
    for row in df.index[df['pdufa?']].tolist():
        norm_data.append((company, df.at[row, 'index'],
                          _normalized_history(df, row - 120, row - 7)))




In [19]:
# Peek at the first two records to eyeball the normalized values.
norm_data[:2]

[('AAAP',
  '2016-06-01',
  ([('2015-12-08', -0.12174738784081005),
    ('2015-12-09', -0.11649757460863203),
    ('2015-12-10', -0.12202718753675167),
    ('2015-12-11', -0.10701232720700836),
    ('2015-12-14', -0.098575729382038818),
    ('2015-12-15', -0.1192150549485034),
    ('2015-12-16', -0.13214151674067587),
    ('2015-12-17', -0.11797484551305558),
    ('2015-12-18', -0.082288199923163566),
    ('2015-12-21', -0.1143065235110669),
    ('2015-12-22', -0.11929035332875815),
    ('2015-12-23', -0.11892191948170955),
    ('2015-12-24', -0.12568038792527547),
    ('2015-12-28', -0.12710162195805946),
    ('2015-12-29', -0.12395220359326434),
    ('2015-12-30', -0.1227671908381152),
    ('2015-12-31', -0.092712355058081633),
    ('2016-01-04', -0.083297778426778893),
    ('2016-01-05', -0.082662390476770778),
    ('2016-01-06', -0.090411169273986777),
    ('2016-01-07', -0.066076615447791726),
    ('2016-01-08', -0.090436371473833085),
    ('2016-01-11', -0.086162818845528177),
  

That looks normalized to me. Let's rejoin the annotations to the data and begin feature extraction.

In [20]:
# Read the score sheet: one whitespace-separated record per line
# (e.g. ticker, date, score). The `with` block closes the file, and blank
# lines are skipped so they cannot produce empty records.
with open("score_sheet_complete.txt", "r") as score_file:
    scores = [line.split() for line in score_file if line.strip()]

In [21]:
# Peek at the first two parsed score records.
scores[:2]

[['AAAP', '2016-06-01', '1'], ['AAAP', '2016-12-28', '1']]

In [22]:
# Attach each score to its price slice. Records are paired by position, so the
# two lists must be in the same order; every pair is checked on (ticker, date)
# and any disagreement is reported with enough detail to locate it.
# zip() stops at the shorter list, so a length difference is reported up front
# rather than silently dropping the unmatched tail.
if len(norm_data) != len(scores):
    print("whoops theres a mismatch: %d slices vs %d scores"
          % (len(norm_data), len(scores)))

norm_data_annotated = []
for position, (datum, score) in enumerate(zip(norm_data, scores)):
    ticker, event_date, history = datum
    # A usable score record has at least ticker, date and score fields.
    if len(score) >= 3 and ticker == score[0] and event_date == score[1]:
        norm_data_annotated.append((ticker, event_date, score[2], history))
    else:
        print("whoops theres a mismatch at record %d: slice %r vs score %r"
              % (position, (ticker, event_date), score))
        

In [23]:
# Peek at the first two annotated records (ticker, date, score, slices).
norm_data_annotated[:2]

[('AAAP',
  '2016-06-01',
  '1',
  ([('2015-12-08', -0.12174738784081005),
    ('2015-12-09', -0.11649757460863203),
    ('2015-12-10', -0.12202718753675167),
    ('2015-12-11', -0.10701232720700836),
    ('2015-12-14', -0.098575729382038818),
    ('2015-12-15', -0.1192150549485034),
    ('2015-12-16', -0.13214151674067587),
    ('2015-12-17', -0.11797484551305558),
    ('2015-12-18', -0.082288199923163566),
    ('2015-12-21', -0.1143065235110669),
    ('2015-12-22', -0.11929035332875815),
    ('2015-12-23', -0.11892191948170955),
    ('2015-12-24', -0.12568038792527547),
    ('2015-12-28', -0.12710162195805946),
    ('2015-12-29', -0.12395220359326434),
    ('2015-12-30', -0.1227671908381152),
    ('2015-12-31', -0.092712355058081633),
    ('2016-01-04', -0.083297778426778893),
    ('2016-01-05', -0.082662390476770778),
    ('2016-01-06', -0.090411169273986777),
    ('2016-01-07', -0.066076615447791726),
    ('2016-01-08', -0.090436371473833085),
    ('2016-01-11', -0.0861628188455281

In [24]:
# Serialize the annotated, normalized slices. Binary mode is required for
# pickle streams, and `with` guarantees the file is flushed and closed.
with open('normalized_stock_price_slices.pkl', 'wb') as pkl_file:
    dill.dump(norm_data_annotated, pkl_file)

That looks normalized and serialized to me (now with dates, for easy dataframe construction for [tsFresh](https://github.com/blue-yonder/tsfresh)). Time to run some peak detection and get creative with feature extraction.