In [1]:
import dill
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
%matplotlib inline

# Let's turn those time series into something usable!

To start, let's deserialize our normalized and annotated training data slices.

In [2]:
# Deserialize the price slices produced earlier in the pipeline.
# Pickle streams are binary, so the file is opened with 'rb' (text mode can
# corrupt the stream on some platforms and fails outright on Python 3), and
# the `with` block guarantees the handle is closed once loading finishes.
# NOTE(review): dill/pickle can execute arbitrary code while loading -- only
# load files that you generated yourself or otherwise trust.
with open('normalized_stock_price_slices.pkl', 'rb') as slices_file:
    data = dill.load(slices_file)

I'm going to start feature extraction with a library I found called [tsfresh](http://tsfresh.readthedocs.io/en/latest/index.html). There were a handful of libraries available, but this one appears to have the most permissive license and the simplest usage.

tsfresh expects time series data in a pandas dataframe, so let's convert these vectors into one big dataframe with all the required formatting.

In [3]:
def assemble_frame(datum):
    """Turn one annotated price slice into a long-format DataFrame.

    Parameters
    ----------
    datum : sequence
        Indexable record where ``datum[3][0]`` holds two-column rows that
        become the ``date`` and ``norm_price`` columns, ``datum[0]`` and
        ``datum[1]`` are strings joined with "/" to form the event label
        (presumably ticker symbol and event date, judging by the notebook
        output), and ``datum[2]`` is the outcome, coerced with ``int``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``norm_price``, ``event`` and ``outcome``; the
        event label and outcome are repeated on every row.
    """
    price_rows = datum[3][0]
    frame = pd.DataFrame(price_rows, columns=['date', 'norm_price'])

    event_label = datum[0] + "/" + datum[1]
    outcome_flag = int(datum[2])

    frame['event'] = event_label
    frame['outcome'] = outcome_flag
    return frame

In [4]:
# Build one DataFrame per price slice, then concatenate them in a single call.
# Calling pd.concat inside the loop re-copies the growing frame on every
# iteration (quadratic time and memory); collecting the pieces in a list and
# concatenating once yields the same frame in linear time.
slice_frames = []

for line in tqdm_notebook(data):
    try:
        slice_frames.append(assemble_frame(line))
    except Exception:
        # Best-effort: report malformed slices and keep going. Catching
        # Exception (rather than a bare except) still lets Ctrl-C /
        # kernel interrupts stop the loop.
        print("%s %s failed" % (line[0], line[1]))

# Raises ValueError if every slice failed and there is nothing to concatenate.
agg_data = pd.concat(slice_frames, ignore_index=True)

COLL 2015-10-12 failed
NEOS 2015-11-09 failed



In [5]:
# Parse the 'date' column into pandas datetimes and keep the result in a
# separate 'date_stamp' column (the original 'date' column is left as-is).
parsed_dates = pd.to_datetime(agg_data['date'])
agg_data['date_stamp'] = parsed_dates

In [6]:
# Encode each distinct event label as an integer code. pd.factorize returns a
# (codes, uniques) pair; the codes are attached to the frame in the next cell.
event_column = agg_data['event']
event_labels = pd.factorize(event_column)

In [7]:
# First element of the factorize result is the per-row integer code; store it
# as the numeric event identifier column.
event_codes = event_labels[0]
agg_data["event_stamp"] = event_codes

Now we have one long dataframe of labeled price slices (in a tsfresh-ready format), so let's examine it.

In [8]:
# Peek at the first two rows to confirm the column layout.
agg_data.iloc[:2]

Unnamed: 0,date,norm_price,event,outcome,date_stamp,event_stamp
0,2015-12-08,-0.121747,AAAP/2016-06-01,1,2015-12-08,0
1,2015-12-09,-0.116498,AAAP/2016-06-01,1,2015-12-09,0


Let's strike all the rows with null prices (any row containing a null value is dropped).

In [9]:
# Count the null cells in each row. The vectorized isnull().sum(axis=1) gives
# the same integer counts as applying a Python-level sum to every row, without
# invoking a lambda once per row.
agg_data['null'] = agg_data.isnull().sum(axis=1)

In [10]:
# Show a bounded preview rather than the whole (large) frame, so the
# notebook output stays small and readable.
agg_data.head(10)

Unnamed: 0,date,norm_price,event,outcome,date_stamp,event_stamp,null
0,2015-12-08,-0.121747,AAAP/2016-06-01,1,2015-12-08,0,0
1,2015-12-09,-0.116498,AAAP/2016-06-01,1,2015-12-09,0,0
2,2015-12-10,-0.122027,AAAP/2016-06-01,1,2015-12-10,0,0
3,2015-12-11,-0.107012,AAAP/2016-06-01,1,2015-12-11,0,0
4,2015-12-14,-0.098576,AAAP/2016-06-01,1,2015-12-14,0,0
5,2015-12-15,-0.119215,AAAP/2016-06-01,1,2015-12-15,0,0
6,2015-12-16,-0.132142,AAAP/2016-06-01,1,2015-12-16,0,0
7,2015-12-17,-0.117975,AAAP/2016-06-01,1,2015-12-17,0,0
8,2015-12-18,-0.082288,AAAP/2016-06-01,1,2015-12-18,0,0
9,2015-12-21,-0.114307,AAAP/2016-06-01,1,2015-12-21,0,0


In [11]:
# Recompute the per-row null count (same calculation as the earlier cell, so
# re-running is harmless), using the vectorized form instead of a row-wise
# apply with a Python lambda.
agg_data['null'] = agg_data.isnull().sum(axis=1)

In [12]:
# Keep only the rows whose null count is zero, i.e. rows with no missing
# value in any column.
rows_without_nulls = agg_data['null'] == 0
cleaned_agg = agg_data[rows_without_nulls]

In [13]:
# Show a bounded preview of the cleaned frame rather than rendering all rows.
cleaned_agg.head(10)

Unnamed: 0,date,norm_price,event,outcome,date_stamp,event_stamp,null
0,2015-12-08,-0.121747,AAAP/2016-06-01,1,2015-12-08,0,0
1,2015-12-09,-0.116498,AAAP/2016-06-01,1,2015-12-09,0,0
2,2015-12-10,-0.122027,AAAP/2016-06-01,1,2015-12-10,0,0
3,2015-12-11,-0.107012,AAAP/2016-06-01,1,2015-12-11,0,0
4,2015-12-14,-0.098576,AAAP/2016-06-01,1,2015-12-14,0,0
5,2015-12-15,-0.119215,AAAP/2016-06-01,1,2015-12-15,0,0
6,2015-12-16,-0.132142,AAAP/2016-06-01,1,2015-12-16,0,0
7,2015-12-17,-0.117975,AAAP/2016-06-01,1,2015-12-17,0,0
8,2015-12-18,-0.082288,AAAP/2016-06-01,1,2015-12-18,0,0
9,2015-12-21,-0.114307,AAAP/2016-06-01,1,2015-12-21,0,0


Now, extracting features is going to be memory intensive, so let's start with a new notebook and a fresh kernel (shutting down everything else to save RAM).

In [14]:
# Serialize the cleaned frame for the feature-extraction notebook.
# Pickle output is binary, so the file is opened with "wb"; the `with` block
# flushes and closes the handle so the file is complete on disk before
# another kernel tries to read it.
with open("unified_and_stamped_dataframe.pkl", "wb") as unified_file:
    dill.dump(cleaned_agg, unified_file)

Let's also make a smaller train/test split to extract features from.

In [15]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module was deprecated and later removed.
# Prefer the new location and fall back to the old one on legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [27]:
# Split the raw slices 80/20 into train and test sets. A fixed random_state
# makes the split reproducible across kernel restarts, so the serialized
# train/test frames always correspond to the same partition.
train_data, test_data = train_test_split(data, train_size=.8, random_state=42)

In [None]:
# Assemble the training frame: one DataFrame per slice, concatenated once.
# Concatenating inside the loop re-copies the growing frame on every
# iteration; building a list first gives the same result in linear time.
train_frames = []

for line in tqdm_notebook(train_data):
    try:
        train_frames.append(assemble_frame(line))
    except Exception:
        # Best-effort: report malformed slices and continue. Exception
        # (not a bare except) keeps kernel interrupts working.
        print("%s %s failed" % (line[0], line[1]))

# Raises ValueError if every slice failed and there is nothing to concatenate.
train_df = pd.concat(train_frames, ignore_index=True)

In [26]:
# Add the parsed datetime column and an integer event id to the training frame.
train_df['date_stamp'] = pd.to_datetime(train_df['date'])
# Note: the codes are computed from the training frame alone, so an
# event_stamp value here is unrelated to the same value in any other frame.
event_labels = pd.factorize(train_df['event'])
train_df["event_stamp"] = event_labels[0]

# Count null cells per row with a vectorized sum (same counts as a row-wise
# apply, without one Python call per row), then keep only complete rows.
train_df['null'] = train_df.isnull().sum(axis=1)
train_clean = train_df[train_df['null'] == 0]

In [19]:
# Assemble the test frame: one DataFrame per slice, concatenated once.
# Concatenating inside the loop re-copies the growing frame on every
# iteration; building a list first gives the same result in linear time.
test_frames = []

for line in tqdm_notebook(test_data):
    try:
        test_frames.append(assemble_frame(line))
    except Exception:
        # Best-effort: report malformed slices and continue. Exception
        # (not a bare except) keeps kernel interrupts working.
        print("%s %s failed" % (line[0], line[1]))

# Raises ValueError if every slice failed and there is nothing to concatenate.
test_df = pd.concat(test_frames, ignore_index=True)

# Add the parsed datetime column and an integer event id.
test_df['date_stamp'] = pd.to_datetime(test_df['date'])
# Note: the codes are computed from the test frame alone, so an event_stamp
# value here is unrelated to the same value in any other frame.
event_labels = pd.factorize(test_df['event'])
test_df["event_stamp"] = event_labels[0]

# Count null cells per row with a vectorized sum (same counts as a row-wise
# apply, without one Python call per row), then keep only complete rows.
test_df['null'] = test_df.isnull().sum(axis=1)
test_clean = test_df[test_df['null'] == 0]

NEOS 2015-11-09 failed



Now we've got two halves of a dataframe. Let's serialize those for a model.

In [20]:
# Serialize the cleaned train and test frames for the modelling step.
# Pickle output is binary, so each file is opened with "wb"; the `with`
# blocks flush and close the handles so both files are complete on disk.
with open("train_df.pkl", "wb") as train_file:
    dill.dump(train_clean, train_file)

with open("test_df.pkl", "wb") as test_file:
    dill.dump(test_clean, test_file)