#  Stock Market Dataset

ref: https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs

This is the data we will use to train the models that will be used by our Dash web app.

In [58]:
import os
import random

import pandas as pd
import numpy as np

import datetime
import time

import plotly
import plotly.plotly as py
import plotly.graph_objs as go

## EDA

Load 8 random, non-empty files.

In [2]:
os.chdir('data/Stocks/')

# Keep only the non-empty .txt price files, then draw 8 of them at random.
non_empty_txt = []
for entry in os.listdir():
    if entry.endswith('.txt') and os.path.getsize(entry) > 0:
        non_empty_txt.append(entry)

filenames = random.sample(non_empty_txt, 8)
print(filenames)

['vnce.us.txt', 'bt.us.txt', 'mhh.us.txt', 'wrls.us.txt', 'botj.us.txt', 'rnlc.us.txt', 'djd.us.txt', 'ibml.us.txt']


In [4]:
# Read every sampled file, keeping only the two columns used below.
df = [pd.read_csv(name, sep=',')[['Date', 'Close']] for name in filenames]

In [5]:
# Display the Date/Close frame of the first sampled file.
df[0]

Unnamed: 0,Date,Close
0,2013-11-22,286.6000
1,2013-11-25,277.5000
2,2013-11-26,290.0000
3,2013-11-27,297.0000
4,2013-11-29,292.7000
5,2013-12-02,291.8000
6,2013-12-03,293.4000
7,2013-12-04,291.5000
8,2013-12-05,293.8000
9,2013-12-06,291.0000


The data seems to be incomplete.

In [33]:
# Parse the date strings so that consecutive rows can be subtracted.
df[0]['Date'] = pd.DatetimeIndex(data=df[0]['Date'])
dates = df[0]['Date']

DAY = datetime.timedelta(days=1)

# For every pair of consecutive rows more than one day apart, record
# (first absent day, next day that is present in the data).
missing = []
for previous_day, next_day in zip(dates, dates[1:]):
    if (next_day - previous_day) > DAY:
        missing.append((previous_day + DAY, next_day))

missing

[(Timestamp('2013-11-23 00:00:00'), Timestamp('2013-11-25 00:00:00')),
 (Timestamp('2013-11-28 00:00:00'), Timestamp('2013-11-29 00:00:00')),
 (Timestamp('2013-11-30 00:00:00'), Timestamp('2013-12-02 00:00:00')),
 (Timestamp('2013-12-07 00:00:00'), Timestamp('2013-12-09 00:00:00')),
 (Timestamp('2013-12-14 00:00:00'), Timestamp('2013-12-16 00:00:00')),
 (Timestamp('2013-12-21 00:00:00'), Timestamp('2013-12-23 00:00:00')),
 (Timestamp('2013-12-25 00:00:00'), Timestamp('2013-12-26 00:00:00')),
 (Timestamp('2013-12-28 00:00:00'), Timestamp('2013-12-30 00:00:00')),
 (Timestamp('2014-01-01 00:00:00'), Timestamp('2014-01-02 00:00:00')),
 (Timestamp('2014-01-04 00:00:00'), Timestamp('2014-01-06 00:00:00')),
 (Timestamp('2014-01-11 00:00:00'), Timestamp('2014-01-13 00:00:00')),
 (Timestamp('2014-01-18 00:00:00'), Timestamp('2014-01-21 00:00:00')),
 (Timestamp('2014-01-25 00:00:00'), Timestamp('2014-01-27 00:00:00')),
 (Timestamp('2014-02-01 00:00:00'), Timestamp('2014-02-03 00:00:00')),
 (Time

Most of the missing days are holidays and weekends: the gaps recur after every 5 trading days and span 2 days. These gaps are expected, so instead we filter the company stock data on completeness (no NaN values) and on length before using it to train our models.

In [55]:
def has_nan(df):
    """Return a truthy value when at least one cell of *df* is null."""
    null_mask = df.isnull()
    return null_mask.values.any()

In [51]:
def has_few_data(df, min_rows=2000):
    """Tell whether *df* has too few rows to be used.

    Args:
        df: Object exposing ``shape`` (e.g. a pandas DataFrame); only the
            row count ``df.shape[0]`` is inspected.
        min_rows: Minimum number of rows required. Defaults to 2000, the
            threshold previously hard-coded in this function.

    Returns:
        True when ``df`` has strictly fewer than ``min_rows`` rows,
        False otherwise.
    """
    return df.shape[0] < min_rows

In [59]:
def use_company_data(df):
    """Decide whether a company's frame should be kept.

    Returns True only when ``has_nan(df)`` and ``has_few_data(df)`` are both
    falsy, and False otherwise.
    """
    # has_nan is evaluated first; has_few_data only runs on NaN-free frames.
    return not (has_nan(df) or has_few_data(df))

# Measure how long the filter takes over the frames currently held in `df`.
start = time.time()
for company_frame in df:
    use_company_data(company_frame)
end = time.time()
print(end - start)

0.003994464874267578


In [66]:
# Collect the name of every file in the working directory whose contents
# pass use_company_data().
files = []
for file in os.listdir():
    try:
        # A separate name is used so the notebook-level `df` list is not
        # overwritten by a single DataFrame.
        candidate = pd.read_csv(file, sep=',')
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError):
        # Skip entries that cannot be read as CSV (empty files, directories,
        # malformed or non-text content). Anything else, including
        # KeyboardInterrupt, is allowed to propagate instead of being
        # swallowed by a bare `except:`.
        continue
    if use_company_data(candidate):
        files.append(file)

In [68]:
# Number of files that passed the use_company_data() filter.
len(files)

3429

## Data Visualization

In [70]:
# Load the Date/Close columns of the first five retained files.
df = [pd.read_csv(files[i], sep=',')[['Date', 'Close']] for i in range(5)]

In [71]:
# SECURITY NOTE(review): the Plotly username and API key are hard-coded here
# and therefore stored with the notebook. Rotate this key and load the
# credentials from environment variables instead of committing them.
plotly.tools.set_credentials_file(username='Fournierp', api_key='oTCE0D66pVKwpN2e6JQn')

# One colour per plotted company, in the order the frames were loaded.
trace_colors = ['#ea0e0e', '#000000', '#26fc05', '#0542fc', '#fc05f8']

# Build one trace per loaded frame.
# - Labels come from `files`, the list `df` was actually loaded from, rather
#   than from the unrelated random sample in `filenames`.
# - Splitting on '.' keeps tickers of any length intact
#   ('googl.us.txt' -> 'googl'), unlike a fixed [-11:-7] slice.
data = [
    go.Scatter(
        x=frame['Date'],
        y=frame['Close'],
        name=source_file.split('.')[0] + " Close",
        line=dict(color=color),
        opacity=0.8)
    for frame, source_file, color in zip(df, files, trace_colors)
]

# Keep the individual trace names bound for any other cell that uses them.
# All five traces are plotted; previously trace_five was built but left out
# of `data`.
trace_one, trace_two, trace_three, trace_four, trace_five = data

# Date x-axis with 1-month / 6-month / all range buttons and a range slider.
layout = dict(
    title='Time Series',
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label='1m',
                     step='month',
                     stepmode='backward'),
                dict(count=6,
                     label='6m',
                     step='month',
                     stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(
            visible = True
        ),
        type='date'
    )
)

fig = dict(data=data, layout=layout)
py.iplot(fig)


Consider using IPython.display.IFrame instead

