# Stock Market Data

In this notebook, we will do some Exploratory Data Analysis of Stock Market data ahead of Machine Learning design used for the web app.

In [1]:
import yfinance as yf

import pandas as pd
import numpy as np

import datetime
import time
from math import pi

from bokeh.layouts import column
from bokeh.models import ColumnDataSource, RangeTool, HoverTool
from bokeh.plotting import figure, output_notebook, show

# EDA

In [2]:
# Download the data
company = "TSLA"
df = yf.download(company)
df = df.reset_index()
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,4.778000,93831500
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,4.766000,85935500
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,4.392000,41094000
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,3.840000,25699000
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,3.222000,34334500
...,...,...,...,...,...,...,...
2610,2020-11-09,439.500000,452.500000,421.000000,421.260010,421.260010,34833000
2611,2020-11-10,420.089996,420.089996,396.029999,410.359985,410.359985,30284200
2612,2020-11-11,416.450012,418.700012,410.579987,417.130005,417.130005,17357700
2613,2020-11-12,415.049988,423.000000,409.519989,411.760010,411.760010,19855100


We can see some dates are missing.

In [3]:
df['Date'] = pd.DatetimeIndex(data=df['Date'])
dates = df['Date']

DAY = datetime.timedelta(days=1)
missing = [(d1+DAY, d2) for d1, d2 in zip(dates, dates[1:]) if (d2 - d1) > DAY]

missing[-10:]

[(Timestamp('2020-09-05 00:00:00'), Timestamp('2020-09-08 00:00:00')),
 (Timestamp('2020-09-12 00:00:00'), Timestamp('2020-09-14 00:00:00')),
 (Timestamp('2020-09-19 00:00:00'), Timestamp('2020-09-21 00:00:00')),
 (Timestamp('2020-09-26 00:00:00'), Timestamp('2020-09-28 00:00:00')),
 (Timestamp('2020-10-03 00:00:00'), Timestamp('2020-10-05 00:00:00')),
 (Timestamp('2020-10-10 00:00:00'), Timestamp('2020-10-12 00:00:00')),
 (Timestamp('2020-10-17 00:00:00'), Timestamp('2020-10-19 00:00:00')),
 (Timestamp('2020-10-24 00:00:00'), Timestamp('2020-10-26 00:00:00')),
 (Timestamp('2020-10-31 00:00:00'), Timestamp('2020-11-02 00:00:00')),
 (Timestamp('2020-11-07 00:00:00'), Timestamp('2020-11-09 00:00:00'))]

Most of the missing days are holiday and week-ends: they occur every 5 days and have a span of 2 days. That is when the stock market closes.

In [4]:
dff = df[-50:]
dff["Date"] = pd.to_datetime(dff["Date"])

inc = dff.Close > dff.Open
dec = dff.Open > dff.Close
w = 12*60*60*1000 # half day in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=900,
           title = "Candlestick graph for Stock \""+company+"\"")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

inc_source = ColumnDataSource(data=dict(
    open1=dff.Open[inc],
    close1=dff.Close[inc],
    Date1=dff.Date[inc]
))

dec_source = ColumnDataSource(data=dict(
    open2=dff.Open[dec],
    close2=dff.Close[dec],
    Date2=dff.Date[dec]
))

p.segment(dff.Date, dff.High, dff.Date, dff.Low, color="black")
r1 = p.vbar(x='Date1', width=w, top='open1', bottom='close1', source=inc_source,
                fill_color="#D5E1DD", line_color="black")
r2 = p.vbar(x='Date2', width=w, top='open2', bottom='close2', source=dec_source,
                fill_color="#F2583E", line_color="black")

# Set up the hover tooltip to display some useful data
p.add_tools(HoverTool(
    renderers=[r1],
    tooltips=[
        ("Open", "$@open1"),
        ("Close", "$@close1"),
        ("Date", "@Date1{%F}"),
    ],
    formatters={
        '@Date1': 'datetime',
    }))

p.add_tools(HoverTool(
    renderers=[r2],
    tooltips=[
        ("Open", "$@open2"),
        ("Close", "$@close2"),
        ("Date", "@Date2{%F}")
    ],
    formatters={
        '@Date2': 'datetime'
    }))

output_notebook()

show(p)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff["Date"] = pd.to_datetime(dff["Date"])


In [5]:
dff = df
dff["Date"] = pd.to_datetime(dff["Date"])
dates = dff['Date']
source = ColumnDataSource(data=dict(date=dates, close=dff['Close']))

p = figure(plot_height=300, plot_width=800, tools="xpan", toolbar_location=None,
           x_axis_type="datetime", x_axis_location="above",
           title = "Stock Market closing price graph for \""+company+"\"",
           background_fill_color="#efefef", x_range=(dates[dates.index[-100]], dates[dates.index[-1]]))

p.line('date', 'close', source=source)
p.yaxis.axis_label = 'Price'

hover_tool = HoverTool(
    tooltips=[
        ( 'date',   '@date{%F}'            ),
        ( 'close',  '$@{close}{%0.2f}' ), # use @{ } for field names with spaces
    ],

    formatters={
        '@date'        : 'datetime', # use 'datetime' formatter for '@date' field
        '@{close}' : 'printf',   # use 'printf' formatter for '@{adj close}' field
                                     # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)
p.add_tools(hover_tool)

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

select = figure(title="Drag the middle and edges of the selection box to change the range above",
                plot_height=130, plot_width=800, y_range=p.y_range, tools=TOOLS, 
                x_axis_type="datetime", y_axis_type=None
                , toolbar_location=None, background_fill_color="#efefef")

range_tool = RangeTool(x_range=p.x_range)
range_tool.overlay.fill_color = "navy"
range_tool.overlay.fill_alpha = 0.2

select.line('date', 'close', source=source)
select.ygrid.grid_line_color = None
select.add_tools(range_tool, hover_tool)
select.toolbar.active_multi = range_tool

output_notebook()
show(column(p, select))