<a href="https://colab.research.google.com/github/Fournierp/alfred/blob/streamlit/models/Exploratory_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stock Market Data

In this notebook, we perform an exploratory data analysis (EDA) of stock market data, ahead of designing the machine learning model used by the web app.

In [None]:
# Install yfinance into the environment of the running kernel: `%pip` targets
# the kernel's interpreter, whereas `!pip` runs whichever pip is on the shell PATH.
%pip install yfinance

# Market data download
import yfinance as yf

# Data handling
import pandas as pd
import numpy as np

# Standard library helpers (time deltas, plot label rotation)
import datetime
import time
from math import pi

# Bokeh: interactive plotting
import bokeh
import bokeh.io
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, RangeTool, HoverTool
from bokeh.plotting import figure, output_notebook, show, save
# Route Bokeh output to the notebook so that show() renders figures inline.
bokeh.io.output_notebook()

Collecting yfinance
  Downloading https://files.pythonhosted.org/packages/7a/e8/b9d7104d3a4bf39924799067592d9e59119fcfc900a425a12e80a3123ec8/yfinance-0.1.55.tar.gz
Collecting lxml>=4.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/bd/78/56a7c88a57d0d14945472535d0df9fb4bbad7d34ede658ec7961635c790e/lxml-4.6.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 29.3MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.55-py2.py3-none-any.whl size=22616 sha256=3f5e39661746649eb83e8aed9e9605e51ada8be829e6de956f8f049d642edd1e
  Stored in directory: /root/.cache/pip/wheels/04/98/cc/2702a4242d60bdc14f48b4557c427ded1fe92aedf257d4565c
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfull

# EDA

In [None]:
# Fetch the price history that yfinance returns by default for the ticker,
# then move the date index into a regular "Date" column.
company = "TSLA"
df = yf.download(company).reset_index()
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,4.778000,93831500
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,4.766000,85935500
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,4.392000,41094000
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,3.840000,25699000
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,3.222000,34334500
...,...,...,...,...,...,...,...
2665,2021-01-29,830.000000,842.409973,780.099976,793.530029,793.530029,34990800
2666,2021-02-01,814.289978,842.000000,795.559998,839.809998,839.809998,25391400
2667,2021-02-02,844.679993,880.500000,842.200012,872.789978,872.789978,24346200
2668,2021-02-03,877.020020,878.080017,853.059998,854.690002,854.690002,18247000


We can see some dates are missing.

In [None]:
# Make sure the column holds datetimes so consecutive rows can be subtracted.
df['Date'] = pd.DatetimeIndex(data=df['Date'])
dates = df['Date']

DAY = datetime.timedelta(days=1)

# Walk over consecutive pairs of rows and record every gap longer than one day.
# Each entry is (first missing day, next day present in the data).
missing = []
for previous_day, next_day in zip(dates[:-1], dates[1:]):
    if (next_day - previous_day) > DAY:
        missing.append((previous_day + DAY, next_day))

# Show the ten most recent gaps.
missing[-10:]

[(Timestamp('2020-11-28 00:00:00'), Timestamp('2020-11-30 00:00:00')),
 (Timestamp('2020-12-05 00:00:00'), Timestamp('2020-12-07 00:00:00')),
 (Timestamp('2020-12-12 00:00:00'), Timestamp('2020-12-14 00:00:00')),
 (Timestamp('2020-12-19 00:00:00'), Timestamp('2020-12-21 00:00:00')),
 (Timestamp('2020-12-25 00:00:00'), Timestamp('2020-12-28 00:00:00')),
 (Timestamp('2021-01-01 00:00:00'), Timestamp('2021-01-04 00:00:00')),
 (Timestamp('2021-01-09 00:00:00'), Timestamp('2021-01-11 00:00:00')),
 (Timestamp('2021-01-16 00:00:00'), Timestamp('2021-01-19 00:00:00')),
 (Timestamp('2021-01-23 00:00:00'), Timestamp('2021-01-25 00:00:00')),
 (Timestamp('2021-01-30 00:00:00'), Timestamp('2021-02-01 00:00:00'))]

Most of the missing days are weekends and holidays: the gaps recur about every five trading days and typically span two days (longer when a holiday adjoins the weekend). These are the days on which the stock market is closed.

In [None]:
# Closing-price line chart with a range-selection strip underneath.

# Work on a fresh frame so this cell does not mutate the shared `df`.
dff = df.assign(Date=pd.to_datetime(df["Date"]))
dates = dff['Date']
if dates.empty:
    raise ValueError("No price data available for \"" + company + "\"")
source = ColumnDataSource(data=dict(date=dates, close=dff['Close']))

# Start zoomed on the most recent 100 rows, or on the whole history when fewer
# rows exist (asking for position -100 of a shorter series raises IndexError).
window = min(100, len(dates))
initial_range = (dates.iloc[-window], dates.iloc[-1])

# Main figure: horizontal panning only, x-axis drawn above the plot.
p2 = figure(plot_height=300, plot_width=800, tools="xpan", toolbar_location=None,
            x_axis_type="datetime", x_axis_location="above",
            title="Stock Market closing price graph for \""+company+"\"",
            background_fill_color="#efefef", x_range=initial_range)

p2.line('date', 'close', source=source)
p2.yaxis.axis_label = 'Price'

hover_tool = HoverTool(
    tooltips=[
        ('date',  '@date{%F}'),
        ('close', '$@{close}{%0.2f}'),  # leading "$" is a literal currency sign
    ],

    formatters={
        '@date':    'datetime',  # use 'datetime' formatter for the '@date' field
        '@{close}': 'printf',    # use 'printf' formatter for the '@{close}' field
                                 # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)
p2.add_tools(hover_tool)

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

# Overview strip: shares the y-range with the main figure and hosts the
# RangeTool that drives the main figure's x-range.
select = figure(title="Drag the middle and edges of the selection box to change the range above",
                plot_height=130, plot_width=800, y_range=p2.y_range, tools=TOOLS,
                x_axis_type="datetime", y_axis_type=None,
                toolbar_location=None, background_fill_color="#efefef")

range_tool = RangeTool(x_range=p2.x_range)
range_tool.overlay.fill_color = "navy"
range_tool.overlay.fill_alpha = 0.2

select.line('date', 'close', source=source)
select.ygrid.grid_line_color = None
select.add_tools(range_tool, hover_tool)
select.toolbar.active_multi = range_tool

# To export the figure to a file instead, use:
# save(column(p2, select), filename="plots/tsla_closing.html")
bokeh.plotting.show(column(p2, select))

In [None]:
# Candlestick chart of the 50 most recent rows.

# Take an explicit copy of the slice: assigning into a bare slice of `df`
# triggers pandas' SettingWithCopyWarning.
dff = df[-50:].copy()
dff["Date"] = pd.to_datetime(dff["Date"])

# Rows that closed above / below their open. Rows where Open == Close fall in
# neither mask, so only their high-low segment is drawn.
inc = dff.Close > dff.Open
dec = dff.Open > dff.Close
w = 12*60*60*1000 # half day in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=900,
           title="Candlestick graph for Stock \""+company+"\"")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha = 0.3

# Separate data sources so each set of bars gets its own hover tool.
inc_source = ColumnDataSource(data=dict(
    open1=dff.Open[inc],
    close1=dff.Close[inc],
    Date1=dff.Date[inc]
))

dec_source = ColumnDataSource(data=dict(
    open2=dff.Open[dec],
    close2=dff.Close[dec],
    Date2=dff.Date[dec]
))

# High-low segment for every row, then the open-close bodies on top.
p.segment(dff.Date, dff.High, dff.Date, dff.Low, color="black")
r1 = p.vbar(x='Date1', width=w, top='open1', bottom='close1', source=inc_source,
            fill_color="#D5E1DD", line_color="black")
r2 = p.vbar(x='Date2', width=w, top='open2', bottom='close2', source=dec_source,
            fill_color="#F2583E", line_color="black")

# Set up the hover tooltips to display some useful data; one tool per renderer
# because the two sources use different column names.
p.add_tools(HoverTool(
    renderers=[r1],
    tooltips=[
        ("Open", "$@open1"),
        ("Close", "$@close1"),
        ("Date", "@Date1{%F}"),
    ],
    formatters={
        '@Date1': 'datetime',
    }))

p.add_tools(HoverTool(
    renderers=[r2],
    tooltips=[
        ("Open", "$@open2"),
        ("Close", "$@close2"),
        ("Date", "@Date2{%F}")
    ],
    formatters={
        '@Date2': 'datetime'
    }))

# To export the figure to a file instead, use:
# save(p, filename="plots/tsla_candlestick.html")
bokeh.plotting.show(p)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
