# TSLA Stock Market Prediction


In [17]:
# first install the yahoo finance library if you don't have it already
!pip install yfinance



In [18]:
from pathlib import Path

import numpy as np
import yfinance as yf

In [19]:
# Create a yfinance Ticker handle for the TSLA symbol and print its repr
tsla = yf.Ticker("TSLA")
print(tsla)
# Fetch the full available price history (period="max"); as the last
# expression of the cell, the resulting frame is shown as the cell output
tsla.history(period="max")


yfinance.Ticker object <TSLA>


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,19.00,25.00,17.54,23.89,18766300,0,0
2010-06-30,25.79,30.42,23.30,23.83,17187100,0,0
2010-07-01,25.00,25.92,20.27,21.96,8218800,0,0
2010-07-02,23.00,23.10,18.71,19.20,5139800,0,0
2010-07-06,20.00,20.00,15.83,16.11,6866900,0,0
...,...,...,...,...,...,...,...
2020-01-28,568.49,576.81,558.08,566.90,11788500,0,0
2020-01-29,575.69,589.80,567.43,580.99,17801500,0,0
2020-01-30,632.42,650.88,618.00,640.81,29005700,0,0
2020-01-31,640.00,653.00,632.52,650.57,15719300,0,0


### Prepare data for training

In [20]:
# Fetch the TSLA history for the 2018-01-01 .. 2018-06-05 window that is
# prepared as training data, then display it as the cell output
stocks = tsla.history(start='2018-01-01', end='2018-06-05')
stocks

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-02,312.00,322.11,311.00,320.53,4352200,0,0
2018-01-03,321.00,325.25,315.55,317.25,4521500,0,0
2018-01-04,312.87,318.55,305.68,314.62,9946300,0,0
2018-01-05,316.62,317.24,312.00,316.58,4591200,0,0
2018-01-08,316.00,337.02,315.50,336.41,9859400,0,0
...,...,...,...,...,...,...,...
2018-05-29,278.51,286.50,276.15,283.76,5666600,0,0
2018-05-30,283.29,295.01,281.60,291.72,7489700,0,0
2018-05-31,287.21,290.37,282.93,284.73,5919700,0,0
2018-06-01,285.86,291.95,283.84,291.82,5424400,0,0


### Observation:
34 days in the requested date range have no data (presumably non-trading days such as weekends and market holidays).

In [21]:
# Drop the columns that are not needed for the label; show the columns first
# so the reader can see what is available.
print(stocks.columns)
# Rebind to a new frame instead of dropping in place, and ignore labels that
# are already gone, so re-running this cell does not raise a KeyError.
stocks = stocks.drop(
    columns=['High', 'Low', 'Volume', 'Dividends', 'Stock Splits'],
    errors='ignore',
)
stocks.head()

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')


Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,312.0,320.53
2018-01-03,321.0,317.25


In [25]:
# Label each row: True when the closing price is above the opening price,
# i.e. the stock gained during that day
stocks['PriceUp'] = stocks['Close'].gt(stocks['Open'])
stocks.head()

Unnamed: 0_level_0,Open,Close,PriceUp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,312.0,320.53,True
2018-01-03,321.0,317.25,False
2018-01-04,312.87,314.62,True
2018-01-05,316.62,316.58,False
2018-01-08,316.0,336.41,True


In [27]:
# Count the True/False labels to see how balanced the two classes are
stocks['PriceUp'].value_counts()

True     55
False    51
Name: PriceUp, dtype: int64

### Discussion
The classes are roughly balanced (55 positive vs. 51 negative examples), which is good for training.

In [28]:
# Final visual check of the frame (Open, Close, PriceUp) before it is written to disk
stocks

Unnamed: 0_level_0,Open,Close,PriceUp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,312.00,320.53,True
2018-01-03,321.00,317.25,False
2018-01-04,312.87,314.62,True
2018-01-05,316.62,316.58,False
2018-01-08,316.00,336.41,True
...,...,...,...
2018-05-29,278.51,283.76,True
2018-05-30,283.29,291.72,True
2018-05-31,287.21,284.73,False
2018-06-01,285.86,291.82,True


In [29]:
# Persist the cleaned frame as JSON. Create the target directory first so the
# export also works when ../data/processed/stock does not exist yet.
output_path = Path('../data/processed/stock/stocks_cleaned.json')
output_path.parent.mkdir(parents=True, exist_ok=True)
stocks.to_json(output_path)

In [39]:
# Optional: uncomment the lines below to build a pandas-profiling report of
# `stocks`, write it to stocks_df_report.html and embed it in the notebook.
# NOTE: the snippet rebinds the name `profile` from the imported module to the
# report object, so the module alias is no longer usable afterwards.
# import pandas_profiling as profile
# profile = profile.ProfileReport(stocks, title='Pandas Profiling Report', html={'style':{'full_width':True}})
# profile.to_file(output_file="stocks_df_report.html")
# profile.to_notebook_iframe()