In [0]:
# Prerequisites for the Kaggle API: install/upgrade the `kaggle` package quietly
# and make sure the ~/.kaggle directory (where kaggle.json is moved later) exists.
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

## Midterm Assignment (Linear Regression for Price Prediction)

Write a Python script that uses linear regression to predict the price of a stock. Pick any company you’d like. This is a fun exercise to learn about data preprocessing, Python, and using machine learning libraries like scikit-learn. Submit your GitHub repository to gradedhomeworkassignments@gmail.com for a grade! This is a required assignment to receive a certificate. Use stock data from any available website. Bonus points for documenting your code well. Here is a tutorial that will help guide you: https://programmingforfinance.com/2018/01/predicting-stock-prices-with-linear-regression/ Good luck!

## In this notebook I'll be using linear regression to forecast a stock's closing price

### Link for the Google colab public notebook: https://colab.research.google.com/drive/1D7AhdyOGBXCA1t0w5pSrLhHRR7Ukt7EX

### Dependencies

In [35]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Suppress every warning for the rest of the session.
# NOTE(review): with no category given this hides all warnings, including any
# pandas / scikit-learn deprecation or SettingWithCopy warnings — confirm that is intended.
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode (imported from plotly.offline above).
init_notebook_mode(connected=True)

#### Function required to use plotly

In [0]:
#@title
def configure_plotly_browser_state():
  """Emit the require.js configuration that plotly needs into the cell output.

  Displays an HTML snippet that loads require.js from the notebook's static
  files and registers the `base` and `plotly` module paths (plotly itself is
  pointed at the plot.ly CDN). The plotting cells of this notebook call it
  right before building and rendering a figure.

  Returns:
    None. The only effect is the HTML written to the calling cell's output.
  """
  # Import from the public IPython.display API so the helper does not depend on
  # the `display` global that is only injected by a running notebook kernel.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

#### Upload Kaggle json

In [2]:
#@title
# Colab's file access feature
from google.colab import files

#retrieve uploaded file
uploaded = files.upload()

#print results
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 71 bytes


#### Download dataset

In [9]:
#@title
# Download the "dgawlik/nyse" dataset archive (nyse.zip) with the Kaggle CLI.
!kaggle datasets download -d dgawlik/nyse

Downloading nyse.zip to /content
 76% 25.0M/32.8M [00:00<00:00, 40.0MB/s]
100% 32.8M/32.8M [00:00<00:00, 93.9MB/s]


### Unzip files

In [10]:
#@title
!unzip nyse.zip
!ls

Archive:  nyse.zip
  inflating: fundamentals.csv        
  inflating: prices-split-adjusted.csv  
  inflating: prices.csv              
  inflating: securities.csv          
ETFs		  nyse.zip    prices-split-adjusted.csv  securities.csv
fundamentals.csv  prices.csv  sample_data		 Stocks


### Load stock data
* I'm loading only the "close" value because that's what I'll be forecasting

In [0]:
# Read only the columns used in this notebook and parse 'date' as datetime.
stock_prices = pd.read_csv(
    'prices-split-adjusted.csv',
    usecols=['date', 'symbol', 'close'],
    parse_dates=['date'],
)

### Stock data overview

In [234]:
# Shape first, then a preview and summary statistics rendered as rich tables.
print('Stock prices data shape:', stock_prices.shape)
display(
    stock_prices.head(),
    stock_prices.describe(),
    stock_prices[['date', 'symbol']].describe(),
)

Stock prices data shape: (851264, 3)


Unnamed: 0,date,symbol,close
0,2016-01-05,WLTW,125.839996
1,2016-01-06,WLTW,119.980003
2,2016-01-07,WLTW,114.949997
3,2016-01-08,WLTW,116.620003
4,2016-01-11,WLTW,114.970001


Unnamed: 0,close
count,851264.0
mean,65.011913
std,75.201216
min,1.59
25%,31.292776
50%,48.48
75%,75.139999
max,1578.130005


Unnamed: 0,date,symbol
count,851264,851264
unique,1762,501
top,2016-08-11 00:00:00,COG
freq,501,1762
first,2010-01-04 00:00:00,
last,2016-12-30 00:00:00,


For this exercise I will use only the 2016 stock prices of a single company, "COG" (reported above as the most frequent symbol); this way the prices should be more stable.

In [235]:
# Keep only the rows of the "COG" symbol dated 2016-01-01 or later.
is_cog = stock_prices['symbol'] == 'COG'
from_2016 = stock_prices['date'] >= '2016-01-01'
cog_stock = stock_prices[is_cog & from_2016]

# Same overview as for the full data set, now for the selected subset.
print('COG stock prices data shape:', cog_stock.shape)
display(
    cog_stock.head(),
    cog_stock.describe(),
    cog_stock[['date', 'symbol']].describe(),
)

COG stock prices data shape: (252, 3)


Unnamed: 0,date,symbol,close
725491,2016-01-04,COG,17.59
725990,2016-01-05,COG,17.57
726489,2016-01-06,COG,16.02
726988,2016-01-07,COG,16.58
727487,2016-01-08,COG,17.200001


Unnamed: 0,close
count,252.0
mean,22.92127
std,2.305573
min,15.48
25%,21.785001
50%,23.32
75%,24.6525
max,26.5


Unnamed: 0,date,symbol
count,252,252
unique,252,1
top,2016-09-08 00:00:00,COG
freq,1,252
first,2016-01-04 00:00:00,
last,2016-12-30 00:00:00,


Let's see this stock price time range

In [236]:
#@title
# First and last calendar day covered by the selected subset.
first_day = cog_stock['date'].min().date()
last_day = cog_stock['date'].max().date()
print('Min date from data set: %s' % first_day)
print('Max date from data set: %s' % last_day)

Min date from data set: 2016-01-04
Max date from data set: 2016-12-30


### Now let's take a look at the complete "close" values history

In [237]:
#@title
configure_plotly_browser_state()  # needed in the cell before rendering a plotly figure

# Single line trace of the close price over time.
close_sc = go.Scatter(x=cog_stock['date'], y=cog_stock['close'])
layout = go.Layout(
    title='Stocks close price',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Close price'},
)
fig = go.Figure(data=[close_sc], layout=layout)
iplot(fig)

### Preprocess the dataset to add, for each row, the "close" value of the previous day and our label, which is the "close" value of the next day.

In [0]:
# Build the lag feature and the label in a single step. `assign` returns a new
# frame, so no column is written onto the filtered selection of `stock_prices`
# (which would raise pandas' SettingWithCopyWarning), and nothing is mutated in place.
cog_stock = cog_stock.assign(
    yesterday_close=cog_stock['close'].shift(1),   # previous row's close (feature)
    tomorrow_close=cog_stock['close'].shift(-1),   # next row's close (label)
).dropna(axis=0)
# dropna removes the first row (no previous close) and the last row (no next close).

### The model will be trained on the first 8 months (January to August) and I'll be forecasting the prices for September and the months after

In [239]:
# Chronological split: rows dated before the split date train the model,
# rows on or after it are held out for testing.
split_date = '2016-09-01'
before_split = cog_stock['date'] < split_date
from_split_on = cog_stock['date'] >= split_date
train = cog_stock[before_split]
test = cog_stock[from_split_on]

# Show the date range covered by each set.
train_dates = train['date']
test_dates = test['date']
print('Min date from train set: %s' % train_dates.min().date())
print('Max date from train set: %s' % train_dates.max().date())
print('Min date from test set: %s' % test_dates.min().date())
print('Max date from test set: %s' % test_dates.max().date())

Min date from train set: 2016-01-05
Max date from train set: 2016-08-31
Min date from test set: 2016-09-01
Max date from test set: 2016-12-29


### Scaling the data, to better fit the linear model

In [0]:
# The two model inputs: the previous row's close and the current close.
feature_columns = ['yesterday_close', 'close']

# The scaler is fitted on the training rows only and then reused unchanged on the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train[feature_columns])
X_test = scaler.transform(test[feature_columns])

### Model training

In [246]:
# Label: the close price of the following row.
y_train = train['tomorrow_close']

# Fit the linear regression on the scaled training features; the fitted
# estimator is the cell's last expression, so its repr is displayed.
model = LinearRegression(n_jobs=-1)
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

### Make prediction on the test set

In [247]:
# Store the forecasts next to the actual values. `assign` builds a new frame
# instead of adding a column onto a selection taken from `cog_stock`, which
# avoids pandas' SettingWithCopyWarning.
test = test.assign(predicted=model.predict(X_test))
test.head()

Unnamed: 0,date,symbol,close,yesterday_close,tomorrow_close,predicted
809365,2016-09-01,COG,24.459999,24.629999,24.41,24.467019
809865,2016-09-02,COG,24.41,24.459999,24.77,24.391682
810365,2016-09-06,COG,24.77,24.41,25.25,24.644323
810865,2016-09-07,COG,25.25,24.77,26.5,25.078229
811365,2016-09-08,COG,26.5,25.25,26.059999,26.104126


In [0]:
# Join train and test sets to plot results.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. Training rows have no 'predicted' value, so they get NaN there.
train = pd.concat([train, test])

In [248]:
#@title
configure_plotly_browser_state()  # display plotly plots

# `predicted` on a row is the forecast of the NEXT row's close (the model is
# fitted on `tomorrow_close`). Shift it down one row so each forecast sits on the
# date it refers to; otherwise the two curves are compared one trading day apart.
# Assumes rows are in chronological order, as the shift-based features already do.
predicted_on_target_date = train['predicted'].shift(1)

stocks = [go.Scatter(x=train['date'], y=train['close'], name=('Real')),
          go.Scatter(x=train['date'], y=predicted_on_target_date, name=('Predicted'))]

layout = go.Layout(title='Stocks close price', xaxis=dict(title='Date'), yaxis=dict(title='Close price'))
fig = go.Figure(data=stocks, layout=layout)
iplot(fig)

As we can see, even with little training data we got a pretty decent result.