<a href="https://colab.research.google.com/github/ArchitGupta16/Stock-Price-Forecasting/blob/main/RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU

from itertools import cycle

# ! pip install plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Import dataset

In [None]:
# Load the HCLTECH price history into a DataFrame.
# NOTE(review): the path presumably points at a file uploaded to the Colab runtime —
# confirm before running in another environment.
csv_path = "/content/HCLTECH.csv"
bist100 = pd.read_csv(csv_path)
bist100.head()

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-11,HCLTECH,EQ,580.0,1550.0,1725.0,1492.0,1560.0,1554.45,1582.72,1192200,188691500000000.0,,,
1,2000-01-12,HCLTECH,EQ,1554.45,1560.0,1678.85,1560.0,1678.85,1678.85,1657.05,344850,57143490000000.0,,,
2,2000-01-13,HCLTECH,EQ,1678.85,1790.0,1813.2,1781.0,1813.2,1813.2,1804.69,53000,9564880000000.0,,,
3,2000-01-14,HCLTECH,EQ,1813.2,1958.3,1958.3,1835.0,1958.3,1958.3,1939.9,270950,52561690000000.0,,,
4,2000-01-17,HCLTECH,EQ,1958.3,2115.0,2115.0,1801.65,1801.65,1801.65,1990.55,428800,85354730000000.0,,,


# Rename columns

In [None]:
# Lower-case the date/OHLC column labels; every other column keeps its original name.
ohlc_labels = ("Date", "Open", "High", "Low", "Close")
bist100 = bist100.rename(columns={label: label.lower() for label in ohlc_labels})
bist100.head()

Unnamed: 0,date,Symbol,Series,Prev Close,open,high,low,Last,close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-11,HCLTECH,EQ,580.0,1550.0,1725.0,1492.0,1560.0,1554.45,1582.72,1192200,188691500000000.0,,,
1,2000-01-12,HCLTECH,EQ,1554.45,1560.0,1678.85,1560.0,1678.85,1678.85,1657.05,344850,57143490000000.0,,,
2,2000-01-13,HCLTECH,EQ,1678.85,1790.0,1813.2,1781.0,1813.2,1813.2,1804.69,53000,9564880000000.0,,,
3,2000-01-14,HCLTECH,EQ,1813.2,1958.3,1958.3,1835.0,1958.3,1958.3,1939.9,270950,52561690000000.0,,,
4,2000-01-17,HCLTECH,EQ,1958.3,2115.0,2115.0,1801.65,1801.65,1801.65,1990.55,428800,85354730000000.0,,,


# Checking null & na value

In [None]:
# Count the missing entries in each column
bist100.isna().sum(axis=0)

date                     0
Symbol                   0
Series                   0
Prev Close               0
open                     0
high                     0
low                      0
Last                     0
close                    0
VWAP                     0
Volume                   0
Turnover                 0
Trades                2844
Deliverable Volume     503
%Deliverble            503
dtype: int64

In [None]:
# Flag each column that contains at least one missing entry
bist100.isnull().any(axis=0)

date                  False
Symbol                False
Series                False
Prev Close            False
open                  False
high                  False
low                   False
Last                  False
close                 False
VWAP                  False
Volume                False
Turnover              False
Trades                 True
Deliverable Volume     True
%Deliverble            True
dtype: bool

# Checking datatype of each column

In [None]:
# Report the Python type of the value stored under index label 0 in the date/OHLC columns
for title, column in (("Date", "date"), ("Open", "open"), ("Close", "close"),
                      ("High", "high"), ("Low", "low")):
    print(f"{title} column data type: ", type(bist100[column][0]))

Date column data type:  <class 'str'>
Open column data type:  <class 'numpy.float64'>
Close column data type:  <class 'numpy.float64'>
High column data type:  <class 'numpy.float64'>
Low column data type:  <class 'numpy.float64'>


# Convert date from string to date format

In [None]:
# Parse the date strings into pandas datetime values (the column remains a regular
# column; the index is not changed here)
bist100['date'] = pd.to_datetime(bist100['date'])
bist100.head()

Unnamed: 0,date,Symbol,Series,Prev Close,open,high,low,Last,close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-11,HCLTECH,EQ,580.0,1550.0,1725.0,1492.0,1560.0,1554.45,1582.72,1192200,188691500000000.0,,,
1,2000-01-12,HCLTECH,EQ,1554.45,1560.0,1678.85,1560.0,1678.85,1678.85,1657.05,344850,57143490000000.0,,,
2,2000-01-13,HCLTECH,EQ,1678.85,1790.0,1813.2,1781.0,1813.2,1813.2,1804.69,53000,9564880000000.0,,,
3,2000-01-14,HCLTECH,EQ,1813.2,1958.3,1958.3,1835.0,1958.3,1958.3,1939.9,270950,52561690000000.0,,,
4,2000-01-17,HCLTECH,EQ,1958.3,2115.0,2115.0,1801.65,1801.65,1801.65,1990.55,428800,85354730000000.0,,,


# Sorting dataset by date

In [None]:
# Order the rows chronologically (row labels are left as they are)
bist100 = bist100.sort_values(by='date')
bist100.head()

Unnamed: 0,date,Symbol,Series,Prev Close,open,high,low,Last,close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-11,HCLTECH,EQ,580.0,1550.0,1725.0,1492.0,1560.0,1554.45,1582.72,1192200,188691500000000.0,,,
1,2000-01-12,HCLTECH,EQ,1554.45,1560.0,1678.85,1560.0,1678.85,1678.85,1657.05,344850,57143490000000.0,,,
2,2000-01-13,HCLTECH,EQ,1678.85,1790.0,1813.2,1781.0,1813.2,1813.2,1804.69,53000,9564880000000.0,,,
3,2000-01-14,HCLTECH,EQ,1813.2,1958.3,1958.3,1835.0,1958.3,1958.3,1939.9,270950,52561690000000.0,,,
4,2000-01-17,HCLTECH,EQ,1958.3,2115.0,2115.0,1801.65,1801.65,1801.65,1990.55,428800,85354730000000.0,,,


In [None]:
# Show the (rows, columns) dimensions of the frame
bist100.shape

(5300, 15)

# Get the duration of dataset

In [None]:
# Report the period covered by the data. The frame was sorted by date earlier, so
# the first and last rows hold the earliest and latest dates.
# The dates are read from the 'date' column with .iloc instead of `row[0]`: an
# integer key on a label-indexed Series is only treated as a position through a
# deprecated fallback, and it silently depends on 'date' being the first column.
first_date = bist100['date'].iloc[0]
last_date = bist100['date'].iloc[-1]
print("Starting date: ", first_date)
print("Ending date: ", last_date)
print("Duration: ", last_date - first_date)

Starting date:  2000-01-11 00:00:00
Ending date:  2021-04-30 00:00:00
Duration:  7780 days 00:00:00


# Monthwise comparison between stock actual, open and close price

In [None]:
# Mean open/close price per calendar month, listed from January to December.
# reindex() fixes the row order, so no intermediate sort is needed.
month_names = bist100['date'].dt.strftime('%B')
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
             'September', 'October', 'November', 'December']
monthvise = (
    bist100[['open', 'close']]
    .groupby(month_names)
    .mean()
    .reindex(new_order, axis=0)
)
monthvise

Unnamed: 0_level_0,open,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
January,676.485498,675.738745
February,746.55908,744.896092
March,695.657819,692.534251
April,626.147156,625.084479
May,605.29922,604.208686
June,599.603898,598.697773
July,603.250216,602.771212
August,616.028054,616.664819
September,644.219954,643.417169
October,633.867483,633.534266


In [None]:
# Grouped bars: mean open price next to mean close price for each month
bar_specs = (
    ('open', 'Stock Open Price', 'crimson'),
    ('close', 'Stock Close Price', 'lightsalmon'),
)

fig = go.Figure()
for column, legend_label, colour in bar_specs:
    fig.add_trace(
        go.Bar(x=monthvise.index, y=monthvise[column], name=legend_label, marker_color=colour)
    )

fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    title='Monthwise comparision between Stock actual, open and close price',
)
fig.show()

# Trend comparison between stock price, open price, close price, high price, low price

In [None]:
# One line per price series over time; traces are relabelled with readable legend names
names = cycle(['Stock Open Price','Stock Close Price','Stock High Price','Stock Low Price'])

price_series = [bist100[column] for column in ('open', 'close', 'high', 'low')]
fig = px.line(bist100, x=bist100.date, y=price_series,
              labels={'date': 'Date','value':'Stock value'})
fig.update_layout(
    title_text='Stock analysis chart',
    font_size=15,
    font_color='black',
    legend_title_text='Stock Parameters',
)
# Traces come back in the order the series were passed, matching the order of `names`
for trace in fig.data:
    trace.update(name=next(names))
for hide_grid in (fig.update_xaxes, fig.update_yaxes):
    hide_grid(showgrid=False)

fig.show()

# Make separate dataframe with close price

In [None]:
# Keep only the date and closing-price columns for the forecasting steps
closedf = bist100.loc[:, ['date', 'close']]
print("Shape of close dataframe:", closedf.shape)

Shape of close dataframe: (5300, 2)


# Plotting stock close price chart

In [None]:
# Closing price over time, drawn on a white background without grid lines
fig = px.line(
    closedf,
    x=closedf['date'],
    y=closedf['close'],
    labels={'date':'Date','close':'Close Stock'},
)
fig.update_layout(title_text='Stock close price chart', plot_bgcolor='white',
                  font_size=15, font_color='black')
fig.update_traces(marker_line_width=2, opacity=0.6)
for hide_grid in (fig.update_xaxes, fig.update_yaxes):
    hide_grid(showgrid=False)
fig.show()

# Normalizing / scaling close value between 0 to 1

In [None]:
# Keep a date+close copy for the later plots, then scale the close prices into [0, 1].
# Both results are derived from bist100 rather than by deleting a column from closedf
# in place: that way the cell does not mutate a slice of bist100 and can be re-run
# after closedf has already been replaced by the scaled array.
# NOTE: the scaler is fitted on every row, i.e. before the train/test split that follows.
close_stock = bist100[['date', 'close']].copy()
scaler = MinMaxScaler(feature_range=(0, 1))
closedf = scaler.fit_transform(close_stock[['close']].to_numpy())  # shape (n_rows, 1)
print(closedf.shape)

(5300, 1)


# Split data for training and testing

Ratio for training and testing data is 70:30

In [None]:
# Chronological split: the first 70% of the rows form the training set,
# the remaining rows form the test set
total_rows = len(closedf)
training_size = int(total_rows * 0.70)
test_size = total_rows - training_size

train_data = closedf[:training_size, :]
test_data = closedf[training_size:total_rows, :1]

for label, split in (("train_data: ", train_data), ("test_data: ", test_data)):
    print(label, split.shape)

train_data:  (3709, 1)
test_data:  (1591, 1)


# Create new dataset according to requirement of time-series prediction

In [None]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, time_step=1, drop_last=True):
    """Turn a series into sliding-window samples for one-step-ahead prediction.

    Each sample pairs ``time_step`` consecutive values from column 0 of
    ``dataset`` (the input window) with the value that immediately follows
    them (the target).

    Parameters
    ----------
    dataset : 2-D array-like indexable as ``dataset[rows, 0]``
        The series to window; only column 0 is read.
    time_step : int, default 1
        Number of consecutive values in each input window.
    drop_last : bool, default True
        When True (the historical behaviour) the final valid window is skipped,
        giving ``len(dataset) - time_step - 1`` samples. Code in this notebook
        that positions predictions for plotting was written against that count,
        so it stays the default. Pass False to use every window
        (``len(dataset) - time_step`` samples).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked input windows and the matching targets. Both arrays are
        empty when the series is too short to form a single sample.
    """
    n_windows = len(dataset) - time_step
    if drop_last:
        n_windows -= 1

    dataX, dataY = [], []
    # range() of a non-positive count is empty, so short inputs yield no samples
    for start in range(n_windows):
        stop = start + time_step
        dataX.append(dataset[start:stop, 0])   # values at start .. stop-1
        dataY.append(dataset[stop, 0])         # the value right after the window
    return np.array(dataX), np.array(dataY)

In [None]:
# Build supervised samples: every input row holds `time_step` consecutive scaled
# closes and the target is the close that immediately follows them
time_step = 15
(X_train, y_train), (X_test, y_test) = (
    create_dataset(split, time_step) for split in (train_data, test_data)
)

for label, samples in (("X_train: ", X_train), ("y_train: ", y_train),
                       ("X_test: ", X_test), ("y_test", y_test)):
    print(label, samples.shape)

X_train:  (3693, 15)
y_train:  (3693,)
X_test:  (1575, 15)
y_test (1575,)


# Algorithms

# Random Forest Regressor - RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest; random_state is fixed so repeated runs build the same model
forest_params = {"n_estimators": 100, "random_state": 0}
regressor = RandomForestRegressor(**forest_params)
regressor.fit(X_train, y_train)

In [None]:
# Predict both splits; each result is reshaped into a single column so it can be
# passed to the scaler afterwards
train_predict = regressor.predict(X_train).reshape(-1, 1)
test_predict = regressor.predict(X_test).reshape(-1, 1)

for label, predictions in (("Train data prediction:", train_predict),
                           ("Test data prediction:", test_predict)):
    print(label, predictions.shape)

Train data prediction: (3693, 1)
Test data prediction: (1575, 1)


In [None]:
# Map the scaled predictions and targets back to price units.
# Caution: train_predict / test_predict are overwritten, so running this cell twice
# would apply the inverse transform twice.
to_price = scaler.inverse_transform

train_predict, test_predict = to_price(train_predict), to_price(test_predict)
original_ytrain = to_price(y_train.reshape(-1, 1))
original_ytest = to_price(y_test.reshape(-1, 1))

Evaluation metrics: RMSE, MSE and MAE

Root Mean Square Error (RMSE), Mean Square Error (MSE) and Mean Absolute Error (MAE) are standard ways to measure the error of a model in predicting quantitative data.

In [None]:
# Evaluation metrics RMSE, MSE and MAE, reported for the train split and then the test split
train_mse = mean_squared_error(original_ytrain, train_predict)
test_mse = mean_squared_error(original_ytest, test_predict)

print("Train data RMSE: ", math.sqrt(train_mse))
print("Train data MSE: ", train_mse)
# This value is computed from the training targets/predictions, so it carries the
# "Train" label (labelling it "Test" would misreport the training error as test error).
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict))
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", math.sqrt(test_mse))
print("Test data MSE: ", test_mse)
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict))

Train data RMSE:  11.073895068179297
Train data MSE:  122.63115198104578
Test data MAE:  4.796847278635258
-------------------------------------------------------------------------------------
Test data RMSE:  39.99020851653399
Test data MSE:  1599.2167771958677
Test data MAE:  20.31981333333326


Explained variance regression score

The explained variance score explains the dispersion of errors of a given dataset, and the formula is written as follows: $\text{explained variance}(y, \hat{y}) = 1 - \frac{Var(y - \hat{y})}{Var(y)}$. Here, $Var(y - \hat{y})$ and $Var(y)$ are the variance of the prediction errors and of the actual values respectively. Scores close to 1.0 are highly desired, indicating that the variance of the errors is small compared with the variance of the actual values.

In [None]:
# Explained variance for each split (actual values first, predictions second)
for split_name, actual, predicted in (("Train", original_ytrain, train_predict),
                                      ("Test", original_ytest, test_predict)):
    print(f"{split_name} data explained variance regression score:",
          explained_variance_score(actual, predicted))

Train data explained variance regression score: 0.9992426303472598
Test data explained variance regression score: 0.9740573572039966


R2 score for regression

R-squared (R2) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by an independent variable or variables in a regression model.

1 = Best

0 or < 0 = worse

In [None]:
# R2 score for each split (actual values first, predictions second)
for split_name, actual, predicted in (("Train", original_ytrain, train_predict),
                                      ("Test", original_ytest, test_predict)):
    print(f"{split_name} data R2 score:", r2_score(actual, predicted))

Train data R2 score: 0.9992426246946053
Test data R2 score: 0.9718023733598554


Regression Loss Mean Gamma deviance regression loss (MGD) and Mean Poisson deviance regression loss (MPD)

In [None]:
# Mean gamma deviance (MGD) followed by mean Poisson deviance (MPD), train then test
deviance_metrics = (("MGD", mean_gamma_deviance), ("MPD", mean_poisson_deviance))
for position, (tag, metric) in enumerate(deviance_metrics):
    if position:
        # separator line between the two metric groups
        print("----------------------------------------------------------------------")
    print(f"Train data {tag}: ", metric(original_ytrain, train_predict))
    print(f"Test data {tag}: ", metric(original_ytest, test_predict))

Train data MGD:  0.0002307855738409278
Test data MGD:  0.0012548126849573113
----------------------------------------------------------------------
Train data MPD:  0.1280582110268222
Test data MPD:  1.3341012174329518


# Comparison between original stock close price vs predicted close price

In [None]:
# Overlay the train/test predictions on the original close series.
# Every prediction belongs to the row that follows its input window, so the train
# predictions start `look_back` rows into the series and the test predictions start
# `look_back` rows into the test split. The slice ends are derived from the number
# of predictions instead of fixed "+1"/"-1" offsets, so the assignment always
# matches the number of windows that were actually predicted.
look_back = time_step

# shift train predictions for plotting
trainPredictPlot = np.full_like(closedf, np.nan)
trainPredictPlot[look_back:look_back + len(train_predict), :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# shift test predictions for plotting
test_start = len(train_data) + look_back
testPredictPlot = np.full_like(closedf, np.nan)
testPredictPlot[test_start:test_start + len(test_predict), :] = test_predict
print("Test predicted data: ", testPredictPlot.shape)

names = cycle(['Original close price','Train predicted close price','Test predicted close price'])

# Rows without a prediction stay NaN, which leaves a gap in the corresponding line
plotdf = pd.DataFrame({'date': close_stock['date'],
                       'original_close': close_stock['close'],
                       'train_predicted_close': trainPredictPlot.ravel().tolist(),
                       'test_predicted_close': testPredictPlot.ravel().tolist()})

fig = px.line(plotdf, x=plotdf['date'], y=[plotdf['original_close'], plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (5300, 1)
Test predicted data:  (5300, 1)


# Predicting next 10 days

In [None]:
# Recursive multi-step forecast: predict one day ahead, append that prediction to
# the history, then predict again from the most recent `time_step` values.
from numpy import array  # not used in this cell; kept in case other cells rely on the name

pred_days = 10
n_steps = time_step

# Seed the history with the last `time_step` scaled closes of the test split
temp_input = test_data[-time_step:, 0].tolist()
lst_output = []

for _ in range(pred_days):
    # The model expects one row of `n_steps` features: the latest window of history
    x_input = np.array(temp_input[-n_steps:]).reshape(1, -1)
    yhat = regressor.predict(x_input)
    temp_input.extend(yhat.tolist())
    lst_output.extend(yhat.tolist())

print("Output of predicted next days: ", len(lst_output))

Output of predicted next days:  10


# Plotting last 15 days and next predicted 10 days

In [None]:
# Day counters: 1..time_step for the observed window, followed by the forecast horizon
last_days = np.arange(1, time_step + 1)
day_pred = np.arange(1, pred_days + 1) + time_step
for counters in (last_days, day_pred):
    print(counters)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23 24 25]


In [None]:
# Plot the last `time_step` observed closes followed by the `pred_days` forecast values.
# Each series needs its OWN NaN-filled array: if both names were bound to one shared
# list, the two slice assignments would land in the same object and both traces
# would show identical data. The slices are also sized to match the values exactly
# (`time_step` observed values, then `pred_days` predictions).
total_days = time_step + pred_days
last_original_days_value = np.full(total_days, np.nan)
next_predicted_days_value = np.full(total_days, np.nan)

# Observed part: the last `time_step` scaled closes converted back to prices
last_original_days_value[:time_step] = scaler.inverse_transform(closedf[-time_step:]).ravel()
# Forecast part: the recursive predictions converted back to prices
next_predicted_days_value[time_step:] = scaler.inverse_transform(
    np.array(lst_output).reshape(-1, 1)
).ravel()

names = cycle(['Last 15 days close price','Predicted next 10 days close price'])

new_pred_plot = pd.DataFrame({
    'last_original_days_value': last_original_days_value,
    'next_predicted_days_value': next_predicted_days_value
})

fig = px.line(new_pred_plot, x=new_pred_plot.index, y=[new_pred_plot['last_original_days_value'],
                                                      new_pred_plot['next_predicted_days_value']],
              labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(title_text='Compare last 15 days vs next 10 days',
                  plot_bgcolor='white', font_size=15, font_color='black',legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

# Plotting whole closing stock price with prediction

In [None]:
# Append the forecast to the scaled close history, convert everything back to
# prices and flatten to a plain list for plotting
forecast_column = np.array(lst_output).reshape(-1, 1)
rfdf = np.vstack([closedf, forecast_column])
rfdf = scaler.inverse_transform(rfdf).ravel().tolist()

names = cycle(['Close price'])

fig = px.line(rfdf, labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(
    title_text='Plotting whole closing stock price with prediction',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
    legend_title_text='Stock',
)
for trace in fig.data:
    trace.update(name=next(names))

for hide_grid in (fig.update_xaxes, fig.update_yaxes):
    hide_grid(showgrid=False)
fig.show()