In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/apple-stock-dataset/apple_stock_data.csv


In [6]:
data = pd.read_csv("/kaggle/input/apple-stock-dataset/apple_stock_data.csv")

In [7]:
data.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,2023-11-02 00:00:00+00:00,176.665985,177.570007,177.779999,175.460007,175.520004,77334800
1,2023-11-03 00:00:00+00:00,175.750671,176.649994,176.820007,173.350006,174.240005,79763700
2,2023-11-06 00:00:00+00:00,178.31752,179.229996,179.429993,176.210007,176.380005,63841300
3,2023-11-07 00:00:00+00:00,180.894333,181.820007,182.440002,178.970001,179.179993,70530000
4,2023-11-08 00:00:00+00:00,181.958893,182.889999,183.449997,181.589996,182.350006,49340300


In [8]:
## Convert data column into datetime type

In [9]:
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date',inplace = True)

In [10]:
data.head()

Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-11-02 00:00:00+00:00,176.665985,177.570007,177.779999,175.460007,175.520004,77334800
2023-11-03 00:00:00+00:00,175.750671,176.649994,176.820007,173.350006,174.240005,79763700
2023-11-06 00:00:00+00:00,178.31752,179.229996,179.429993,176.210007,176.380005,63841300
2023-11-07 00:00:00+00:00,180.894333,181.820007,182.440002,178.970001,179.179993,70530000
2023-11-08 00:00:00+00:00,181.958893,182.889999,183.449997,181.589996,182.350006,49340300


In [11]:
## Focus on close price

In [12]:
data = data[['Close']]

In [13]:
data.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-11-02 00:00:00+00:00,177.570007
2023-11-03 00:00:00+00:00,176.649994
2023-11-06 00:00:00+00:00,179.229996
2023-11-07 00:00:00+00:00,181.820007
2023-11-08 00:00:00+00:00,182.889999


In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
data['Close'] = scaler.fit_transform(data[['Close']])

In [15]:
data.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-11-02 00:00:00+00:00,0.175853
2023-11-03 00:00:00+00:00,0.162983
2023-11-06 00:00:00+00:00,0.199077
2023-11-07 00:00:00+00:00,0.235311
2023-11-08 00:00:00+00:00,0.25028


In [16]:
## Preparing data for LSTM by creating sequences of a defined length e.g.60 days

In [17]:
import numpy as np


In [18]:
def create_sequences(data,seq_length = 60):
    X,y = [],[]
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X),np.array(y)

In [19]:
seq_length = 60
X,y = create_sequences(data['Close'].values,seq_length)

In [20]:
X

array([[0.1758535 , 0.16298258, 0.19907662, ..., 0.3836038 , 0.37395072,
        0.32232785],
       [0.16298258, 0.19907662, 0.23531069, ..., 0.37395072, 0.32232785,
        0.27140452],
       [0.19907662, 0.23531069, 0.2502798 , ..., 0.32232785, 0.27140452,
        0.30581984],
       ...,
       [0.5907946 , 0.62702868, 0.67585339, ..., 0.92907118, 0.956911  ,
        0.96068834],
       [0.62702868, 0.67585339, 0.71684399, ..., 0.956911  , 0.96068834,
        0.9107444 ],
       [0.67585339, 0.71684399, 0.73489091, ..., 0.96068834, 0.9107444 ,
        0.85212657]])

In [21]:
y

array([0.27140452, 0.30581984, 0.29169009, 0.31729147, 0.3399553 ,
       0.3414942 , 0.32624523, 0.33365987, 0.30987682, 0.28035806,
       0.26790704, 0.26385005, 0.24216562, 0.23167317, 0.24230566,
       0.27098484, 0.2451036 , 0.22607729, 0.2466425 , 0.22971459,
       0.22034137, 0.2050924 , 0.14129836, 0.07162836, 0.05763844,
       0.05595971, 0.08016223, 0.10842194, 0.11513705, 0.08575833,
       0.11191942, 0.10660318, 0.12199219, 0.15500843, 0.19124229,
       0.08911577, 0.10184666, 0.08184116, 0.06589266, 0.11625627,
       0.09065467, 0.07036932, 0.05372127, 0.06505308, 0.05344163,
       0.0640739 , 0.04826521, 0.06533294, 0.03889198, 0.14045878,
       0.16158371, 0.10758258, 0.06127595, 0.04196978, 0.02853936,
       0.        , 0.01175149, 0.02658078, 0.05623957, 0.06841074,
       0.06015673, 0.11891439, 0.07456634, 0.06015673, 0.11233911,
       0.25713495, 0.23377179, 0.24342466, 0.2481814 , 0.273783  ,
       0.25251824, 0.29770565, 0.31379398, 0.34583104, 0.34750

In [22]:
## Splitting the sequences into trainig and test setss

In [23]:
train_size = int(len(X)*0.8)

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [24]:
X_train

array([[0.1758535 , 0.16298258, 0.19907662, ..., 0.3836038 , 0.37395072,
        0.32232785],
       [0.16298258, 0.19907662, 0.23531069, ..., 0.37395072, 0.32232785,
        0.27140452],
       [0.19907662, 0.23531069, 0.2502798 , ..., 0.32232785, 0.27140452,
        0.30581984],
       ...,
       [0.39339671, 0.58967538, 0.67249595, ..., 0.8953554 , 0.80819821,
        0.78133757],
       [0.58967538, 0.67249595, 0.68886414, ..., 0.80819821, 0.78133757,
        0.80274214],
       [0.67249595, 0.68886414, 0.66438176, ..., 0.78133757, 0.80274214,
        0.78091789]])

In [25]:
y_train

array([0.27140452, 0.30581984, 0.29169009, 0.31729147, 0.3399553 ,
       0.3414942 , 0.32624523, 0.33365987, 0.30987682, 0.28035806,
       0.26790704, 0.26385005, 0.24216562, 0.23167317, 0.24230566,
       0.27098484, 0.2451036 , 0.22607729, 0.2466425 , 0.22971459,
       0.22034137, 0.2050924 , 0.14129836, 0.07162836, 0.05763844,
       0.05595971, 0.08016223, 0.10842194, 0.11513705, 0.08575833,
       0.11191942, 0.10660318, 0.12199219, 0.15500843, 0.19124229,
       0.08911577, 0.10184666, 0.08184116, 0.06589266, 0.11625627,
       0.09065467, 0.07036932, 0.05372127, 0.06505308, 0.05344163,
       0.0640739 , 0.04826521, 0.06533294, 0.03889198, 0.14045878,
       0.16158371, 0.10758258, 0.06127595, 0.04196978, 0.02853936,
       0.        , 0.01175149, 0.02658078, 0.05623957, 0.06841074,
       0.06015673, 0.11891439, 0.07456634, 0.06015673, 0.11233911,
       0.25713495, 0.23377179, 0.24342466, 0.2481814 , 0.273783  ,
       0.25251824, 0.29770565, 0.31379398, 0.34583104, 0.34750

In [26]:
## Building a sequential LSTM model with layers to capture the temporal dependencies

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [28]:
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))

  super().__init__(**kwargs)


In [29]:
lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 60ms/step - loss: 0.2479
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.0493
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.0473
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.0162
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.0215
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 0.0172
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.0133
Epoch 8/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.0108
Epoch 9/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.0116
Epoch 10/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 0.0130
Epoch 11/20
[1m5/5

<keras.src.callbacks.history.History at 0x7a4e9b9f67d0>

In [30]:

'''Now, let’s train the second model. I’ll start by generating lagged features for Linear Regression (e.g., using the past 3 days as predictors)'''

'Now, let’s train the second model. I’ll start by generating lagged features for Linear Regression (e.g., using the past 3 days as predictors)'

In [31]:
data['Lag_1'] = data['Close'].shift(1)
data['Lag_2'] = data['Close'].shift(2)
data['Lag_3'] = data['Close'].shift(3)


In [32]:
data

Unnamed: 0_level_0,Close,Lag_1,Lag_2,Lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-11-02 00:00:00+00:00,0.175853,,,
2023-11-03 00:00:00+00:00,0.162983,0.175853,,
2023-11-06 00:00:00+00:00,0.199077,0.162983,0.175853,
2023-11-07 00:00:00+00:00,0.235311,0.199077,0.162983,0.175853
2023-11-08 00:00:00+00:00,0.250280,0.235311,0.199077,0.162983
...,...,...,...,...
2024-10-28 00:00:00+00:00,0.956911,0.929071,0.917320,0.919978
2024-10-29 00:00:00+00:00,0.960688,0.956911,0.929071,0.917320
2024-10-30 00:00:00+00:00,0.910744,0.960688,0.956911,0.929071
2024-10-31 00:00:00+00:00,0.852127,0.910744,0.960688,0.956911


In [33]:
data = data.dropna()

In [34]:
data

Unnamed: 0_level_0,Close,Lag_1,Lag_2,Lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-11-07 00:00:00+00:00,0.235311,0.199077,0.162983,0.175853
2023-11-08 00:00:00+00:00,0.250280,0.235311,0.199077,0.162983
2023-11-09 00:00:00+00:00,0.243565,0.250280,0.235311,0.199077
2023-11-10 00:00:00+00:00,0.299384,0.243565,0.250280,0.235311
2023-11-13 00:00:00+00:00,0.277001,0.299384,0.243565,0.250280
...,...,...,...,...
2024-10-28 00:00:00+00:00,0.956911,0.929071,0.917320,0.919978
2024-10-29 00:00:00+00:00,0.960688,0.956911,0.929071,0.917320
2024-10-30 00:00:00+00:00,0.910744,0.960688,0.956911,0.929071
2024-10-31 00:00:00+00:00,0.852127,0.910744,0.960688,0.956911


In [35]:
# Now, we will split the data accordingly for training and testing

In [37]:
X_lin = data[['Lag_1', 'Lag_2', 'Lag_3']]
y_lin = data['Close']
X_train_lin, X_test_lin = X_lin[:train_size], X_lin[train_size:]
y_train_lin, y_test_lin = y_lin[:train_size], y_lin[train_size:]

In [38]:
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(X_train_lin, y_train_lin)

In [39]:
## Make prediction using lstm on test dataset

In [40]:
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
lstm_predictions = lstm_model.predict(X_test_lstm)
lstm_predictions = scaler.inverse_transform(lstm_predictions)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 295ms/step


In [41]:
# generate predictions using Linear Regression and inverse-transform them

In [42]:
lin_predictions = lin_model.predict(X_test_lin)
lin_predictions = scaler.inverse_transform(lin_predictions.reshape(-1, 1))

In [43]:
## use a weighted average to create hybrid predictions

In [48]:
hybrid_predictions = (0.7 * lstm_predictions) + (0.3 * lin_predictions)

ValueError: operands could not be broadcast together with shapes (39,1) (96,1) 

In [49]:
## to predict the Next 10 Days using LSTM

In [50]:
lstm_future_predictions = []
last_sequence = X[-1].reshape(1, seq_length, 1)
for _ in range(10):
    lstm_pred = lstm_model.predict(last_sequence)[0, 0]
    lstm_future_predictions.append(lstm_pred)
    lstm_pred_reshaped = np.array([[lstm_pred]]).reshape(1, 1, 1)
    last_sequence = np.append(last_sequence[:, 1:, :], lstm_pred_reshaped, axis=1)
lstm_future_predictions = scaler.inverse_transform(np.array(lstm_future_predictions).reshape(-1, 1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


In [51]:
lstm_future_predictions

array([[231.6761 ],
       [231.4114 ],
       [231.17833],
       [230.96507],
       [230.76309],
       [230.56725],
       [230.3735 ],
       [230.17976],
       [229.98473],
       [229.78735]], dtype=float32)

In [52]:
#Next 10 Days using Linear Regression

In [53]:
recent_data = data['Close'].values[-3:]
lin_future_predictions = []
for _ in range(10):
    lin_pred = lin_model.predict(recent_data.reshape(1, -1))[0]
    lin_future_predictions.append(lin_pred)
    recent_data = np.append(recent_data[1:], lin_pred)
lin_future_predictions = scaler.inverse_transform(np.array(lin_future_predictions).reshape(-1, 1))



In [54]:
#to combine the predictive power of both models to make predictions for the next 10 days

In [55]:
hybrid_future_predictions = (0.7 * lstm_future_predictions) + (0.3 * lin_future_predictions)

In [56]:
hybrid_future_predictions

array([[231.27982109],
       [229.70016338],
       [228.635857  ],
       [230.86501287],
       [229.18007856],
       [228.14544048],
       [230.54050269],
       [228.69951881],
       [227.67450583],
       [230.22686314]])

In [57]:
future_dates = pd.date_range(start=data.index[-1] + pd.Timedelta(days=1), periods=10)
predictions_df = pd.DataFrame({
    'Date': future_dates,
    'LSTM Predictions': lstm_future_predictions.flatten(),
    'Linear Regression Predictions': lin_future_predictions.flatten(),
    'Hybrid Model Predictions': hybrid_future_predictions.flatten()
})
print(predictions_df)

                       Date  LSTM Predictions  Linear Regression Predictions  \
0 2024-11-02 00:00:00+00:00        231.676102                     230.355192   
1 2024-11-03 00:00:00+00:00        231.411407                     225.707291   
2 2024-11-04 00:00:00+00:00        231.178329                     222.703426   
3 2024-11-05 00:00:00+00:00        230.965073                     230.631535   
4 2024-11-06 00:00:00+00:00        230.763092                     225.486380   
5 2024-11-07 00:00:00+00:00        230.567245                     222.494588   
6 2024-11-08 00:00:00+00:00        230.373505                     230.930195   
7 2024-11-09 00:00:00+00:00        230.179764                     225.245599   
8 2024-11-10 00:00:00+00:00        229.984726                     222.284007   
9 2024-11-11 00:00:00+00:00        229.787354                     231.252375   

   Hybrid Model Predictions  
0                231.279821  
1                229.700163  
2                228.635857  

In [1]:
lstm_model.save('lstm_model.h5')

NameError: name 'lstm_model' is not defined

In [None]:
import joblib
joblib.dump(lin_model, 'linear_model.pkl')
