In [24]:
import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')
from config import *

import yfinance as yf
import pandas as pd

### **Get historical data for S&P 500**

In [6]:


def fetch_stock_data(symbol='^GSPC', end='2024-12-31'):
    """Download the full available price history for ``symbol`` and save it as CSV.

    Args:
        symbol: Yahoo Finance ticker symbol (default ``'^GSPC'``).
        end: End date forwarded to ``Ticker.history``. In the recorded run
            with ``end='2024-12-31'`` the last row returned was 2024-12-30.

    Returns:
        The DataFrame returned by ``yfinance``. The same data is written to
        ``{RAW_DATA_PATH}/{symbol}_data.csv``.

    Raises:
        ValueError: If the download returned no rows (e.g. unknown symbol or
            a network problem), so that no empty CSV is written.
    """
    import os

    df = yf.Ticker(symbol).history(period='max', end=end)

    # Fail early with a clear message instead of writing an empty CSV and then
    # crashing with an IndexError on df.index[0] below.
    if df.empty:
        raise ValueError(f"No data returned for symbol {symbol!r} (end={end!r})")

    # Make sure the target directory exists before writing.
    os.makedirs(RAW_DATA_PATH, exist_ok=True)
    csv_path = f"{RAW_DATA_PATH}/{symbol}_data.csv"
    df.to_csv(csv_path)

    print(f"Data saved to {csv_path}")
    print(f"Date range: {df.index[0]} to {df.index[-1]}")
    print(f"Shape: {df.shape}")
    return df

# Download the complete S&P 500 history (index symbol ^GSPC) up to the end of 2024
raw_data = fetch_stock_data(symbol='^GSPC', end='2024-12-31')

Data saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/raw/^GSPC_data.csv
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
Shape: (24366, 7)


### **Load saved raw data**

In [7]:
from bokeh.plotting import figure, show, output_notebook
output_notebook()

def plot_timeseries(df, title='S&P 500 Historical Prices', x_col='Date', y_col='Close'):
    """Draw an interactive Bokeh line chart of one column of ``df`` over time.

    Args:
        df: DataFrame holding the series to plot.
        title: Figure title.
        x_col: Name of the column to use for the x axis. If ``df`` has no such
            column, the DataFrame index is used instead. ``x_col`` is always
            used as the x-axis label.
        y_col: Name of the column plotted on the y axis (also the y label).

    Returns:
        None. The figure is rendered via ``show``.
    """
    # Kept inside the function as in the original.
    # NOTE(review): presumably needed so BokehJS is loaded in the output area
    # of whichever cell calls this (e.g. on Colab) -- confirm before removing.
    output_notebook()

    # Previously x_col only set the axis label while the data always came from
    # the index. Use the named column when it exists and fall back to the
    # index otherwise, which matches the old behaviour for index-only frames.
    x_values = df[x_col] if x_col in df.columns else df.index

    p = figure(width=800, height=400, x_axis_type='datetime', title=title)
    p.line(x_values, df[y_col], line_width=2)

    p.xaxis.axis_label = x_col
    p.yaxis.axis_label = y_col
    p.grid.grid_line_alpha = 0.3

    show(p)

In [8]:

# Read the 'Date' index as plain strings first. The saved timestamps carry UTC
# offsets (e.g. -05:00, and presumably -04:00 during daylight saving time), and
# read_csv(parse_dates=True) does not produce a proper DatetimeIndex from
# mixed offsets.
raw_data = pd.read_csv(f"{RAW_DATA_PATH}/^GSPC_data.csv", index_col='Date')

# Parse via UTC, then convert back to exchange-local time so the index is a
# single-timezone DatetimeIndex.
# NOTE(review): 'America/New_York' is assumed from the -05:00 offsets in the
# saved data -- confirm if other symbols are loaded this way.
raw_data.index = pd.to_datetime(raw_data.index, utc=True).tz_convert('America/New_York')

print(f"Loaded data shape: {raw_data.shape}\nDate range: {raw_data.index[0]} to {raw_data.index[-1]}")
print(raw_data.tail())

# Work on a copy so the loaded frame stays untouched.
df = raw_data.copy()

Loaded data shape: (24366, 7)
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
                                  Open         High          Low        Close  \
Date                                                                            
2024-12-23 00:00:00-05:00  5940.250000  5978.250000  5902.569824  5974.069824   
2024-12-24 00:00:00-05:00  5984.629883  6040.100098  5981.439941  6040.040039   
2024-12-26 00:00:00-05:00  6024.970215  6049.750000  6007.370117  6037.589844   
2024-12-27 00:00:00-05:00  6006.169922  6006.169922  5932.950195  5970.839844   
2024-12-30 00:00:00-05:00  5920.669922  5940.790039  5869.160156  5906.939941   

                               Volume  Dividends  Stock Splits  
Date                                                            
2024-12-23 00:00:00-05:00  3593280000        0.0           0.0  
2024-12-24 00:00:00-05:00  1757720000        0.0           0.0  
2024-12-26 00:00:00-05:00  2904530000        0.0           0.0  
2024-12-27

In [9]:
# Plot the 'Close' column with the default title and axis labels
plot_timeseries(df)

In [10]:
# Check data types: confirm each column was read from the CSV as a numeric dtype
df.dtypes

Unnamed: 0,0
Open,float64
High,float64
Low,float64
Close,float64
Volume,int64
Dividends,float64
Stock Splits,float64


In [11]:
def preprocess_data(df):
    """Build the scaled series and next-step target from the 'Close' column.

    Args:
        df: DataFrame containing at least a 'Close' column.

    Returns:
        A new DataFrame with the columns 'Close', 'Close_SubtractedFromFirst',
        'Scaled' (min-max scaled) and 'Target' (the next row's 'Scaled'
        value). The last row is dropped because it has no target. The result
        is also written to ``{PROCESSED_DATA_PATH}/preprocessed_data.csv``.
    """
    df = df[['Close']].copy()

    # The original read 'Close_SubtractedFromFirst' without ever creating it,
    # which raised a KeyError.
    # NOTE(review): the definition (Close minus the first Close value) is
    # inferred from the column name -- confirm it matches the intended feature.
    df['Close_SubtractedFromFirst'] = df['Close'] - df['Close'].iloc[0]

    # NOTE(review): the scaler is fitted on the whole series and then discarded.
    # If the data is later split into train/test, or predictions need to be
    # mapped back to prices, the scaler should be fitted on the training part
    # only and kept.
    scaler = MinMaxScaler()
    df['Scaled'] = scaler.fit_transform(df[['Close_SubtractedFromFirst']])

    # Next-step prediction target: the following row's scaled value.
    df['Target'] = df['Scaled'].shift(-1)

    # The final row has no following value, so its Target is NaN; drop it.
    df = df.dropna()

    df.to_csv(f"{PROCESSED_DATA_PATH}/preprocessed_data.csv")
    return df

In [28]:
# Show the history of HEAD movements (commits, resets, rebases) for this repository
!git reflog

[33m1b789ca[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m)[m HEAD@{0}: commit: Move files from Kaggle to colab
[33m3a8529a[m HEAD@{1}: reset: moving to 3a8529a
[33mfdd0f8f[m HEAD@{2}: reset: moving to HEAD
[33mfdd0f8f[m HEAD@{3}: reset: moving to HEAD
[33mfdd0f8f[m HEAD@{4}: reset: moving to HEAD
[33mfdd0f8f[m HEAD@{5}: commit: Move files from kaggle to colab
[33m6c7d8f8[m HEAD@{6}: commit: Removed leaked secret from Git history
[33m917996e[m HEAD@{7}: commit: Initial commit from colab
[33m3a8529a[m HEAD@{8}: commit: Initial commit from colab
[33m987eb90[m HEAD@{9}: pull origin main --rebase (finish): returning to refs/heads/main
[33m987eb90[m HEAD@{10}: pull origin main --rebase (pick): Initial Commit
[33m2ad84c6[m HEAD@{11}: pull origin main --rebase (pick): Initial commit
[33m1366c55[m[33m ([m[1;31morigin/main[m[33m)[m HEAD@{12}: pull origin main --rebase (start): checkout 1366c55efd30ca3b5e0ae7b682f283df3dbc44e0
[33mdedd216[m HEAD@{13}: commit:

In [29]:
# WARNING: destructive. Moves HEAD to commit 5ec654c and discards uncommitted
# changes in the working tree. Do not include this cell in a "Run All".
!git reset --hard 5ec654c

HEAD is now at 5ec654c Initial commit


In [27]:
import os

# Get secret from environment variable
token = os.environ.get('PAT')

# Configure git
!git config --global user.email "bojte.csongi12@gmail.com"
!git config --global user.name "CsongiBojte"

# Set remote with token
!git remote set-url origin https://$GITHUB_TOKEN@github.com/CsongiBojte/Stock-market-prediction-using-neural-network.git

# Push
!git add .
!git commit -m "commit message"
!git push origin main

Enter GitHub PAT: ··········
[main 1b789ca] Move files from Kaggle to colab
 6 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 models/.gitkeep
 create mode 100644 models/lstm/.gitkeep
 create mode 100644 models/lstm/data/.gitkeep
 create mode 100644 models/lstm/models/.gitkeep
 rewrite models/lstm/notebooks/data_preprocessing.ipynb (93%)
 create mode 100644 models/lstm/results/.gitkeep
Enumerating objects: 39, done.
Counting objects: 100% (39/39), done.
Delta compression using up to 2 threads
Compressing objects: 100% (31/31), done.
Writing objects: 100% (38/38), 576.22 KiB | 2.28 MiB/s, done.
Total 38 (delta 6), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (6/6), done.[K
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - 