In [45]:
import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction')
from config import *

import yfinance as yf
import pandas as pd

### **Get historical data for S&P 500**

In [6]:


def fetch_stock_data(symbol='^GSPC', end='2024-12-31'):
    """Download the price history for ``symbol`` and store it as a CSV file.

    The history is requested from yfinance with ``period='max'`` and the given
    ``end`` date, written to ``{RAW_DATA_PATH}/{symbol}_data.csv`` and a short
    summary (path, first/last index value, shape) is printed.

    Args:
        symbol: Ticker symbol passed to ``yf.Ticker``.
        end: End date passed to ``Ticker.history``.

    Returns:
        The DataFrame returned by ``Ticker.history``.
    """
    ticker = yf.Ticker(symbol)
    history = ticker.history(period='max', end=end)

    # Build the destination once; it is used for both the write and the message.
    csv_path = f"{RAW_DATA_PATH}/{symbol}_data.csv"
    history.to_csv(csv_path)

    first_day, last_day = history.index[0], history.index[-1]
    print(f"Data saved to {csv_path}")
    print(f"Date range: {first_day} to {last_day}")
    print(f"Shape: {history.shape}")
    return history

# Fetch data using the function defaults (symbol '^GSPC', end date '2024-12-31');
# the call also writes the CSV file and prints a summary as a side effect.
raw_data = fetch_stock_data()

Data saved to /content/drive/MyDrive/Colab Notebooks/Stock_Market_Prediction/models/lstm/data/raw/^GSPC_data.csv
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
Shape: (24366, 7)


### **Load saved raw data**

In [7]:
from bokeh.plotting import figure, show, output_notebook
output_notebook()

def plot_timeseries(df, title='S&P 500 Historical Prices', x_col='Date', y_col='Close'):
    """Show one column of ``df`` as a Bokeh line chart inside the notebook.

    The x values are always taken from ``df.index``; ``x_col`` is used only as
    the text of the x-axis label and is never looked up in the frame.

    Args:
        df: DataFrame whose index supplies the x values.
        title: Chart title.
        x_col: Label shown on the x axis.
        y_col: Name of the column to plot; also used as the y-axis label.
    """
    output_notebook()

    chart = figure(title=title, x_axis_type='datetime', width=800, height=400)
    chart.line(df.index, df[y_col], line_width=2)

    # Axis captions first, then a faint grid.
    chart.xaxis.axis_label = x_col
    chart.yaxis.axis_label = y_col
    chart.grid.grid_line_alpha = 0.3

    show(chart)

In [8]:

# Read the index as plain text first: the saved timestamps carry UTC offsets,
# and when those offsets differ between rows (e.g. daylight saving time) pandas
# does not build a DatetimeIndex unless the values are parsed with utc=True.
raw_data = pd.read_csv(f"{RAW_DATA_PATH}/^GSPC_data.csv", index_col='Date')

# Parse to UTC, then convert back to the exchange's local zone so the index
# prints (and re-saves) with the same wall-clock timestamps as the CSV.
# NOTE(review): 'America/New_York' is assumed from the -05:00 offsets in the
# saved data — confirm if another symbol/exchange is loaded here.
raw_data.index = pd.to_datetime(raw_data.index, utc=True).tz_convert('America/New_York')

print(f"Loaded data shape: {raw_data.shape}\nDate range: {raw_data.index[0]} to {raw_data.index[-1]}")
print(raw_data.tail())

# Work on a copy so raw_data stays as loaded.
df=raw_data.copy()

Loaded data shape: (24366, 7)
Date range: 1927-12-30 00:00:00-05:00 to 2024-12-30 00:00:00-05:00
                                  Open         High          Low        Close  \
Date                                                                            
2024-12-23 00:00:00-05:00  5940.250000  5978.250000  5902.569824  5974.069824   
2024-12-24 00:00:00-05:00  5984.629883  6040.100098  5981.439941  6040.040039   
2024-12-26 00:00:00-05:00  6024.970215  6049.750000  6007.370117  6037.589844   
2024-12-27 00:00:00-05:00  6006.169922  6006.169922  5932.950195  5970.839844   
2024-12-30 00:00:00-05:00  5920.669922  5940.790039  5869.160156  5906.939941   

                               Volume  Dividends  Stock Splits  
Date                                                            
2024-12-23 00:00:00-05:00  3593280000        0.0           0.0  
2024-12-24 00:00:00-05:00  1757720000        0.0           0.0  
2024-12-26 00:00:00-05:00  2904530000        0.0           0.0  
2024-12-27

## **Raw S&P 500 data chart**

In [9]:
# Plot the 'Close' column of the loaded data against its index, using the
# default title and axis labels of plot_timeseries.
plot_timeseries(df)

## **Check data types**

In [10]:
# Bare last expression: the per-column dtypes are rendered as the cell output.
df.dtypes

Unnamed: 0,0
Open,float64
High,float64
Low,float64
Close,float64
Volume,int64
Dividends,float64
Stock Splits,float64


## **Function to split data into train, validation and test sets**

In [42]:
def split_data(df, train_size=0.7, val_size=0.15):
    """Cut ``df`` into three consecutive pieces: train, validation and test.

    The cut positions are ``int(len(df) * train_size)`` and
    ``int(len(df) * (train_size + val_size))``; the rows are not shuffled, so
    the pieces keep the original order and the test piece is whatever remains.

    Args:
        df: Sliceable sequence of rows (e.g. a DataFrame).
        train_size: Fraction of rows for the first piece.
        val_size: Fraction of rows for the second piece.

    Returns:
        Tuple ``(train, val, test)``.
    """
    total_rows = len(df)
    first_cut = int(total_rows * train_size)
    second_cut = int(total_rows * (train_size + val_size))

    # One slice per piece, in order: [0, first_cut), [first_cut, second_cut), rest.
    windows = (
        slice(None, first_cut),
        slice(first_cut, second_cut),
        slice(second_cut, None),
    )
    train, val, test = (df[window] for window in windows)
    return train, val, test

## **Function to save the preprocessed data (train, val, test) and the corresponding MinMaxScaler**

In [46]:
def save_preprocessed_data(train, val, test, scaler):
    """Write the fitted scaler and the three data splits to disk.

    The scaler is dumped with joblib to ``SCALERS_PATH/minmax_scaler.joblib``;
    each split is written as ``scaled_data.csv`` inside ``TRAIN_PATH``,
    ``VAL_PATH`` and ``TEST_PATH`` respectively.

    Args:
        train: Training split (must provide ``to_csv``).
        val: Validation split (must provide ``to_csv``).
        test: Test split (must provide ``to_csv``).
        scaler: Object to persist with ``joblib.dump``.
    """
    scaler_file = os.path.join(SCALERS_PATH, 'minmax_scaler.joblib')
    joblib.dump(scaler, scaler_file)

    # Same file name for every split; only the target directory differs.
    destinations = ((train, TRAIN_PATH), (val, VAL_PATH), (test, TEST_PATH))
    for split_frame, directory in destinations:
        split_frame.to_csv(os.path.join(directory, 'scaled_data.csv'))

## **Normalize data using MinMaxScaler**

In [47]:
from sklearn.preprocessing import MinMaxScaler
import joblib

def preprocess_data(df):
    """Create next-row targets, split the data and min-max scale Close/Target.

    The caller's ``df`` is not modified: the ``Target`` column is added to a
    derived frame, so calling this function repeatedly on the same ``df`` gives
    the same result each time (the previous in-place version dropped one more
    trailing row on every call).

    Steps:
        1. ``Target`` = ``Close`` shifted by one row (the next row's Close);
           rows containing NaN are dropped.
        2. The frame is split with ``split_data`` (default fractions).
        3. A ``MinMaxScaler`` is fitted on the training split only.
        4. All three splits are transformed with that scaler, then the scaler
           and the scaled splits are written via ``save_preprocessed_data``.

    Args:
        df: DataFrame with a ``Close`` column.

    Returns:
        Tuple ``(train_scaled, val_scaled, test_scaled, scaler)`` where each
        scaled frame has the columns ``Close`` and ``Target`` and keeps the
        index of its split.
    """
    columns = ['Close', 'Target']

    # 1. Create target on a new frame so the input stays untouched.
    data = df.assign(Target=df['Close'].shift(-1)).dropna()

    # 2. Split
    train, val, test = split_data(data)

    # 3. Fit scaler on train data only, so validation/test values do not
    #    influence the scaling range.
    scaler = MinMaxScaler()
    scaler.fit(train[columns])

    def _scale(part):
        # Apply the fitted scaler to one split, keeping its index and column names.
        return pd.DataFrame(scaler.transform(part[columns]),
                            columns=columns, index=part.index)

    # 4. Scale all sets
    train_scaled = _scale(train)
    val_scaled = _scale(val)
    test_scaled = _scale(test)

    # Save scaler and data
    save_preprocessed_data(train_scaled, val_scaled, test_scaled, scaler)

    return train_scaled, val_scaled, test_scaled, scaler

In [52]:
# Build the scaled train/validation/test frames and the fitted scaler;
# preprocess_data also writes the scaler and the scaled CSV files to disk.
train_scaled, val_scaled, test_scaled,scaler=preprocess_data(df)

## **Plot normalized data sets**

In [61]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Set2_3

def plot_splits(train, val, test, title='Stock Price Data Splits', y_label='Scaled Close Price'):
    """Plot the ``Close`` column of the three splits on one Bokeh figure.

    Each split is drawn as its own line (train black, validation orange,
    test blue) against its index, with a legend in the top-left corner.

    Args:
        train: Training split with a ``Close`` column.
        val: Validation split with a ``Close`` column.
        test: Test split with a ``Close`` column.
        title: Figure title.
        y_label: Text of the y-axis label. Defaults to 'Scaled Close Price';
            pass a different label when plotting unscaled prices.
    """
    output_notebook()
    p = figure(width=800, height=400,
               x_axis_type='datetime',
               title=title)

    # (data, line colour, legend entry) for every split, drawn in this order.
    series = (
        (train, 'black', 'Train'),
        (val, 'orange', 'Validation'),
        (test, 'blue', 'Test'),
    )
    for split_frame, color, label in series:
        p.line(split_frame.index, split_frame['Close'],
               color=color, legend_label=label, line_width=2)

    p.legend.location = "top_left"
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = y_label

    show(p)

# Plot the scaled 'Close' series of the train, validation and test splits
# together on one figure.
plot_splits(train_scaled, val_scaled, test_scaled,title='Stock Price Data Splits - Scaled')

## **Function to inverse-transform normalized data back to its real price**

In [62]:
import joblib

# Reload the scaler from the file that save_preprocessed_data writes under
# SCALERS_PATH; this rebinds the name `scaler` used by the cells below.
scaler = joblib.load(os.path.join(SCALERS_PATH, 'minmax_scaler.joblib'))

def inverse_transform(scaled_data, scaler):
    """Map scaled ``Close``/``Target`` values back through ``scaler``.

    Only the ``Close`` and ``Target`` columns of ``scaled_data`` are passed to
    ``scaler.inverse_transform``; any other columns are ignored.

    Args:
        scaled_data: DataFrame containing ``Close`` and ``Target`` columns.
        scaler: Object exposing ``inverse_transform`` for those two columns.

    Returns:
        A new DataFrame with columns ``Close`` and ``Target`` and the same
        index as ``scaled_data``.
    """
    price_columns = ['Close', 'Target']
    restored = scaler.inverse_transform(scaled_data[price_columns])
    return pd.DataFrame(restored, index=scaled_data.index, columns=price_columns)

# Undo the scaling on each split with the reloaded scaler.
train_real = inverse_transform(train_scaled, scaler)
val_real = inverse_transform(val_scaled, scaler)
test_real = inverse_transform(test_scaled, scaler)
# NOTE(review): this call does not set a y-axis label, so the figure is still
# captioned 'Scaled Close Price' (the label plot_splits uses when none is
# given) although these values are unscaled.
plot_splits(train_real, val_real, test_real,title='Stock Price Data Splits - Real')