# 📊 Data Preprocessing

In [1]:
# 📊 Import necessary libraries
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns  
import os  
import sys

In [2]:
# 🛠️ Add the 'scripts' directory to the Python path for module imports.
# The relative path is resolved against the kernel's current working
# directory, so 'scripts' must be a sibling of the directory the notebook
# runs from.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [3]:
# 📚 Importing the DataPreprocessor class from the scripts directory
from data_preprocessing import DataPreprocessor 

In [4]:
# 🛠️ Create an instance of DataPreprocessor
# This one instance is reused by the cells below to fetch (get_data),
# load (load_data) and inspect (inspect_data) the price data.
processor = DataPreprocessor()

In [5]:
# 📅 Retrieval window as ISO-formatted date strings (YYYY-MM-DD);
# both values are handed unchanged to processor.get_data in the next cell.
start_date, end_date = "2015-01-01", "2025-01-31"

# 📈 Ticker symbols whose historical data will be fetched.
symbols = [
    "BND",   # bond fund
    "SPY",   # S&P 500
    "TSLA",  # Tesla
]

In [6]:
# 🚀 Download the history for every symbol in the configured window.
# get_data hands back a mapping of symbol -> saved file path
# (see the recorded output, e.g. BND -> ../data/BND.csv).
data_paths = processor.get_data(start_date, end_date, symbols)

# 📂 Report where each file ended up, framed by a separator rule.
separator = "====================================================="
print(separator)
print("📂 Data files saved at the following locations:")
print(separator)
for symbol in data_paths:
    path = data_paths[symbol]
    print(f"{symbol}: {path}")

📊 Fetching data for BND from 2015-01-01 to 2025-01-31...
✅ Data for BND saved to '../data/BND.csv'.
📊 Fetching data for SPY from 2015-01-01 to 2025-01-31...
✅ Data for SPY saved to '../data/SPY.csv'.
📊 Fetching data for TSLA from 2015-01-01 to 2025-01-31...
✅ Data for TSLA saved to '../data/TSLA.csv'.


📂 Data files saved at the following locations:
BND: ../data/BND.csv
SPY: ../data/SPY.csv
TSLA: ../data/TSLA.csv


## **📥 Load Data**

In [7]:
# 📥 Load the saved data for each ticker.
# One loop replaces three copy-pasted load/print pairs; the call order and
# the printed confirmations are unchanged (BND, then SPY, then TSLA).
price_frames = {}
for ticker in ("BND", "SPY", "TSLA"):
    price_frames[ticker] = processor.load_data(ticker)
    print(f"📊 {ticker} data loaded successfully!")

# Keep the per-ticker names that the rest of the notebook relies on.
bnd = price_frames["BND"]
spy = price_frames["SPY"]
tsla = price_frames["TSLA"]


📊 Loading data for BND from '../data/BND.csv'.
📊 Loading data for SPY from '../data/SPY.csv'.
📊 Loading data for TSLA from '../data/TSLA.csv'.


📊 BND data loaded successfully!
📊 SPY data loaded successfully!
📊 TSLA data loaded successfully!


## **Explore the Data**

In [8]:
# 🌟 Preview the top of the BND frame.
# Per the recorded output the rows are ordered newest-first, so these ten
# rows are the most recent trading days, not the start of the date range.
print("🔍 Here are the first 10 rows of the BND data:")
display(bnd.iloc[:10])

🔍 Here are the first 10 rows of the BND data:


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-31,72.48,72.54,72.2,72.34,6738335.0
2025-01-30,72.46,72.53,72.38,72.44,5622434.0
2025-01-29,72.43,72.49,72.17,72.34,5780349.0
2025-01-28,72.39,72.39,72.21,72.38,4424518.0
2025-01-27,72.34,72.44,72.26,72.42,8621175.0
2025-01-24,72.0,72.0943,71.8696,72.04,5555722.0
2025-01-23,72.0,72.0,71.8,71.9,7529793.0
2025-01-22,72.15,72.2,71.97,72.01,6616809.0
2025-01-21,72.21,72.21,72.0725,72.16,8491622.0
2025-01-17,72.1,72.1,71.91,71.95,5600397.0


In [17]:
# 📊 Summarise the BND frame: dtypes, missing values and duplicate rows.
print(
    "====================================================",
    "     🔍 Inspecting the data for BND:",
    "====================================================",
    sep="\n",
)

# Per the recorded output, inspect_data returns a dict with the keys
# 'data_types', 'missing_values' and 'duplicate_rows'. The first two are
# per-column Series; 'duplicate_rows' is a single count that the DataFrame
# constructor repeats on every row of the table.
inspection_results = processor.inspect_data(bnd)
display(pd.DataFrame(data=inspection_results))

📋 Data inspection results:
{'data_types': Open      float64
High      float64
Low       float64
Close     float64
Volume    float64
dtype: object, 'missing_values': Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64, 'duplicate_rows': np.int64(0)}


     🔍 Inspecting the data for BND:


Unnamed: 0,data_types,missing_values,duplicate_rows
Open,float64,0,0
High,float64,0,0
Low,float64,0,0
Close,float64,0,0
Volume,float64,0,0


In [13]:
# 🌟 Preview the top of the SPY frame.
# Per the recorded output the rows are ordered newest-first, so these ten
# rows are the most recent trading days, not the start of the date range.
print("🔍 Here are the first 10 rows of the SPY data:")
display(spy.iloc[:10])

🔍 Here are the first 10 rows of the SPY data:


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-31,607.5,609.96,601.05,601.82,66671471.0
2025-01-30,603.96,606.6,600.715,605.04,39281255.0
2025-01-29,603.72,604.13,599.22,601.81,37177429.0
2025-01-28,600.62,605.37,597.25,604.52,44433322.0
2025-01-27,594.81,599.69,594.64,599.37,70361125.0
2025-01-24,609.81,610.78,606.8,607.97,34604693.0
2025-01-23,605.8,609.75,605.52,609.75,41152102.0
2025-01-22,605.92,607.82,605.36,606.44,48195973.0
2025-01-21,600.67,603.06,598.67,603.05,42532853.0
2025-01-17,596.96,599.36,595.61,597.58,58070628.0


In [18]:
# 📊 Summarise the SPY frame: dtypes, missing values and duplicate rows.
print(
    "====================================================",
    "     🔍 Inspecting the data for SPY:",
    "====================================================",
    sep="\n",
)

# Per the recorded output, inspect_data returns a dict with the keys
# 'data_types', 'missing_values' and 'duplicate_rows'. The first two are
# per-column Series; 'duplicate_rows' is a single count that the DataFrame
# constructor repeats on every row of the table.
# Note: inspection_results is overwritten by every inspection cell.
inspection_results = processor.inspect_data(spy)
display(pd.DataFrame(data=inspection_results))

📋 Data inspection results:
{'data_types': Open      float64
High      float64
Low       float64
Close     float64
Volume    float64
dtype: object, 'missing_values': Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64, 'duplicate_rows': np.int64(0)}


     🔍 Inspecting the data for SPY:


Unnamed: 0,data_types,missing_values,duplicate_rows
Open,float64,0,0
High,float64,0,0
Low,float64,0,0
Close,float64,0,0
Volume,float64,0,0


In [19]:
# 🌟 Preview the top of the TSLA frame.
# Per the recorded output the rows are ordered newest-first, so these ten
# rows are the most recent trading days, not the start of the date range.
print("🔍 Here are the first 10 rows of the TSLA data:")
display(tsla.iloc[:10])

🔍 Here are the first 10 rows of the TSLA data:


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-31,401.53,419.99,401.34,404.6,83568219
2025-01-30,410.78,412.5,384.41,400.28,98092879
2025-01-29,395.21,398.5899,384.48,389.1,68033648
2025-01-28,396.91,400.59,386.5,398.09,48910676
2025-01-27,394.8,406.69,389.0,397.15,58125510
2025-01-24,414.45,418.88,405.78,406.58,56427149
2025-01-23,416.06,420.73,408.95,412.38,50690592
2025-01-22,416.81,428.0,414.59,415.11,60963342
2025-01-21,432.64,433.2,406.31,424.07,87320894
2025-01-17,421.5,439.74,419.75,426.5,94991429


In [20]:
# 📊 Summarise the TSLA frame: dtypes, missing values and duplicate rows.
print(
    "====================================================",
    "     🔍 Inspecting the data for TSLA:",
    "====================================================",
    sep="\n",
)

# Per the recorded output, inspect_data returns a dict with the keys
# 'data_types', 'missing_values' and 'duplicate_rows'; the single
# 'duplicate_rows' count is repeated on every row of the table.
# NOTE(review): the recorded output shows TSLA's Volume as int64, whereas
# BND and SPY show float64 -- confirm whether the dtypes should be aligned
# before the frames are combined.
inspection_results = processor.inspect_data(tsla)
display(pd.DataFrame(data=inspection_results))

📋 Data inspection results:
{'data_types': Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object, 'missing_values': Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64, 'duplicate_rows': np.int64(0)}


     🔍 Inspecting the data for TSLA:


Unnamed: 0,data_types,missing_values,duplicate_rows
Open,float64,0,0
High,float64,0,0
Low,float64,0,0
Close,float64,0,0
Volume,int64,0,0


In [21]:
# 📐 Report (rows, columns) for each loaded frame in a single print call;
# the newline separator yields the same four output lines as before.
print(
    "📊 Shape of the data:",
    f"📈 TSLA: {tsla.shape}",
    f"💼 BND: {bnd.shape}",
    f"📉 SPY: {spy.shape}",
    sep="\n",
)

📊 Shape of the data:
📈 TSLA: (2536, 5)
💼 BND: (2536, 5)
📉 SPY: (2536, 5)
