In [1]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller

In [2]:

# Define ticker symbols and date range
tickers = ["TSLA", "BND", "SPY"]
start_date = "2015-07-01"
end_date = "2025-07-31"

# Create a dictionary to store dataframes for each ticker
all_data = {}

# Fetch data for each ticker
for ticker in tickers:
    print(f"Fetching data for {ticker}...")
    df = yf.download(ticker, start=start_date, end=end_date)
    all_data[ticker] = df
    # Save raw data to the specified path
    df.to_csv(f'../data/raw/{ticker}_raw.csv')
    print(f"Data for {ticker} fetched and saved to data/raw/{ticker}_raw.csv")

# Display the first few rows of one of the dataframes to check
print("\nSample of TSLA data:")
print(all_data['TSLA'].head())

Fetching data for TSLA...


  df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start_date, end=end_date)


Data for TSLA fetched and saved to data/raw/TSLA_raw.csv
Fetching data for BND...


[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start_date, end=end_date)


Data for BND fetched and saved to data/raw/BND_raw.csv
Fetching data for SPY...


[*********************100%***********************]  1 of 1 completed

Data for SPY fetched and saved to data/raw/SPY_raw.csv

Sample of TSLA data:
Price           Close       High        Low       Open     Volume
Ticker           TSLA       TSLA       TSLA       TSLA       TSLA
Date                                                             
2015-07-01  17.943333  18.174667  17.856667  18.073999   31518000
2015-07-02  18.667999  18.830000  18.220667  18.680000  107458500
2015-07-06  18.648001  18.779333  18.420000  18.591999   61828500
2015-07-07  17.858667  18.346666  17.384666  18.333332   91576500
2015-07-08  16.997334  17.386667  16.954000  17.288000   93316500





In [3]:
#check information fo data 
print("\nInformation about TSLA data:")
print(all_data['TSLA'].info())


Information about TSLA data:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2535 entries, 2015-07-01 to 2025-07-30
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, TSLA)   2535 non-null   float64
 1   (High, TSLA)    2535 non-null   float64
 2   (Low, TSLA)     2535 non-null   float64
 3   (Open, TSLA)    2535 non-null   float64
 4   (Volume, TSLA)  2535 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 118.8 KB
None


In [3]:
# -----------------------------------------------------------------------------
# Subtask 1.2: Data Cleaning and Understanding
# -----------------------------------------------------------------------------
# Assuming raw data is already saved in the data/raw folder
tickers = ["TSLA", "BND", "SPY"]
all_data = {}

# Load the raw data for each ticker from the CSV files
for ticker in tickers:
    try:
        df = pd.read_csv(f'../data/raw/{ticker}_raw.csv', index_col=0, parse_dates=True)
        all_data[ticker] = df
        print(f"Loaded raw data for {ticker} from file.")
    except FileNotFoundError:
        print(f"Error: Raw data file for {ticker} not found. Please ensure it's in the data/raw directory.")
        # Skip this ticker if the file is not found
        continue

# Check the column structure of the loaded data
if 'TSLA' in all_data:
    print("\nSample TSLA data info:")
    all_data['TSLA'].info()

# Create a single DataFrame for 'Close' prices for easier manipulation
# This will be the primary data for many analyses.
processed_data = pd.DataFrame()

# Iterate through the loaded data and extract the relevant price series
for ticker, df in all_data.items():
    # Check if the DataFrame has a MultiIndex and select the 'Close' column accordingly
    if isinstance(df.columns, pd.MultiIndex):
        # The adjusted close price is in the 'Close' column within the MultiIndex
        adj_close_series = df.loc[:, 'Close']
    else:
        # If it's a simple index, select the 'Close' column
        adj_close_series = df['Close']

    # Rename the column to the ticker symbol and add to the processed DataFrame
    processed_data[ticker] = adj_close_series



# Display basic statistics to understand the data distribution
print("\nBasic statistics for processed data:")
print(processed_data.describe())



Loaded raw data for TSLA from file.
Loaded raw data for BND from file.
Loaded raw data for SPY from file.

Sample TSLA data info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2535 entries, 2015-07-01 to 2025-07-30
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Price   2535 non-null   float64
 1   Close   2535 non-null   float64
 2   High    2535 non-null   float64
 3   Low     2535 non-null   float64
 4   Open    2535 non-null   int64  
 5   Volume  0 non-null      float64
dtypes: float64(5), int64(1)
memory usage: 138.6 KB

Basic statistics for processed data:
              TSLA          BND          SPY
count  2535.000000  2535.000000  2535.000000
mean    134.960429    68.571174   335.981618
std     123.783736     4.563590   127.053089
min      10.331333    60.899393   156.926746
25%      19.279667    64.731230   234.024909
50%      97.666664    67.749353   306.143457
75%     240.881668    72.018947   423.114882


In [4]:
# Check for and handle missing values, which are common in financial data
print("\nMissing values before handling:")
print(processed_data.isnull().sum())


Missing values before handling:
TSLA    0
BND     0
SPY     0
dtype: int64


In [5]:
# Use forward fill, then backward fill to ensure no NaNs remain
processed_data.fillna(method='ffill', inplace=True)
processed_data.fillna(method='bfill', inplace=True)

  processed_data.fillna(method='ffill', inplace=True)
  processed_data.fillna(method='bfill', inplace=True)


In [6]:
print("\nMissing values after handling:")
print(processed_data.isnull().sum())


Missing values after handling:
TSLA    0
BND     0
SPY     0
dtype: int64


In [7]:

# Verify data types for all columns
print("\nData types:")
print(processed_data.info())

# Save the processed data to the designated directory
processed_data.to_csv('../data/processed/all_assets_processed.csv')
print("\nProcessed data saved to data/processed/all_assets_processed.csv")


Data types:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2535 entries, 2015-07-01 to 2025-07-30
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TSLA    2535 non-null   float64
 1   BND     2535 non-null   float64
 2   SPY     2535 non-null   float64
dtypes: float64(3)
memory usage: 79.2 KB
None

Processed data saved to data/processed/all_assets_processed.csv
