In [1]:
import pandas as pd
from IPython.display import display

# Load data frame

In [2]:
# Location of the pickled master frame, relative to the notebook's working
# directory. If this cell raises FileNotFoundError, the pickle is not in that
# directory: start the notebook from the folder containing it or edit the path.
MASTER_DF_PATH = "master_df.pkl"

# NOTE(security): unpickling can execute arbitrary code, so only load a pickle
# that was produced by a trusted step of this project.
master_df = pd.read_pickle(MASTER_DF_PATH)

# Show a small preview rather than echoing the whole frame.
master_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'master_df.pkl'

# Inspection of master ETF data frame features

In [None]:
# Row count first, then the list of column names.
print(len(master_df))
print(master_df.columns.tolist())

# Distinct values of the ETF column. etfList is bound on its own line because
# a later cell indexes into it.
etfList = master_df["ETF"].unique()
print(len(etfList))

# First and last rows, rendered as rich tables.
display(master_df.head(), master_df.tail())

In [None]:
# Descriptive statistics computed separately for each ETF, rounded to 2 decimals.
summary_stats = (
    master_df
    .groupby('ETF')
    .describe()
    .round(2)
)
print("\nSummary Statistics by ETF:")
# Only the first rows of the summary are shown.
summary_stats.head()

# Univariate Analysis: Preview of one ETF

In [None]:
# ETF whose rows are previewed here; etf_key and df_sample are reused by the
# plotting cells further down.
etf_key = "SPY"

# .copy() gives df_sample its own data, so adding columns to it later neither
# triggers pandas' SettingWithCopyWarning nor depends on master_df's internals.
df_sample = master_df[master_df["ETF"] == etf_key].copy()

print("Preview of data for:", etf_key)
display(df_sample.head())
display(df_sample.describe())

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line.
df_sample.info()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import random

In [None]:
# Line chart of the previewed ETF's adjusted close against date, drawn through
# the explicit Axes interface.
ax = plt.figure(figsize=(10, 5)).gca()
ax.plot(df_sample['Date'], df_sample['Adj Close'])
ax.set_title(f"{etf_key} - Adjusted Close Price Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Adjusted Close")
# Limit how many major ticks the date axis gets so the labels stay readable.
ax.xaxis.set_major_locator(MaxNLocator(7))
plt.show()

In [None]:
# Example: Simple daily returns
# assign() builds a new frame instead of writing into df_sample in place.
# df_sample was filtered out of master_df, so an in-place .loc assignment can
# raise SettingWithCopyWarning; rebinding also keeps this cell idempotent.
df_sample = df_sample.assign(Daily_Return=df_sample['Adj Close'].pct_change())

# Check distribution of daily returns
print(df_sample['Daily_Return'].describe())

# Histogram of daily returns (the first row has no previous price, hence dropna)
plt.figure(figsize=(8, 4))
plt.hist(df_sample['Daily_Return'].dropna(), bins=50)
plt.title(f"{etf_key} - Distribution of Daily Returns")
plt.xlabel("Daily Return")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Boxplot of trading volume for etf_key, used to visualise outliers.
# Every 100th row of master_df is kept first; that subsample is then narrowed
# to the rows whose ETF equals etf_key.
reduced_df = master_df.iloc[::100]
reduced_df_adj = reduced_df.loc[reduced_df["ETF"] == etf_key]

plt.figure(figsize=(6, 10))
plt.boxplot(reduced_df_adj['Volume'].dropna())
plt.title("Boxplot of Trading Volume")
plt.ylabel("Volume")
# Log-scaled y axis.
plt.yscale('log')
plt.show()

In [None]:
# 4x4 grid of adjusted-close histories for 16 pseudo-randomly picked ETFs.
fig, axs = plt.subplots(4, 4, figsize=(30, 30))

for i in range(16):
    # Re-seeding with the panel index makes each panel's pick reproducible.
    random.seed(i)
    # randrange() excludes its upper bound. randint(0, len(etfList)) includes
    # it, so it could return len(etfList) and raise IndexError below.
    randomInt = random.randrange(len(etfList))

    row, col = divmod(i, 4)

    # Read-only selection of this ETF's rows; no copy is needed for plotting.
    df_etf = master_df[master_df['ETF'] == etfList[randomInt]]
    axs[row, col].plot(df_etf['Date'], df_etf['Adj Close'])
    # The panel's x label carries the ETF name; the shared axis titles are set below.
    axs[row, col].set_xlabel(etfList[randomInt])
    axs[row, col].xaxis.set_major_locator(MaxNLocator(7))

# Tidy up layout so the plots don't overlap
fig.supylabel("$/Unit")
fig.supxlabel("Date")
plt.tight_layout()
plt.show()

In [None]:
# Calculate daily returns using Adj Close for improved accuracy.
# master_df stacks many ETFs, so the change must be computed within each ETF.
# A plain pct_change() over the whole column would compute the first return of
# every ETF from the previous ETF's last price.
# NOTE(review): assumes rows are in chronological order within each ETF — confirm.
master_df['Daily Return'] = master_df.groupby('ETF')['Adj Close'].pct_change()

# open price vs close price scatter plot (all ETFs together)
plt.figure(figsize=(8, 4))
plt.scatter(master_df['Open'], master_df['Close'], alpha=0.5)
plt.title("Open vs. Close Price")
plt.xlabel("Open Price")
plt.ylabel("Close Price")
plt.show()

# open & close correlation
print("Correlation (Open vs. Close):", master_df['Open'].corr(master_df['Close']))