In [1]:
import os
import glob
import re
import pandas as pd

In [2]:
# Reading csv files from ETF_data folder
csv_files = glob.glob(os.path.join("ETF_data", "*.csv"))
df_list = []

# For each csv file extract the ETF name using regex and add the name to the a new column called "ETF"
# Add each csv file to df_list
for file in csv_files:
    df = pd.read_csv(file)
    match = re.search(r"/([^/]+)\.csv$", file)
    if match:
        ETF_str = match.group(1)
        df["ETF"] = ETF_str
    df_list.append(df)

# Remove any empty data frames
df_list = [df for df in df_list if not df.empty]

# Combine all data frames into one master data frame
master_df = pd.concat(df_list, ignore_index=True)
master_df

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,ETF
0,2007-11-07,418.000000,418.000000,454.200012,384.000000,390.000000,595380.0,ANTE
1,2007-11-08,380.000000,380.000000,407.000000,370.000000,390.000000,162105.0,ANTE
2,2007-11-09,388.000000,388.000000,394.600006,364.000000,370.000000,84805.0,ANTE
3,2007-11-12,371.200012,371.200012,386.000000,360.000000,360.000000,62680.0,ANTE
4,2007-11-13,360.000000,360.000000,377.600006,360.000000,366.200012,90480.0,ANTE
...,...,...,...,...,...,...,...,...
18387217,2025-01-30,50.630001,50.630001,52.270000,48.650002,50.029999,4578200.0,NXT
18387218,2025-01-31,50.419998,50.419998,52.139999,50.395000,51.130001,2389700.0,NXT
18387219,2025-02-03,48.599998,48.599998,49.430000,47.750000,48.320000,2570700.0,NXT
18387220,2025-02-04,49.750000,49.750000,50.340000,47.660000,48.820000,2268300.0,NXT


In [3]:
# Inspect the combined frame: its (rows, columns) shape, then the column
# names, then the first five rows as the cell's rich output.
print(master_df.shape, list(master_df.columns), sep="\n")
master_df.head(5)

(18387222, 8)
['Date', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'ETF']


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,ETF
0,2007-11-07,418.0,418.0,454.200012,384.0,390.0,595380.0,ANTE
1,2007-11-08,380.0,380.0,407.0,370.0,390.0,162105.0,ANTE
2,2007-11-09,388.0,388.0,394.600006,364.0,370.0,84805.0,ANTE
3,2007-11-12,371.200012,371.200012,386.0,360.0,360.0,62680.0,ANTE
4,2007-11-13,360.0,360.0,377.600006,360.0,366.200012,90480.0,ANTE


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns, computed separately for each value of the "ETF" column
# and rounded to two decimals.
summary_stats = (
    master_df
    .groupby('ETF')
    .describe()
    .round(2)
)
print("\nSummary Statistics by ETF:")
summary_stats


Summary Statistics by ETF:


Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,...,Open,Open,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
ETF,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A,6342.0,49.55,42.40,6.54,19.86,30.32,66.19,175.16,6342.0,52.50,...,69.55,179.28,6342.0,3312193.93,2389285.69,271900.0,1806678.75,2803874.0,4145699.25,62546380.0
AAA,1108.0,22.22,1.30,20.85,21.25,21.40,23.36,25.13,1108.0,24.85,...,25.06,25.40,1108.0,4402.20,10863.08,0.0,200.00,1050.0,4400.00,182900.0
AAAU,1628.0,18.14,3.51,11.74,16.31,18.05,19.45,28.31,1628.0,18.14,...,19.45,28.35,1628.0,940626.44,1174417.85,100.0,160450.00,512350.0,1385675.00,14338200.0
AADR,3660.0,42.41,12.19,21.78,32.88,42.35,52.06,75.32,3660.0,45.57,...,55.30,75.27,3660.0,8010.54,16666.03,0.0,600.00,2000.0,7300.00,323800.0
AAL,4871.0,23.29,14.76,1.66,11.06,18.07,36.37,59.35,4871.0,24.15,...,38.00,62.70,4871.0,16154541.47,21927577.85,138500.0,4950850.00,8385400.0,20216000.00,428617100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZVRA,2468.0,53.21,80.04,2.75,5.80,10.77,65.60,378.08,2468.0,53.21,...,64.80,380.64,2468.0,290184.59,1453731.93,0.0,4854.50,33887.5,256967.00,59212000.0
ZVSA,748.0,1096.41,1539.57,1.00,5.43,56.52,3458.00,7875.00,748.0,1096.41,...,3458.00,4900.00,748.0,224888.63,2751488.05,0.0,39.25,4356.0,38100.00,63497900.0
ZYBT,20.0,4.46,0.17,4.21,4.34,4.46,4.55,4.86,20.0,4.46,...,4.51,4.81,20.0,238109.55,434425.32,11691.0,21575.00,46600.0,307000.00,1864900.0
ZYME,1955.0,18.35,12.65,4.65,8.43,13.11,28.03,56.81,1955.0,18.35,...,27.81,57.31,1955.0,491170.53,1035755.84,500.0,146700.00,347600.0,584750.00,36641400.0


In [5]:
import matplotlib.pyplot as plt

In [None]:
# Plot the adjusted closing price history of a single ticker.
TARGET_ETF = "VOO"

# Not needed for the plot itself; retained in case other cells rely on it.
etf_names = master_df['ETF'].unique()

# Select the ticker directly instead of looping over every name and skipping
# all but one. Dates are parsed to datetimes so they sort chronologically and
# matplotlib draws a real time axis rather than one categorical tick per
# date string.
df_etf = (
    master_df.loc[master_df['ETF'] == TARGET_ETF]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
)

if df_etf.empty:
    # Say so explicitly instead of silently producing no figure.
    print(f"No rows found for {TARGET_ETF}; nothing to plot.")
else:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(df_etf['Date'], df_etf['Adj Close'])
    # The series plotted is "Adj Close", so the title names it accordingly.
    ax.set_title(f"Adjusted Closing Price Over Time - {TARGET_ETF}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Adj Close Price")
    fig.tight_layout()
    plt.show()