In [None]:
import os
import glob
import re
import pandas as pd
from IPython.display import display

# Concatenate all ETF CSV files into one master data frame

In [None]:
# Collect every CSV in the ETF_data folder. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to make the row order of
# master_df reproducible across runs and machines.
csv_files = sorted(glob.glob(os.path.join("ETF_data", "*.csv")))
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {os.path.abspath('ETF_data')!r}"
    )

# Read each file, tag its rows with the ETF name (the file name without its
# extension) in a new "ETF" column, and collect the non-empty frames.
df_list = []
for file in csv_files:
    df = pd.read_csv(file)
    # Skip files that contain a header but no data rows
    if df.empty:
        continue
    # Use os.path rather than a "/"-based regex: the regex never matched on
    # Windows ("ETF_data\\SPY.csv") or for an upper-case ".CSV" extension, which
    # silently left the frame without an "ETF" column.
    ETF_str = os.path.splitext(os.path.basename(file))[0]
    df["ETF"] = ETF_str
    df_list.append(df)

if not df_list:
    raise ValueError("Every CSV file in ETF_data is empty; nothing to concatenate")

# Combine all data frames into one master data frame and drop every row that
# has a missing value in any column (the surviving rows keep their index labels)
master_df = pd.concat(df_list, ignore_index=True).dropna()
master_df

# Checking for null and duplicate values

In [None]:
# Number of missing values per column
display(master_df.isna().sum())
# Prints True when no row of master_df is flagged as a duplicate of an earlier
# row, and False as soon as at least one duplicate exists
print(not master_df.duplicated().any())

# Outlier detection & filtration

In [None]:
# Example of IQR-based outlier detection on the 'Volume' column.
# Only the first 5 ETFs are reported to keep the output short.
for etf in master_df["ETF"].unique()[0:5]:
    # Rows that belong to the current ETF
    etf_rows = master_df[master_df["ETF"] == etf]
    etf_volume = etf_rows["Volume"]

    # Quartiles and the 1.5 * IQR fences for this ETF
    q1 = etf_volume.quantile(0.25)
    q3 = etf_volume.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # A row is an outlier when its volume lies strictly outside the fences
    mask = etf_volume.lt(lower_bound) | etf_volume.gt(upper_bound)
    outliers = etf_rows[mask]

    print(f"Outliers count for {etf}:", outliers.shape[0])

In [None]:
# Optional step: build a copy of master_df without per-ETF volume outliers.
# transform() broadcasts each ETF's quartile back onto that ETF's rows, so the
# resulting bounds are aligned row-by-row with master_df.
volume_by_etf = master_df.groupby('ETF')['Volume']
q1 = volume_by_etf.transform(lambda x: x.quantile(0.25))
q3 = volume_by_etf.transform(lambda x: x.quantile(0.75))
iqr = q3 - q1

# Per-row 1.5 * IQR fences
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# True where Volume lies inside the fences (both ends inclusive)
volume = master_df['Volume']
mask = volume.ge(lower_bound) & volume.le(upper_bound)

# Keep the in-range rows in a separate frame; master_df itself is left untouched
df_filtered_alt = master_df.loc[mask].copy()

print("Original rows:", len(master_df))
print("Rows after filtering out volume outliers:", len(df_filtered_alt))

In [None]:
# Display master_df as the cell output (the outlier filter above wrote its
# result to df_filtered_alt, so this is still the unfiltered frame)
master_df

In [None]:
# Persist the combined frame to "master_df.pkl" (path is relative to the
# current working directory). This saves master_df itself, not df_filtered_alt,
# so the volume outliers are still included in the pickle.
master_df.to_pickle("master_df.pkl")