In [1]:
import wrds
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression  # Import LinearRegression

db = wrds.Connection(wrds_username='asherbaraban')

df = db.raw_sql("""
    SELECT 
        a.crsp_fundno,
        a.caldt,
        b.mret,
        fees.exp_ratio,
        names.fund_name,
        header.ticker
    FROM crsp_q_mutualfunds.monthly_nav a
    LEFT JOIN crsp_q_mutualfunds.monthly_returns b 
        ON a.crsp_fundno = b.crsp_fundno AND a.caldt = b.caldt
    LEFT JOIN crsp_q_mutualfunds.fund_names names
        ON a.crsp_fundno = names.crsp_fundno AND a.caldt BETWEEN names.chgdt AND names.chgenddt
    LEFT JOIN crsp_q_mutualfunds.fund_hdr header
        ON a.crsp_fundno = header.crsp_fundno
    LEFT JOIN crsp_q_mutualfunds.fund_fees fees
        ON a.crsp_fundno = fees.crsp_fundno AND a.caldt BETWEEN fees.begdt AND fees.enddt
    WHERE
        a.caldt >= '1976-08-01' AND a.caldt <= '2024-01-01' AND names.fund_name LIKE '%%Vanguard%%'
""")

ticker_mapping_investor_to_admiral = {
    "VFINX": "VFIAX",
    "VGTSX": "VTMGX",
    "VDAIX": "VDADX",
    "VEIEX": "VEMAX",
    "VEURX": "VEUSX",
    "VEXMX": "VEXAX",
    "VFWIX": "VFWAX",
    "VFSVX": "VFSAX",
    "VFTSX": "VFTAX",
    "VGXRX": "VGRLX",
    "VIGRX": "VIGAX",
    "VHDYX": "VHYAX",
    "VIDMX": "VIAAX",
    "VIHIX": "VIHAX",
    "VLACX": "VLCAX",
    "VMGIX": "VMGMX",
    "VIMSX": "VIMAX",
    "VMVIX": "VMVAX",
    "VPACX": "VPADX",
    "VISGX": "VSGAX",
    "NAESX": "VSMAX",
    "VISVX": "VSIAX",
    "VMCIX": "VTCLX",
    "VTMSX": "VTMSX",
    "VTSMX": "VTSAX",
    "VTWSX": "VTWAX",
    "VIVAX": "VVIAX",
    "VBINX": "VBIAX"
}

ticker_list = [ticker for pair in ticker_mapping_investor_to_admiral.items() for ticker in pair]

# Filter the DataFrame to include only those tickers
filtered_df = df[df['ticker'].isin(ticker_list)].copy()

Loading library list...
Done


In [2]:
crsp_fundnos_to_query = tuple(filtered_df['crsp_fundno'].unique())

# Construct the SQL query string
query = f"""
    SELECT 
        a.crsp_fundno,
        a.caldt,
        b.mret,
        fees.exp_ratio,
        names.fund_name,
        header.ticker
    FROM crsp_q_mutualfunds.monthly_nav a
    LEFT JOIN crsp_q_mutualfunds.monthly_returns b 
        ON a.crsp_fundno = b.crsp_fundno AND a.caldt = b.caldt
    LEFT JOIN crsp_q_mutualfunds.fund_names names
        ON a.crsp_fundno = names.crsp_fundno AND a.caldt BETWEEN names.chgdt AND names.chgenddt
    LEFT JOIN crsp_q_mutualfunds.fund_hdr header
        ON a.crsp_fundno = header.crsp_fundno
    LEFT JOIN crsp_q_mutualfunds.fund_fees fees
        ON a.crsp_fundno = fees.crsp_fundno AND a.caldt BETWEEN fees.begdt AND fees.enddt
    WHERE
        a.caldt >= '1976-08-01' AND a.caldt <= '2024-01-01'
        AND a.crsp_fundno IN {crsp_fundnos_to_query}
"""

# Query the database to get the data
# Replace `db` with your database connection object
df = db.raw_sql(query)
df['caldt'] = pd.to_datetime(df['caldt'])
df['exp_ratio'] = df.groupby('crsp_fundno')['exp_ratio'].ffill().bfill()
df['mret_gross'] = df['mret'] + df['exp_ratio'] / 12

# Ensure df is sorted by date
df.sort_values(by='caldt', inplace=True)

df = df.dropna(subset=['mret'])
df = df[~((df['ticker'] == 'NAESX') & (df['caldt'] < '1990-01-01'))]

df['ticker'].unique()

df = df.sort_values(by=['ticker', 'caldt'])

# Get the inception date for each fund
inception_dates = df.groupby('ticker')['caldt'].min().reset_index()
inception_dates.columns = ['ticker', 'inception_date']
inception_dates = inception_dates.sort_values(by='inception_date')
sorted_funds = inception_dates['ticker'].tolist()
sorted_funds
inception_dates

Unnamed: 0,ticker,inception_date
11,VFINX,1976-09-30
9,VEXMX,1988-01-29
0,NAESX,1990-01-31
35,VPACX,1990-07-31
6,VEURX,1990-07-31
44,VTSMX,1992-05-29
2,VBINX,1992-10-30
27,VIVAX,1992-12-31
21,VIGRX,1992-12-31
4,VEIEX,1994-06-30


In [3]:
orthogonal_basis = pd.DataFrame(index=df['caldt'].unique())
spanning_info = {}

# Loop over each fund
for i, fund in enumerate(sorted_funds):
    fund_data = df[df['ticker'] == fund].set_index('caldt')['mret_gross']
    # Initialize the regression model
    if i > 0:
        valid_index = fund_data.dropna().index
        X = orthogonal_basis.loc[valid_index, orthogonal_basis.columns[:i]].values
        y = fund_data.dropna().values
        print(y.shape)
        # Fit the model
        reg = LinearRegression(fit_intercept=False)
        reg.fit(X, y)
        residuals = y - reg.predict(X)
        print(fund, np.var(residuals))
        # Check if residuals are close to zero
        if np.isclose(np.var(residuals), 0, atol=5e-5):
            spanning_info[fund] = True
            continue  # Skip the funded fund
        else:
            spanning_info[fund] = False

        # Create a Series to align with the orthogonal_basis index
        residuals_series = pd.Series(data=residuals, index=valid_index)

        # Calculate the residuals
        orthogonal_basis[fund] = 0.0  # Initialize with zeros

        orthogonal_basis.loc[valid_index, fund] = residuals_series
    else:
        orthogonal_basis[fund] = fund_data.reindex(orthogonal_basis.index)
        spanning_info[fund] = False  # The first fund can't be spanned by any other funds

(432,)
VEXMX 0.0006907374182635423
(408,)
NAESX 0.0001457659298243907
(402,)
VPACX 0.001661925689491612
(402,)
VEURX 0.0006780703704763599
(380,)
VTSMX 2.0760674177541564e-06
(375,)
VBINX 2.215823935477106e-05
(373,)
VIVAX 0.00017617056858296493
(373,)
VIGRX 0.0001802063394311776
(355,)
VEIEX 0.001257256284030839
(464,)
VTCLX 2.5724702227514568e-05
(332,)
VGTSX 1.231520579815324e-05
(307,)
VMCIX 0.00017208963337392544
(307,)
VISVX 0.0005195735329489057
(307,)
VISGX 0.0002214040549269428
(307,)
VIMSX 0.00013429436960063064
(297,)
VTMSX 0.00010366955728680505
(292,)
VTMGX 3.4322083497326984e-06
(164,)
VIDMX 1.92298649176394e-05
(277,)
VTSAX 4.4314167271284494e-07
(277,)
VSMAX 3.443724126136911e-05
(277,)
VVIAX 7.943273328022194e-05
(277,)
VBIAX 2.2712762132129215e-05
(277,)
VEXAX 2.5116811166505927e-08
(277,)
VFIAX 1.876975289008769e-09
(277,)
VIGAX 8.28371348787868e-05
(268,)
VEUSX 3.098914281749434e-08
(268,)
VPADX 1.651164255056592e-07
(265,)
VIMAX 6.360345747237523e-05
(239,)
VLACX 5

In [4]:
len(orthogonal_basis.columns)

6

In [5]:
orthogonal_basis.columns

Index(['VFINX', 'VEXMX', 'VPACX', 'VEURX', 'VEIEX', 'VISVX'], dtype='object')

In [6]:
path = 'mutual_fund_data/vanguard_orthogonal_basis_spanning.csv'
orthogonal_basis.to_csv(path, index=True)