In [1]:
import wrds
import pandas as pd
from sklearn.linear_model import LinearRegression  # Import LinearRegression

db = wrds.Connection(wrds_username='asherbaraban')

Loading library list...
Done


In [2]:
investor = {
    'VFINX': 31432,
    'VEXMX': 31433,
    'NAESX': 31460,
    'VEURX': 31337,
    'VPACX': 31336,
    'VBINX': 31227,
    'VEIEX': 31338,
    'VIMSX': 31473,
    'VISGX': 31471,
    'VISVX': 31468,
    'VIVAX': 31435
}

admiral = {
    'VBIAX': 31228,
    'VPADX': 31342,
    'VEUSX': 31346,
    'VIMAX': 31454,
    'VEMAX': 31349,
    'VVIAX': 31457,
    'VSMAX': 31458,
    'VEXAX': 31459,
    'VSGAX': 53228,
    'VFIAX': 31461,
    'VSIAX': 53229
}

crsp_fundnos = set(admiral.values()).union(investor.values())
# ADD QQQ
crsp_fundnos.add(24697)
# ADD REIT INDEX VGSLX
crsp_fundnos.add(31193)
# ADD VTMGX 
crsp_fundnos.add(31266)
placeholders = ', '.join(str(int(num)) for num in crsp_fundnos)

df = db.raw_sql(f"""
        SELECT 
            a.crsp_fundno,
            a.caldt,
            b.mret,
            fees.exp_ratio,
            names.fund_name,
            header.ticker
        FROM crsp_q_mutualfunds.monthly_nav a
        LEFT JOIN crsp_q_mutualfunds.monthly_returns b 
            ON a.crsp_fundno = b.crsp_fundno AND a.caldt = b.caldt
        LEFT JOIN crsp_q_mutualfunds.fund_names names
            ON a.crsp_fundno = names.crsp_fundno AND a.caldt BETWEEN names.chgdt AND names.chgenddt
        LEFT JOIN crsp_q_mutualfunds.fund_hdr header
            ON a.crsp_fundno = header.crsp_fundno
        LEFT JOIN crsp_q_mutualfunds.fund_fees fees
            ON a.crsp_fundno = fees.crsp_fundno AND a.caldt BETWEEN fees.begdt AND fees.enddt
        WHERE
            a.caldt >= '1976-08-01' AND a.caldt <= '2024-01-01'
            AND a.crsp_fundno IN ({placeholders})
    """)

expected_tickers = set(investor.keys()).union(admiral.keys())
expected_tickers.add('QQQ')
expected_tickers.add('VGSLX')
expected_tickers.add('VTMGX')
# Extract unique tickers from the DataFrame
unique_tickers = df['ticker'].unique()

# Assert that all expected tickers are present in the unique tickers from the DataFrame
missing_tickers = set(expected_tickers) - set(unique_tickers)
assert not missing_tickers, f"Missing tickers: {missing_tickers}"

extra_tickers = set(unique_tickers) - set(expected_tickers)
assert not extra_tickers, f"Extra tickers: {missing_tickers}"

print("All expected tickers are present in the DataFrame.")

df['crsp_fundno'] = df['crsp_fundno'].astype(int)
df['caldt'] = pd.to_datetime(df['caldt'])
df['month'] = df['caldt'].dt.month
df['year'] = df['caldt'].dt.year
df['ticker'] = df.groupby('crsp_fundno')['ticker'].ffill().bfill()
df['fund_name'] = df.groupby('crsp_fundno')['fund_name'].ffill().bfill()
df['exp_ratio'] = df.groupby('crsp_fundno')['exp_ratio'].ffill().bfill()
df['mret_gross'] = df['mret'] + df['exp_ratio'] / 12

All expected tickers are present in the DataFrame.


In [3]:
# Create a composite key
df['composite_key'] = df['crsp_fundno'].astype(str) + '-' + df['year'].astype(str) + '-' + df['month'].astype(str)

# Check for duplicates
duplicate_rows = df[df.duplicated('composite_key', keep=False)]

# If duplicate_rows is empty, then crsp_fundno, year, and month uniquely identify each row
if duplicate_rows.empty:
    print("crsp_fundno, year, and month uniquely identify each row.")
else:
    print(f"Found {len(duplicate_rows)} duplicate rows based on crsp_fundno, year, and month:")
    print(duplicate_rows)

# Alternatively, you can use an assertion
assert duplicate_rows.empty, f"Found duplicate rows based on crsp_fundno, year, and month: {duplicate_rows}"

crsp_fundno, year, and month uniquely identify each row.


In [4]:
# Assuming df is already defined and contains the required data
# List of expected tickers
expected_tickers = ['VFINX', 'VEXMX', 'NAESX', 'VEURX', 'VPACX', 'VVIAX', 'VBINX', 'VEIEX', 'VIMSX', 'VISGX', 'VISVX', 'QQQ', 'VGSLX']

# Group by ticker and count missing `mret` values
missing_mret_counts = df.groupby('ticker')['mret'].apply(lambda x: x.isna().sum())

# Display the counts
print(missing_mret_counts)

ticker
NAESX    0
QQQ      1
VBIAX    1
VBINX    1
VEIEX    1
VEMAX    1
VEURX    1
VEUSX    1
VEXAX    1
VEXMX    1
VFIAX    1
VFINX    1
VGSLX    1
VIMAX    1
VIMSX    1
VISGX    1
VISVX    1
VIVAX    1
VPACX    1
VPADX    1
VSGAX    1
VSIAX    1
VSMAX    1
VTMGX    1
VVIAX    1
Name: mret, dtype: int64


In [5]:
# Define ticker mappings
investor_to_admiral = {
    'VFINX': 'VFIAX',
    'VEXMX': 'VEXAX',
    'NAESX': 'VSMAX',
    'VEURX': 'VEUSX',
    'VPACX': 'VPADX',
    'VBINX': 'VBIAX',
    'VEIEX': 'VEMAX',
    'VIMSX': 'VIMAX',
    'VISGX': 'VSGAX',
    'VISVX': 'VSIAX',
    'VIVAX': 'VVIAX'
}

investor_to_admiral = {
    "VFINX": "VFIAX",
    "VGTSX": "VTMGX",
    "VDAIX": "VDADX",
    "VEIEX": "VEMAX",
    "VEURX": "VEUSX",
    "VEXMX": "VEXAX",
    "VFWIX": "VFWAX",
    "VFSVX": "VFSAX",
    "VFTSX": "VFTAX",
    "VGXRX": "VGRLX",
    "VIGRX": "VIGAX",
    "VHDYX": "VHYAX",
    "VIDMX": "VIAAX",
    "VIHIX": "VIHAX",
    "VLACX": "VLCAX",
    "VMGIX": "VMGMX",
    "VIMSX": "VIMAX",
    "VMVIX": "VMVAX",
    "VPACX": "VPADX",
    "VISGX": "VSGAX",
    "NAESX": "VSMAX",
    "VISVX": "VSIAX",
    "VTMSX": "VTMSX",
    "VTSMX": "VTSAX",
    "VTWSX": "VTWAX",
    "VIVAX": "VVIAX"
}
# Replace investor returns with admiral returns
for investor, admiral in investor_to_admiral.items():
    print(investor, admiral)
    # Create masks for investor and admiral shares
    investor_mask = df['ticker'] == investor
    admiral_mask = df['ticker'] == admiral

    # Align the returns based on the dates
    investor_returns = df[investor_mask].set_index('caldt')['mret_gross']
    admiral_returns = df[admiral_mask].set_index('caldt')['mret_gross']

    # Update the investor returns with admiral returns where available
    aligned_investor, aligned_admiral = investor_returns.align(admiral_returns, join='left')
    aligned_investor.update(aligned_admiral.dropna())

    # Update the original DataFrame
    df.loc[investor_mask, 'mret_gross'] = df.loc[investor_mask, 'caldt'].map(aligned_investor)

# Drop the admiral shares rows from the DataFrame
df = df[~df['ticker'].isin(investor_to_admiral.values())]

df

VFINX VFIAX
VGTSX VTMGX
VDAIX VDADX
VEIEX VEMAX
VEURX VEUSX
VEXMX VEXAX
VFWIX VFWAX
VFSVX VFSAX
VFTSX VFTAX
VGXRX VGRLX
VIGRX VIGAX
VHDYX VHYAX
VIDMX VIAAX
VIHIX VIHAX
VLACX VLCAX
VMGIX VMGMX
VIMSX VIMAX
VMVIX VMVAX
VPACX VPADX
VISGX VSGAX
NAESX VSMAX
VISVX VSIAX
VTMSX VTMSX
VTSMX VTSAX
VTWSX VTWAX
VIVAX VVIAX


Unnamed: 0,crsp_fundno,caldt,mret,exp_ratio,fund_name,ticker,month,year,mret_gross,composite_key
0,24697,1999-03-31,,0.0018,Nasdaq-100 Trust;1,QQQ,3,1999,,24697-1999-3
1,24697,1999-04-30,0.014148,0.0018,Nasdaq-100 Trust;1,QQQ,4,1999,0.014298,24697-1999-4
2,24697,1999-05-28,-0.021852,0.0018,Nasdaq-100 Trust;1,QQQ,5,1999,-0.021702,24697-1999-5
3,24697,1999-06-30,0.098838,0.0018,Nasdaq-100 Trust;1,QQQ,6,1999,0.098988,24697-1999-6
4,24697,1999-07-30,-0.011376,0.0018,Nasdaq-100 Trust;1,QQQ,7,1999,-0.011226,24697-1999-7
...,...,...,...,...,...,...,...,...,...,...
7704,31473,2016-05-31,0.018641,0.0018,Vanguard Index Funds: Vanguard Mid-Cap Index F...,VIMSX,5,2016,0.018538,31473-2016-5
7705,31473,2016-06-30,-0.000946,0.0018,Vanguard Index Funds: Vanguard Mid-Cap Index F...,VIMSX,6,2016,-0.000635,31473-2016-6
7706,31473,2016-07-29,0.046250,0.0018,Vanguard Index Funds: Vanguard Mid-Cap Index F...,VIMSX,7,2016,0.046452,31473-2016-7
7707,31473,2016-08-31,0.001417,0.0018,Vanguard Index Funds: Vanguard Mid-Cap Index F...,VIMSX,8,2016,0.001611,31473-2016-8


In [6]:
df = df.dropna(subset=['mret_gross'])
df = df[~((df['ticker'] == 'NAESX') & (df['caldt'] < '1990-01-01'))]

df['ticker'].unique()

array(['QQQ', 'VGSLX', 'VBINX', 'VBIAX', 'VPACX', 'VEURX', 'VEIEX',
       'VFINX', 'VEXMX', 'VIVAX', 'NAESX', 'VISVX', 'VISGX', 'VIMSX'],
      dtype=object)

In [7]:
df = df.sort_values(by=['ticker', 'caldt'])

# Get the inception date for each fund
inception_dates = df.groupby('ticker')['caldt'].min().reset_index()
inception_dates.columns = ['ticker', 'inception_date']
inception_dates = inception_dates.sort_values(by='inception_date')
sorted_funds = inception_dates['ticker'].tolist()
sorted_funds
inception_dates

Unnamed: 0,ticker,inception_date
7,VFINX,1976-09-30
6,VEXMX,1988-01-29
0,NAESX,1990-01-31
5,VEURX,1990-07-31
13,VPACX,1990-07-31
3,VBINX,1992-10-30
12,VIVAX,1992-12-31
4,VEIEX,1994-06-30
9,VIMSX,1998-06-30
10,VISGX,1998-06-30


In [8]:
orthogonal_basis = pd.DataFrame(index=df['caldt'].unique())

# Loop over each fund
for i, fund in enumerate(sorted_funds):
    fund_data = df[df['ticker'] == fund].set_index('caldt')['mret_gross']
    # Initialize the regression model
    if i > 0:
        valid_index = fund_data.dropna().index
        X = orthogonal_basis.loc[valid_index, orthogonal_basis.columns[:i]].values
        y = fund_data.dropna().values
        print(y.shape)
        # Fit the model
        reg = LinearRegression(fit_intercept=False)
        reg.fit(X, y)
        residuals = y - reg.predict(X)

        # Create a Series to align with the orthogonal_basis index
        residuals_series = pd.Series(data=residuals, index=valid_index)

        # Calculate the residuals
        orthogonal_basis[fund] = 0.0  # Initialize with zeros

        orthogonal_basis.loc[valid_index, fund] = residuals_series
    else:
        orthogonal_basis[fund] = fund_data.reindex(orthogonal_basis.index)

(432,)
(408,)
(402,)
(402,)
(375,)
(373,)
(355,)
(307,)
(307,)
(307,)
(297,)
(277,)
(265,)


In [9]:
path = 'mutual_fund_data/vanguard_orthogonal_basis.csv'
orthogonal_basis.to_csv(path, index=True)