In [1]:
import wrds
import pandas as pd
from datetime import datetime

def fetch_prices_for_dates(cusip_list, date_list):
    print("Fetching")
    cusip_list_str = "', '".join(cusip_list)
    date_list_str = "', '".join(pd.to_datetime(date_list).strftime('%Y-%m-%d'))
    
    query = f"""
        SELECT 
            cusip,
            datadate,
            ajexdi,
            prccd,
            trfd,
            secstat
        FROM 
            comp_na_daily_all.secd
        WHERE 
            cusip IN ('{cusip_list_str}') AND
            datadate IN ('{date_list_str}')
    """
    return db.raw_sql(query)

def fetch_CRSP(cusip_list, date_list):
    print("Fetching")
    cusip_list_str = "', '".join(cusip_list)
    date_list_str = "', '".join(pd.to_datetime(date_list).strftime('%Y-%m-%d'))
    
    query = f"""
        SELECT 
            Cusip,
            date,
            prc,
            cfacshr,
            cfacpr
        FROM 
            crsp_a_stock.dsf
        WHERE 
            Cusip IN ('{cusip_list_str}') AND
            date IN ('{date_list_str}')
    """
    return db.raw_sql(query)


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
def first_non_na(row, columns):
    for col in columns:
        if pd.notna(row[col]):
            return row[col]
    return None

def downcast_numeric_columns(df):
    """
    Downcast numerical columns in a pandas DataFrame to reduce memory usage.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame to downcast.
    
    Returns:
    pd.DataFrame: DataFrame with downcasted numeric columns.
    """
    # Downcast integer columns
    int_cols = df.select_dtypes(include=['int', 'int64']).columns
    df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')
    
    # Downcast float columns
    float_cols = df.select_dtypes(include=['float', 'float64']).columns
    df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
    
    return df


# Establish a connection to the WRDS database
db = wrds.Connection(wrds_username='asherbaraban')

Loading library list...
Done


In [3]:
insiders_df = pd.read_pickle("raw_insiders_pre_merge.pkl")
insiders_df['trandate'] = pd.to_datetime(insiders_df['trandate'])
insiders_df['trandate_6mo'] = insiders_df['trandate'] + pd.DateOffset(months=6)
insiders_df['trandate_6mo_1'] = insiders_df['trandate'] + pd.DateOffset(months=6) +  pd.DateOffset(days=1)
insiders_df['trandate_6mo_2'] = insiders_df['trandate'] + pd.DateOffset(months=6) +  pd.DateOffset(days=2)
insiders_df['trandate_6mo'] = pd.to_datetime(insiders_df['trandate_6mo'])
insiders_df['trandate_6mo_1'] = pd.to_datetime(insiders_df['trandate_6mo_1'])
insiders_df['trandate_6mo_2'] = pd.to_datetime(insiders_df['trandate_6mo_2'])
insiders_df = downcast_numeric_columns(insiders_df)

object_cols = ['owner', 'rolecode1', 'rolecode2', 'rolecode3', 'rolecode4', 'cname', 'ticker', 'sector', 'ownership', 'cleanse', 'acqdisp', 'cusipi']
for col in object_cols:
    insiders_df[col] = insiders_df[col].astype('category')
insiders_df['dcn'] = insiders_df['dcn'].astype('string')
insiders_df['seqnum'] = pd.to_numeric(insiders_df['seqnum'], downcast='integer')
insiders_df['personid'] = pd.to_numeric(insiders_df['personid'], downcast='integer')

db = wrds.Connection(wrds_username='asherbaraban')
sector_etfs = {
    1: "XLF",  # Finance
    2: "XLV",  # Healthcare
    3: "IYK",  # Consumer Non-Durable
    4: "XLY",  # Consumer Services
    5: "XHB",  # Consumer Durables
    6: "XLE",  # Energy
    7: "IYT",  # Transportation
    8: "XLK",  # Technology
    9: "XLB",  # Basic Industries
    10: "XLI",  # Capital Goods
    11: "XLU",  # Public Utilities
    99: "VTI",  # Miscellaneous
}

benchmark_tickers = sector_etfs.values()
tickers_placeholder = ', '.join(f"'{ticker}'" for ticker in benchmark_tickers)

query = f"""
        SELECT 
            Cusip,
            datadate,
            ajexdi,
            trfd,
            prccd,
            tic
        FROM 
            comp_na_daily_all.secd
        WHERE 
            tic IN ({tickers_placeholder}) AND
            datadate = '2023-10-02'
    """
prices = db.raw_sql(query)

ticker_to_cusip = prices.set_index('tic')['cusip'].to_dict()
# Generate the additional dates
trandate_list = insiders_df['trandate'].unique().tolist()
trandate_6mo_list = (insiders_df['trandate'] + pd.DateOffset(months=6)).unique().tolist()
trandate_6mo_1d_list = (insiders_df['trandate'] + pd.DateOffset(months=6) +  pd.DateOffset(days=1)).unique().tolist()
trandate_6mo_2d_list = (insiders_df['trandate'] + pd.DateOffset(months=6) +  pd.DateOffset(days=2)).unique().tolist()

# Combine all date lists
combined_date_list = list(set(trandate_list + trandate_6mo_list + trandate_6mo_1d_list + trandate_6mo_2d_list))

# Now query the right dates to get total returns for all of the right intervals 
# etf_prices = fetch_prices_for_dates(ticker_to_cusip.values(), combined_date_list)

first_8_digits_cusips = [cusip[:8] for cusip in ticker_to_cusip.values()]

etf_prices = fetch_CRSP(first_8_digits_cusips, combined_date_list)
etf_prices['datadate'] = pd.to_datetime(etf_prices['datadate'])
insiders_df['sector_ticker'] = insiders_df['sector'].map(sector_etfs)
insiders_df['sector_cusip'] = insiders_df['sector_ticker'].map(ticker_to_cusip)
etf_prices['cusip'] = etf_prices['cusip'].astype('category')
etf_prices.dropna(subset=['datadate'], inplace=True)
# etf_prices['trfd'].fillna(1, inplace=True)
# etf_prices['ajexdi'].fillna(1, inplace=True)
etf_prices['price_adj'] = (etf_prices['prccd'] * etf_prices['trfd']) / etf_prices['ajexdi']
etf_prices.drop(columns = ['prccd', 'trfd', 'ajexdi', 'secstat'], inplace=True)
# etf_prices.to_pickle("etf_prices.pkl")

Loading library list...
Done
Fetching


KeyError: 'datadate'

In [None]:
dir(db)


In [None]:
db.get_table(library='crsp_a_stock', table='dsf', obs=100)

In [4]:
etf_prices['price_adj'] = (etf_prices['prc'] * etf_prices['cfacshr']) / etf_prices['cfacpr']

Unnamed: 0,cusip,date,prc,cfacshr,cfacpr
0,81369Y10,1998-12-22,20.82813,1.0,1.0
1,81369Y10,1998-12-23,21.04688,1.0,1.0
2,81369Y10,1998-12-24,21.53125,1.0,1.0
3,81369Y10,1998-12-28,21.34375,1.0,1.0
4,81369Y10,1998-12-29,21.73438,1.0,1.0
...,...,...,...,...,...
71563,78464A88,2023-11-08,76.93000,1.0,1.0
71564,78464A88,2023-11-09,75.94000,1.0,1.0
71565,78464A88,2023-11-10,77.52000,1.0,1.0
71566,78464A88,2023-11-13,77.23000,1.0,1.0


In [None]:
ticker_to_cusip.values()

In [5]:
compustat = pd.read_pickle("etf_prices.pkl")

In [6]:
compustat

Unnamed: 0,cusip,datadate,price_adj
0,81369Y100,1998-12-22,20.828125
1,81369Y100,1998-12-23,21.046875
2,81369Y100,1998-12-24,21.531250
3,81369Y100,1998-12-28,21.343750
4,81369Y100,1998-12-29,21.734375
...,...,...,...
73627,78464A888,2024-07-26,139.931256
73628,78464A888,2024-07-29,140.529816
73629,78464A888,2024-07-30,141.463569
73630,78464A888,2024-07-31,141.487511
