In [1]:
import wrds
import pandas as pd
from datetime import datetime

def fetch_prices_for_dates(cusip_list, date_list):
    print("Fetching")
    cusip_list_str = "', '".join(cusip_list)
    date_list_str = "', '".join(pd.to_datetime(date_list).strftime('%Y-%m-%d'))
    
    query = f"""
        SELECT 
            cusip,
            datadate,
            ajexdi,
            prccd,
            trfd
        FROM 
            comp_na_daily_all.secd
        WHERE 
            cusip IN ('{cusip_list_str}') AND
            datadate IN ('{date_list_str}')
    """
    return db.raw_sql(query)

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
def first_non_na(row, columns):
    for col in columns:
        if pd.notna(row[col]):
            return row[col]
    return None

def downcast_numeric_columns(df):
    """
    Downcast numerical columns in a pandas DataFrame to reduce memory usage.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame to downcast.
    
    Returns:
    pd.DataFrame: DataFrame with downcasted numeric columns.
    """
    # Downcast integer columns
    int_cols = df.select_dtypes(include=['int', 'int64']).columns
    df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')
    
    # Downcast float columns
    float_cols = df.select_dtypes(include=['float', 'float64']).columns
    df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
    
    return df


# Establish a connection to the WRDS database
db = wrds.Connection(wrds_username='asherbaraban')

Loading library list...
Done


In [2]:
# Fetch insiders data
insiders_df = db.raw_sql(f"""
    SELECT
        dcn,
        seqnum,
        personid,
        owner,
        rolecode1,
        rolecode2,
        rolecode3,
        rolecode4,
        cname,
        ticker,
        sector,
        industry,
        trandate,
        tprice,
        ownership,
        cleanse,
        acqdisp,
        CONCAT(cusip6, cusip2) AS cusipI
    FROM tr_insiders.table1
    WHERE 
        formtype = '4' AND
        cleanse IN ('R', 'H') AND
        trancode IS NOT NULL AND
        acqdisp IS NOT NULL AND
        cusip6 IS NOT NULL AND
        cusip2 IS NOT NULL AND
        trandate <= '2023-12-31'
""")
insiders_df

Unnamed: 0,dcn,seqnum,personid,owner,rolecode1,rolecode2,rolecode3,rolecode4,cname,ticker,sector,industry,trandate,tprice,ownership,cleanse,acqdisp,cusipi
0,001249601,1.0,12086811.0,MAY EARLE C,D,,,,WEBFINANCIAL CORP,,01,02,2000-10-06,3.88,I,H,D,94767P20
1,001249601,2.0,12086811.0,MAY EARLE C,D,,,,WEBFINANCIAL CORP,,01,02,2000-10-19,3.88,I,H,D,94767P20
2,001249601,3.0,12086811.0,MAY EARLE C,D,,,,WEBFINANCIAL CORP,,01,02,2000-10-20,3.88,I,H,D,94767P20
3,001502679,4.0,12314118.0,IVY MADIE,B,,,,ESG RE LTD,ESREF,01,05,2000-12-04,2.13,I,R,A,G3121510
4,001502679,5.0,12314118.0,IVY MADIE,B,,,,ESG RE LTD,ESREF,01,05,2000-12-05,2.50,I,R,A,G3121510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216403,se00294559,1.0,16311594.0,KAZARIAN CAMILLE,CFO,O,EVP,,SUMMIT STATE BANK,SSBI,01,04,2023-04-03,13.85,D,H,A,86626420
216404,se00294560,1.0,16307338.0,DEL SECCO GENIE A,CO,O,EVP,,SUMMIT STATE BANK,SSBI,01,04,2023-04-03,13.85,D,H,A,86626420
216405,se00294561,1.0,16363634.0,CASTLIO MICHAEL J,O,EVP,,,SUMMIT STATE BANK,SSBI,01,04,2023-04-03,13.85,D,H,A,86626420
216406,se00294722,1.0,16296226.0,CORRIGAN MARGARET A,B,,,,HINGHAM INSTITUTION FOR SAVINGS,HIFS,01,04,2022-04-19,345.70,I,H,A,43332310


In [3]:
insiders_df['trandate'] = pd.to_datetime(insiders_df['trandate'])
object_cols = ['owner', 'rolecode1', 'rolecode2', 'rolecode3', 'rolecode4', 'cname', 'ticker', 'sector', 'ownership', 'cleanse', 'acqdisp', 'cusipi']
for col in object_cols:
    insiders_df[col] = insiders_df[col].astype('category')
insiders_df['dcn'] = insiders_df['dcn'].astype('string')
insiders_df['seqnum'] = pd.to_numeric(insiders_df['seqnum'], downcast='integer')
insiders_df['personid'] = pd.to_numeric(insiders_df['personid'], downcast='integer')

insiders_df.to_pickle("raw_insiders_pre_merge.pkl")

In [4]:
insiders_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8716408 entries, 0 to 216407
Data columns (total 18 columns):
 #   Column     Dtype         
---  ------     -----         
 0   dcn        string        
 1   seqnum     int16         
 2   personid   int32         
 3   owner      category      
 4   rolecode1  category      
 5   rolecode2  category      
 6   rolecode3  category      
 7   rolecode4  category      
 8   cname      category      
 9   ticker     category      
 10  sector     category      
 11  industry   object        
 12  trandate   datetime64[ns]
 13  tprice     float64       
 14  ownership  category      
 15  cleanse    category      
 16  acqdisp    category      
 17  cusipi     category      
dtypes: category(12), datetime64[ns](1), float64(1), int16(1), int32(1), object(1), string(1)
memory usage: 545.3+ MB
