In [None]:
import os

import pandas as pd
import pandas as pds  # original alias, kept so any code using `pds` still works
import wrds

# Establish a connection to the WRDS database.
# The account name is taken from the WRDS_USERNAME environment variable so the
# notebook can be shared without editing code; when the variable is unset the
# original account name is used.
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'asherbaraban'))

In [None]:
# Fetch insiders data from tr_insiders.table1.
# Only rows with formtype '4', cleanse code 'R' or 'H', and non-null
# trancode / acqdisp / CUSIP parts are requested. The three CUSIP parts are
# concatenated into one identifier; later cells refer to that column as
# `cusipi` (lowercase).
insiders_query = """
    SELECT
        dcn,
        seqnum,
        personid,
        owner,
        rolecode1,
        rolecode2,
        rolecode3,
        rolecode4,
        cname,
        ticker,
        sector,
        industry,
        trandate,
        tprice,
        ownership,
        cleanse,
        acqdisp,
        CONCAT(cusip6, cusip2, cusipx) AS cusipI
    FROM tr_insiders.table1
    WHERE 
        formtype = '4' AND
        cleanse IN ('R', 'H') AND
        trancode IS NOT NULL AND
        acqdisp IS NOT NULL AND
        cusip6 IS NOT NULL AND
        cusip2 IS NOT NULL AND
        cusipx IS NOT NULL
"""
insiders_df = db.raw_sql(insiders_query)
insiders_df

In [None]:
# Keep an untouched snapshot so the converted frame can be compared with it.
original_df = insiders_df.copy()

# Columns that are stored as pandas categoricals.
object_cols = ['owner', 'rolecode1', 'rolecode2', 'rolecode3', 'rolecode4', 'cname', 'ticker', 'sector', 'ownership', 'cleanse', 'acqdisp', 'cusipi']

# Apply the category / string conversions in a single pass.
column_dtypes = dict.fromkeys(object_cols, 'category')
column_dtypes['dcn'] = 'string'
insiders_df = insiders_df.astype(column_dtypes)

# Dates become datetime64; the numeric ids are downcast to the smallest integer type.
insiders_df['trandate'] = pd.to_datetime(insiders_df['trandate'])
for id_column in ('seqnum', 'personid'):
    insiders_df[id_column] = pd.to_numeric(insiders_df[id_column], downcast='integer')

# Persist the converted frame for the later reload cell.
insiders_df.to_csv("raw_insiders_pre_merge.csv", index=False)

# Report any cell-level differences between the snapshot and the converted frame.
comparison = original_df.compare(insiders_df)
if not comparison.empty:
    print("Information loss detected:")
    print(comparison)
else:
    print("No information loss detected in all columns.")

In [None]:
# Print the column dtypes, non-null counts and memory usage of insiders_df.
insiders_df.info()

In [None]:
# Reload the insiders frame from the CSV written earlier and restore its dtypes.
object_cols = ['owner', 'rolecode1', 'rolecode2', 'rolecode3', 'rolecode4', 'cname', 'ticker', 'sector', 'ownership', 'cleanse', 'acqdisp', 'cusipi']

# Identifier and code columns are read explicitly as text. Without `dtype`,
# read_csv re-infers every column, so all-digit identifiers (e.g. a `dcn` or
# `cusipi` value with leading zeros) can be parsed as numbers and no longer
# match the string keys used when merging with the price data.
insiders_df = pd.read_csv(
    "raw_insiders_pre_merge.csv",
    dtype={'dcn': 'string', **{col: str for col in object_cols}},
)
insiders_df['trandate'] = pd.to_datetime(insiders_df['trandate'])
for col in object_cols:
    insiders_df[col] = insiders_df[col].astype('category')
insiders_df['seqnum'] = pd.to_numeric(insiders_df['seqnum'], downcast='integer')
insiders_df['personid'] = pd.to_numeric(insiders_df['personid'], downcast='integer')

In [None]:
from datetime import datetime

def fetch_prices_for_date(cusip_list, date):
    """Query comp_na_daily_all.secd for the given CUSIPs on a single date.

    Parameters
    ----------
    cusip_list : iterable of str
        CUSIP strings placed in the SQL ``IN`` list.
    date : anything accepted by ``pd.to_datetime``
        The single ``datadate`` to match; formatted as YYYY-MM-DD.

    Returns
    -------
    Whatever ``db.raw_sql`` returns for the query, with the columns
    cusip, datadate, prccd, ajexdi and trfd selected.

    Notes
    -----
    The values are interpolated directly into the SQL text, so only
    trusted identifiers should be passed in.
    """
    quoted_cusips = "', '".join(cusip_list)
    day = pd.to_datetime(date).strftime('%Y-%m-%d')
    sql = f"""
        SELECT 
            cusip,
            datadate,
            prccd,
            ajexdi,
            trfd
        FROM 
            comp_na_daily_all.secd
        WHERE 
            cusip IN ('{quoted_cusips}') AND
            datadate = '{day}'
    """
    return db.raw_sql(sql)

def fetch_prices_for_dates(cusip_list, date_list):
    """Query comp_na_daily_all.secd for the given CUSIPs on any of several dates.

    Parameters
    ----------
    cusip_list : iterable of str
        CUSIP strings placed in the SQL ``IN`` list.
    date_list : list-like accepted by ``pd.to_datetime``
        Dates to match against ``datadate``; each is formatted as YYYY-MM-DD.

    Returns
    -------
    Whatever ``db.raw_sql`` returns for the query, with the columns
    cusip, datadate, prccd, ajexdi and trfd selected.

    Notes
    -----
    Prints "Fetching" before building the query. The values are
    interpolated directly into the SQL text, so only trusted identifiers
    should be passed in.
    """
    print("Fetching")
    quoted_cusips = "', '".join(cusip_list)
    formatted_dates = pd.to_datetime(date_list).strftime('%Y-%m-%d')
    quoted_dates = "', '".join(formatted_dates)

    sql = f"""
        SELECT 
            cusip,
            datadate,
            prccd,
            ajexdi,
            trfd
        FROM 
            comp_na_daily_all.secd
        WHERE 
            cusip IN ('{quoted_cusips}') AND
            datadate IN ('{quoted_dates}')
    """
    return db.raw_sql(sql)

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# Number of dates sent to the database per query.
DATE_CHUNK_SIZE = 3000

cusip_list = insiders_df['cusipi'].unique().tolist()

# Missing transaction dates are dropped up front: a NaT cannot be formatted
# into the SQL date list and would make the string join fail.
_trandates = insiders_df['trandate'].dropna()
# The 6-month offset is computed once and reused for the +1 / +2 day variants.
_trandates_6mo = _trandates + pd.DateOffset(months=6)

trandate_list = _trandates.unique().tolist()

# Generate the additional dates
trandate_6mo_list = _trandates_6mo.unique().tolist()
trandate_6mo_1d_list = (_trandates_6mo + pd.DateOffset(days=1)).unique().tolist()
trandate_6mo_2d_list = (_trandates_6mo + pd.DateOffset(days=2)).unique().tolist()

# Combine all date lists; sorting makes the chunk contents reproducible
# between runs (plain set iteration order is not guaranteed).
combined_date_list = sorted(set(trandate_list + trandate_6mo_list + trandate_6mo_1d_list + trandate_6mo_2d_list))
print(len(combined_date_list))
start_time = datetime.now()
print(start_time)
date_chunks = list(chunks(combined_date_list, DATE_CHUNK_SIZE))

# Collect the per-chunk frames and concatenate once at the end; concatenating
# onto a growing frame inside the loop re-copies all earlier rows every time.
chunk_frames = []
for i, date_chunk in enumerate(date_chunks):
    chunk_start_time = datetime.now()
    print(f"Processing chunk {i+1}/{len(date_chunks)}")

    chunk_prices_df = fetch_prices_for_dates(cusip_list, date_chunk)
    chunk_prices_df['datadate'] = pd.to_datetime(chunk_prices_df['datadate'])
    chunk_frames.append(chunk_prices_df)

    chunk_end_time = datetime.now()
    print(f"Chunk {i+1} processed in: {chunk_end_time - chunk_start_time}")

# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no dates to fetch.
final_prices_df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()

end_time = datetime.now()
print(f"Data fetched in: {end_time - start_time}")

# Verify the concatenated DataFrame
print(f"Total rows fetched: {len(final_prices_df)}")
print(final_prices_df.head())

In [None]:
# Save the fetched prices to disk. index=True also writes the row index as the
# first (unnamed) CSV column, unlike the other to_csv calls in this notebook.
final_prices_df.to_csv("raw_prices.csv", index=True)

In [None]:
# Display the fetched price frame as the cell output.
final_prices_df

In [None]:
# Lookup dates for the end-of-period price: six months after the transaction
# date, plus the following one and two calendar days.
six_months_out = insiders_df['trandate'] + pd.DateOffset(months=6)
insiders_df['trandate_6mo'] = six_months_out
insiders_df['trandate_6mo_1'] = six_months_out + pd.DateOffset(days=1)
insiders_df['trandate_6mo_2'] = six_months_out + pd.DateOffset(days=2)

In [None]:
# Attach the price row for each transaction's own (cusipi, trandate) pair.
# A left join keeps every insider row even when no price was found.
insiders_df = pd.merge(
    insiders_df,
    final_prices_df,
    how='left',
    left_on=['cusipi', 'trandate'],
    right_on=['cusip', 'datadate'],
)

In [None]:
# Left-join the price frame once per 6-month lookup date. Each pass reads the
# date from `trandate<suffix>` and tags the newly added price columns with the
# same suffix, leaving the columns already present unchanged.
for suffix in ('_6mo', '_6mo_1', '_6mo_2'):
    insiders_df = insiders_df.merge(
        final_prices_df,
        left_on=['cusipi', f'trandate{suffix}'],
        right_on=['cusip', 'datadate'],
        how='left',
        suffixes=('', suffix),
    )

insiders_df.info()

In [None]:
def first_non_na(row, columns):
    """Return the first non-missing value of ``row`` among ``columns``.

    ``columns`` is scanned in order and ``row[col]`` is tested with
    ``pd.notna``. ``None`` is returned when every listed entry is missing
    or when ``columns`` is empty.
    """
    return next((row[name] for name in columns if pd.notna(row[name])), None)

# List of columns to check in order of priority: the exact 6-month date
# first, then the one-day-later and two-days-later lookups.
price_columns = ['prccd_6mo', 'prccd_6mo_1', 'prccd_6mo_2']
adjustment_columns = ['ajexdi_6mo', 'ajexdi_6mo_1', 'ajexdi_6mo_2']
total_return_columns = ['trfd_6mo', 'trfd_6mo_1', 'trfd_6mo_2']


def _coalesce_columns(frame, columns):
    """Return, per row, the first non-missing value among ``columns``.

    Vectorized replacement for a row-wise ``DataFrame.apply``: start from the
    highest-priority column and fill its gaps from each fallback in turn.
    Rows where every column is missing stay NaN.
    """
    primary, *fallbacks = columns
    result = frame[primary]
    for fallback in fallbacks:
        result = result.fillna(frame[fallback])
    return result


# Each value is consolidated independently, so price, adjustment factor and
# total-return factor may come from different lookup dates when only some of
# them are missing on a given date.
insiders_df['prccd_6mo_consolidated'] = _coalesce_columns(insiders_df, price_columns)
insiders_df['ajexdi_6mo_consolidated'] = _coalesce_columns(insiders_df, adjustment_columns)
insiders_df['trfd_6mo_consolidated'] = _coalesce_columns(insiders_df, total_return_columns)

In [None]:
# Prices divided by their adjustment factor, at the transaction date and at
# the consolidated 6-month date.
begin_price = insiders_df['prccd'] / insiders_df['ajexdi']
end_price = insiders_df['prccd_6mo_consolidated'] / insiders_df['ajexdi_6mo_consolidated']
insiders_df['prccd_adjusted_begin'] = begin_price
insiders_df['prccd_adjusted_end'] = end_price

# Scale both adjusted prices by their total-return factor; the ratio of the
# end value to the begin value, minus one, is the 6-month total return.
begin_value = begin_price * insiders_df['trfd']
end_value = end_price * insiders_df['trfd_6mo_consolidated']
insiders_df['total_return_6mo'] = end_value / begin_value - 1

# Persist the merged frame together with the computed returns.
insiders_df.to_csv("merged_prices_insiders.csv", index=False)

In [None]:
import matplotlib.pyplot as plt
# Tag every transaction with the calendar year of its `trandate`
# (the column must already be datetime-typed).
insiders_df['trandate_year'] = insiders_df['trandate'].dt.year

# Mean 6-month total return per (year, acqdisp) pair, keeping only the rows
# whose acqdisp value is 'A'.
annual_return = (
    insiders_df
    .groupby(['trandate_year', 'acqdisp'])['total_return_6mo']
    .mean()
    .reset_index()
    .loc[lambda grouped: grouped['acqdisp'] == 'A']
)

# Line plot of the yearly averages.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(annual_return['trandate_year'], annual_return['total_return_6mo'], marker='o', linestyle='-')
ax.set_xlabel('Year')
ax.set_ylabel('Average Total Return (6 Months)')
ax.set_title('Average 6-Month Total Return by Year')
ax.grid(False)
plt.show()


In [None]:
# Show the distinct values found in the `industry` column.
insiders_df['industry'].unique()

In [None]:
# Count the rows where all four merged `cusip*` key columns are NA, i.e. no
# price row matched on the transaction date or on any 6-month lookup date.
num_all_na = insiders_df[['cusip', 'cusip_6mo', 'cusip_6mo_1', 'cusip_6mo_2']].isna().all(axis=1).sum()

In [None]:
# Display the number of rows with no matched price on any lookup date.
num_all_na

In [None]:
# (rows, columns) of the merged frame, for comparison with the count of unmatched rows.
insiders_df.shape

In [None]:

# Select the rows for which none of the price merges found a match: all four
# merged `cusip*` key columns are NA.
merged_key_columns = ['cusip', 'cusip_6mo', 'cusip_6mo_1', 'cusip_6mo_2']
no_match_mask = insiders_df[merged_key_columns].isna().all(axis=1)
na_rows = insiders_df.loc[no_match_mask]

# Display the rows
print(na_rows)

In [None]:
# Show the identifying fields of the rows that have no matched price.
na_rows[['cusipi', 'ticker', 'trandate', 'cname']]

In [None]:
# `na_rows` is a filtered selection of insiders_df, so assigning a column to
# it directly triggers pandas' SettingWithCopyWarning. `.assign` builds a new
# frame with the year column instead.
na_rows = na_rows.assign(trandate_year=na_rows['trandate'].dt.year)

# Calculate the distribution of years
year_distribution = na_rows['trandate_year'].value_counts().sort_index()

# Bar chart of the number of unmatched rows per transaction year.
ax = year_distribution.plot(kind='bar', figsize=(10, 6))
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Distribution of Years for trandate')
plt.show()

In [None]:
def fetch_prices(cusip_list):
    """Query comp_na_daily_all.secd for every row whose ``tic`` is listed.

    Parameters
    ----------
    cusip_list : iterable of str
        Despite the parameter name, the values are matched against the
        ``tic`` column, not ``cusip``.

    Returns
    -------
    Whatever ``db.raw_sql`` returns for the query, with the columns
    cusip, datadate, prccd, ajexdi, trfd, tic and conm selected. No date
    filter is applied.

    Notes
    -----
    Prints "Fetching" before building the query. The values are
    interpolated directly into the SQL text, so only trusted identifiers
    should be passed in.
    """
    print("Fetching")
    quoted_values = "', '".join(cusip_list)

    sql = f"""
        SELECT 
            cusip,
            datadate,
            prccd,
            ajexdi,
            trfd,
            tic,
            conm
        FROM 
            comp_na_daily_all.secd
        WHERE 
            tic IN ('{quoted_values}')
    """
    return db.raw_sql(sql)

# Spot-check: fetch every available price row for the ticker "IFCR".
x = fetch_prices(["IFCR"])

In [None]:
# Earliest `datadate` present in the spot-check result.
x['datadate'].min()

In [None]:
# List the column labels of the merged insiders frame.
insiders_df.columns