In [1]:
import wrds
import pandas as pd
from datetime import datetime

# Function to execute the query for each chunk
def execute_chunk_query(values_list):
    """
    Run the six-month return summary query for one chunk of (cusip, date) pairs.

    Parameters:
    values_list (str): Comma-separated SQL tuples of the form
        ('<cusip>', '<date>'::date), as built by construct_values_list.
        It is interpolated verbatim into a VALUES clause, so it must be
        non-empty and already valid SQL.

    Returns:
    The result of db.raw_sql(query); downstream code concatenates it as a
    pandas DataFrame. One row is produced per (cusip, trandate, permno)
    group with columns cusip, trandate, end_date, first_date, last_date,
    n_periods, n_miss, min_ret, max_ret, cum_return and permno.

    Notes:
    - Relies on a module-level WRDS connection named `db` being open.
    - Pairs with no matching crspq.dsf row still appear (LEFT JOIN) with a
      NULL permno, n_periods = 0, n_miss = 1 (the single all-NULL joined
      row) and -99 as the sentinel for min_ret, max_ret and cum_return.
    - cum_return compounds daily returns as exp(sum(ln(1 + ret))) - 1.
      PostgreSQL's LOG() is base 10, so LN() is required for EXP() to
      invert it.
    """
    # column1 / column2 are the default names PostgreSQL gives VALUES columns.
    query = f"""
        WITH cusip_dates AS (
          VALUES
            {values_list}
        )
        SELECT 
          cd.column1 AS cusip, 
          cd.column2 AS trandate,
          cd.column2 + INTERVAL '6 months' AS end_date,
          COALESCE(MIN(d.date), cd.column2) AS first_date, 
          COALESCE(MAX(d.date), cd.column2 + INTERVAL '6 months') AS last_date,
          COUNT(d.ret) AS n_periods,
          SUM(CASE WHEN d.ret IS NULL THEN 1 ELSE 0 END) AS n_miss,
          COALESCE(MIN(d.ret), -99) AS min_ret,
          COALESCE(MAX(d.ret), -99) AS max_ret,
          COALESCE(EXP(SUM(LN(1 + d.ret))) - 1, -99) AS cum_return,
          d.permno
        FROM cusip_dates cd
        LEFT JOIN crspq.dsf d
          ON d.cusip = cd.column1
          AND d.date BETWEEN cd.column2 AND cd.column2 + INTERVAL '6 months'
        GROUP BY cd.column1, cd.column2, d.permno;
    """
    return db.raw_sql(query)

def construct_values_list(df_chunk):
    """
    Render each row of df_chunk as a SQL VALUES tuple and join them.

    Parameters:
    df_chunk (pd.DataFrame): Frame with 'cusipi' and 'trandate' columns.

    Returns:
    str: Tuples of the form ('<cusipi>', '<trandate>'::date) separated by
    ", "; an empty string when df_chunk has no rows.
    """
    sql_tuples = []
    for _index, record in df_chunk.iterrows():
        cusip = record['cusipi']
        tran_date = record['trandate']
        # Values are embedded as plain text; no quoting/escaping is applied.
        sql_tuples.append(f"('{cusip}', '{tran_date}'::date)")
    return ", ".join(sql_tuples)

def downcast_numeric_columns(df):
    """
    Shrink integer and float columns of a DataFrame to smaller dtypes.

    The frame passed in is modified in place and also returned, so the
    caller's object and the return value are the same DataFrame.

    Parameters:
    df (pd.DataFrame): Frame whose numeric columns should be downcast.

    Returns:
    pd.DataFrame: The same frame, with the selected integer columns passed
    through pd.to_numeric(downcast='integer') and the selected float
    columns through pd.to_numeric(downcast='float').
    """
    # (dtypes to select, pd.to_numeric downcast target), integers first.
    downcast_plan = (
        (['int', 'int64'], 'integer'),
        (['float', 'float64'], 'float'),
    )
    for dtype_names, target in downcast_plan:
        selected = df.select_dtypes(include=dtype_names).columns
        df[selected] = df[selected].apply(pd.to_numeric, downcast=target)

    return df


In [2]:
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the
# file; only load pickles that come from a trusted source.
insiders_df = pd.read_pickle("raw_insiders_pre_merge.pkl")

# Keep only the two query keys. The explicit copy makes this an independent
# frame, so the column assignment below cannot raise SettingWithCopyWarning.
insiders_df = insiders_df[['trandate', 'cusipi']].copy()
# Truncate to the first 8 characters before joining on crspq.dsf.cusip
# (presumably because CRSP stores 8-character CUSIPs -- TODO confirm).
insiders_df['cusipi'] = insiders_df['cusipi'].str[:8]
insiders_df = insiders_df.drop_duplicates()
print(insiders_df.shape)
chunk_size = 100000

# Collect per-chunk results and concatenate once at the end; concatenating
# inside the loop re-copies all earlier rows on every iteration.
chunk_frames = []
for start in range(0, len(insiders_df), chunk_size):
    chunk_start_time = datetime.now()
    # execute_chunk_query reads the module-level name `db`, so the
    # connection has to be bound to exactly that name. A fresh connection is
    # opened for every chunk, as before.
    db = wrds.Connection(wrds_username='asherbaraban')
    try:
        df_chunk = insiders_df.iloc[start:start + chunk_size]
        values_list = construct_values_list(df_chunk)

        chunk_df = execute_chunk_query(values_list)
        chunk_frames.append(chunk_df)
        chunk_end_time = datetime.now()
        print(f"Chunk processed in: {chunk_end_time - chunk_start_time}")
    finally:
        # Always release the connection, even when the query fails.
        db.close()

# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no rows to query.
final_prices_df = (
    pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()
)


(2305361, 2)
Loading library list...
Done
Chunk processed in: 0:05:03.059654
Loading library list...
Done
Chunk processed in: 0:04:51.664306
Loading library list...
Done
Chunk processed in: 0:04:29.444347
Loading library list...
Done
Chunk processed in: 0:05:53.402869
Loading library list...
Done
Chunk processed in: 0:03:49.559987
Loading library list...
Done
Chunk processed in: 0:04:14.896606
Loading library list...
Done
Chunk processed in: 0:03:41.791088
Loading library list...
Done
Chunk processed in: 0:04:29.757734
Loading library list...
Done
Chunk processed in: 0:04:33.212714
Loading library list...
Done
Chunk processed in: 0:04:13.990212
Loading library list...
Done
Chunk processed in: 0:03:58.788501
Loading library list...
Done
Chunk processed in: 0:04:11.294153
Loading library list...
Done
Chunk processed in: 0:04:14.494552
Loading library list...
Done
Chunk processed in: 0:03:32.047160
Loading library list...
Done
Chunk processed in: 0:03:46.066892
Loading library list...
Don

In [3]:
# Persist the combined query results so later steps can reload them
# without re-running the WRDS queries.
pd.to_pickle(final_prices_df, "raw_prices.pkl")

In [5]:
# Number of rows without a permno. d.permno comes from the LEFT JOINed
# crspq.dsf table, so it is NULL when no dsf row matched the cusip/window.
pd.isna(final_prices_df['permno']).sum()

341748

In [6]:
# Number of rows carrying the -99 sentinel that the query substitutes when
# MAX(d.ret) is NULL, i.e. no non-null return was found in the window.
final_prices_df['max_ret'].eq(-99).sum()

347708

In [12]:
# Rows that do have a permno but no non-null return in the window.
# Both conditions are parenthesised because `&` binds tighter than `==`;
# without the parentheses the expression is evaluated as
# n_periods == (0 & ~permno.isna()) and the permno condition is lost.
((final_prices_df['n_periods'] == 0) & final_prices_df['permno'].notna()).sum()

0

In [13]:
# Summary statistics for n_periods, the per-group COUNT of non-null d.ret
# values found in the six-month window.
final_prices_df.loc[:, 'n_periods'].describe()

count    2.305361e+06
mean     1.060385e+02
std      4.631915e+01
min      0.000000e+00
25%      1.250000e+02
50%      1.260000e+02
75%      1.280000e+02
max      1.300000e+02
Name: n_periods, dtype: float64