In [6]:
import os, sys
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm
import wrds
from multiprocessing import Pool
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds

# Identify files

In [7]:
# Main directory (relative path, resolved against the notebook's working directory)
main_folder = '../../../../GitHub/High-Freq-ML'

# Get list of CRSP files.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to make every downstream step see the files in the same order on each run.
crsp_price_folder = f'{main_folder}/data/crsp/daily/'
crsp_price_files = sorted(glob.glob(crsp_price_folder + '*.parquet'))

# File-name stem of each file (text before the first '.'), aligned with crsp_price_files.
# os.path.basename is used instead of splitting on '/' so this also works when
# the OS path separator is not '/'.
# NOTE(review): stems are presumably YYYYMMDD date strings -- confirm against the data folder.
crsp_price_files_dates = [os.path.basename(x).split('.')[0] for x in crsp_price_files]

# Main

In [8]:
def get_crsp_me(date_header):
    """Compute end-of-day market equity from the CRSP daily files of one period.

    Reads every path in the module-level ``crsp_price_files`` list whose file
    name starts with ``date_header``, stacks them, and derives market-equity
    columns per security and per firm.

    Parameters
    ----------
    date_header : str
        Prefix of the file names to load (the caller passes the first six
        characters of each file stem).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with columns:

        * ``permno``     -- security id as a categorical of strings; the
          categories are only those present in the files loaded by this call.
        * ``me``         -- ``abs(prc) * shrout`` for the security.
        * ``meq``        -- sum of ``me`` over all rows sharing the same
          ``permco`` and ``date``.
        * ``is_primary`` -- True where the row's ``me`` equals the largest
          ``me`` within its ``permco``/``date`` group. Ties are all flagged
          True; rows with missing ``me`` are flagged False.

    Raises
    ------
    ValueError
        If no file in ``crsp_price_files`` matches ``date_header``.
    """
    # Load data.
    # Match on the file name only, so parent directory names and the OS path
    # separator cannot influence which files are picked up.
    filenames = [x for x in crsp_price_files
                 if os.path.basename(x).startswith(date_header)]
    if not filenames:
        # pd.concat would raise an opaque "No objects to concatenate" here;
        # keep the exception type but say which period is missing.
        raise ValueError(f'No CRSP daily files found for date header {date_header!r}')
    crsp_df = pd.concat((pd.read_parquet(x) for x in filenames), ignore_index = True)

    # Clean
    crsp_df['date'] = pd.to_datetime(crsp_df['date'], format = '%Y%m%d')
    # abs() because prc may be stored negative
    # NOTE(review): presumably the CRSP bid/ask-average convention -- confirm.
    # The cfacshr / cfacpr adjustment factors are not applied.
    crsp_df['me'] = crsp_df['prc'].abs()*crsp_df['shrout']

    # Firm-level aggregates: total and largest security ME per permco and day
    me_by_firm_day = crsp_df.groupby(["permco", "date"])["me"]
    firm_me_total = me_by_firm_day.transform("sum")
    firm_me_max = me_by_firm_day.transform("max")

    crsp_df['meq'] = firm_me_total
    crsp_df["permno"] = pd.Categorical(crsp_df['permno'].astype(str))
    # The comparison already yields a boolean mask; no np.where round-trip needed
    crsp_df["is_primary"] = (crsp_df['me'] == firm_me_max).to_numpy(dtype = bool)

    return crsp_df[['date', 'permno', 'me', 'meq', 'is_primary']].set_index('date')

In [10]:
%%time
# Distinct 6-character prefixes of the file stems (one work item per prefix);
# np.unique also returns them sorted.
crsp_price_files_yyyymm = np.unique([stem[:6] for stem in crsp_price_files_dates])
n_periods = len(crsp_price_files_yyyymm)
df_list = []

# Fan the periods out over 12 worker processes. imap_unordered yields each
# frame as soon as its worker finishes, so df_list is in completion order,
# not chronological order.
with Pool(12) as p:
    finished_frames = p.imap_unordered(get_crsp_me, crsp_price_files_yyyymm)
    for df in tqdm(finished_frames, total = n_periods):
        df_list.append(df)

  0%|          | 0/1140 [00:00<?, ?it/s]

CPU times: user 9.75 s, sys: 10.3 s, total: 20.1 s
Wall time: 21.9 s


In [11]:
%%time
# Stack the frames collected in df_list row-wise into a single frame
# (the existing index of each frame is kept).
# NOTE(review): if the frames carry categorical columns whose category sets
# differ, pandas may fall back to object dtype on concat -- confirm the
# resulting dtypes if memory or groupby speed matters.
crsp_me_df = pd.concat(objs = df_list, axis = 0)

CPU times: user 1.97 s, sys: 2.55 s, total: 4.52 s
Wall time: 4.54 s


In [12]:
%%time
# Add lagged ME
# Order rows by date with a stable sort so rows sharing a date keep their
# current relative order.
crsp_me_df = crsp_me_df.sort_values(by = 'date', kind = 'mergesort')

# Within each permno, pull 'me' and 'meq' from the preceding row of that
# permno (the first row of every permno gets NaN). This is the previous
# available observation, not necessarily the previous calendar day.
lagged_me = crsp_me_df.groupby(['permno'])[['me', 'meq']].shift(periods = 1)
crsp_me_df['me_close_lag'] = lagged_me['me']
crsp_me_df['meq_close_lag'] = lagged_me['meq']

CPU times: user 10.1 s, sys: 12.4 s, total: 22.5 s
Wall time: 22.6 s


In [18]:
%%time
# Save
# Keep rows from 1995-01-01 onward (label slice on the date index), turn the
# index back into a regular column, and write the result to a feather file.
crsp_me_path = f'{main_folder}/data/keys/crsp_me.feather'
(
    crsp_me_df
    .loc['1995-01-01':]
    .reset_index()
    .to_feather(crsp_me_path)
)

CPU times: user 6.94 s, sys: 4.91 s, total: 11.8 s
Wall time: 8.2 s
