In [1]:
import wrds
import pandas as pd
from datetime import datetime
# import ray
# import os

# # Set environment variables for Modin and Ray
# os.environ["MODIN_ENGINE"] = "ray"
# os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"
# # See the Ray documentation for the Ray configuration best suited to your setup.
# ray.init()

def fetch_prices_for_dates(cusip_list, date_list):
    """
    Query comp_na_daily_all.secd for the given CUSIPs on the given dates.

    The query is run through the module-level ``db`` connection, which must be
    bound before this function is called.

    Parameters:
    cusip_list (iterable): CUSIP identifiers to match; null entries are ignored.
    date_list (iterable): Dates (anything pd.to_datetime accepts) to match;
        entries that convert to NaT are ignored.

    Returns:
    pd.DataFrame: Whatever ``db.raw_sql`` returns for the query, selecting the
        columns cusip, datadate, ajexdi, prccd, trfd and secstat. When either
        list is empty after nulls are removed, an empty DataFrame with those
        columns is returned and no query is sent.
    """
    print("Fetching")

    # Drop nulls (joining a None/NaN would raise TypeError) and double any single
    # quote so a value cannot terminate the SQL string literal it is placed in.
    cusips = [str(cusip).replace("'", "''") for cusip in cusip_list if pd.notna(cusip)]
    dates = pd.to_datetime(list(date_list)).dropna().strftime('%Y-%m-%d')

    # An empty IN list would be rendered as IN (''), and '' is not a valid date
    # literal, so skip the round trip when there is nothing to match.
    if not cusips or len(dates) == 0:
        return pd.DataFrame(
            columns=['cusip', 'datadate', 'ajexdi', 'prccd', 'trfd', 'secstat']
        )

    cusip_list_str = "', '".join(cusips)
    date_list_str = "', '".join(dates)

    query = f"""
        SELECT 
            cusip,
            datadate,
            ajexdi,
            prccd,
            trfd,
            secstat
        FROM 
            comp_na_daily_all.secd
        WHERE 
            cusip IN ('{cusip_list_str}') AND
            datadate IN ('{date_list_str}')
    """
    return db.raw_sql(query)

def chunks(lst, n):
    """Lazily split lst into consecutive slices of length n (the last may be shorter)."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
        
def first_non_na(row, columns):
    """
    Return the first value in row, scanning columns in order, that is not NA.

    Parameters:
    row: Mapping-like object indexed by column name (e.g. a pandas Series).
    columns (iterable): Column names to check, in priority order.

    Returns:
    The first value for which pd.notna is true, or None when every listed
    column is NA (or columns is empty).
    """
    return next((row[name] for name in columns if pd.notna(row[name])), None)

def downcast_numeric_columns(df):
    """
    Shrink the integer and float columns of a DataFrame to smaller dtypes.

    The frame passed in is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): Frame whose numeric columns should be downcast.

    Returns:
    pd.DataFrame: The same DataFrame object, with numeric columns downcast.
    """
    # (dtypes to select, pd.to_numeric downcast target) -- integers first, then floats.
    downcast_plan = (
        (['int', 'int64'], 'integer'),
        (['float', 'float64'], 'float'),
    )
    for selected_dtypes, target in downcast_plan:
        numeric_cols = df.select_dtypes(include=selected_dtypes).columns
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, downcast=target)

    return df


# Establish a connection to the WRDS database.
# fetch_prices_for_dates() reads this module-level `db` name rather than taking a
# connection argument, so `db` must be bound before that function is called.
# NOTE(review): the WRDS username is hard-coded here and again in the fetch loop
# of the next cell, which rebinds `db` without closing this connection first.
db = wrds.Connection(wrds_username='asherbaraban')

Loading library list...
Done


In [2]:
# Load the insider transactions and collect the CUSIPs and transaction dates to price.
insiders_df = pd.read_pickle("raw_insiders_pre_merge.pkl")
cusip_list = insiders_df['cusipi'].unique().tolist()
trandate_list = insiders_df['trandate'].unique().tolist()

# Generate the additional dates: six months after each transaction date, plus the
# one and two calendar days after that. The six-month offset is computed once and reused.
# NOTE(review): the +1d/+2d dates are presumably fallbacks for when the six-month
# date is not a trading day -- confirm.
trandate_6mo = insiders_df['trandate'] + pd.DateOffset(months=6)
trandate_6mo_list = trandate_6mo.unique().tolist()
trandate_6mo_1d_list = (trandate_6mo + pd.DateOffset(days=1)).unique().tolist()
trandate_6mo_2d_list = (trandate_6mo + pd.DateOffset(days=2)).unique().tolist()

# Combine all date lists, removing duplicates across the four lists
combined_date_list = list(set(trandate_list + trandate_6mo_list + trandate_6mo_1d_list + trandate_6mo_2d_list))
print(len(combined_date_list))
start_time = datetime.now()
print(start_time)
date_chunks = list(chunks(combined_date_list, 3000))

# Collect the per-chunk frames and concatenate once after the loop. Concatenating
# onto a growing frame inside the loop re-copies every accumulated row each time.
# If a chunk fails, the frames fetched so far remain available in chunk_frames.
chunk_frames = []

for i, date_chunk in enumerate(date_chunks):
    # A fresh connection is opened for every chunk and bound to the module-level
    # `db` name, which fetch_prices_for_dates() reads.
    db = wrds.Connection(wrds_username='asherbaraban')
    try:
        chunk_start_time = datetime.now()
        print(f"Processing chunk {i+1}/{len(date_chunks)}")

        chunk_prices_df = fetch_prices_for_dates(cusip_list, date_chunk)
        chunk_prices_df['datadate'] = pd.to_datetime(chunk_prices_df['datadate'])
        chunk_frames.append(chunk_prices_df)

        chunk_end_time = datetime.now()
        print(f"Chunk {i+1} processed in: {chunk_end_time - chunk_start_time}")
    finally:
        # Close the connection even when the query raises, so it is not leaked.
        db.close()

# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no date chunks at all.
final_prices_df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()

end_time = datetime.now()
print(f"Data fetched in: {end_time - start_time}")

# Verify the concatenated DataFrame
print(f"Total rows fetched: {len(final_prices_df)}")
print(final_prices_df.head())



14049
2024-09-09 11:52:08.449760
Loading library list...
Done
Processing chunk 1/5
Fetching
Chunk 1 processed in: 0:08:43.554274
Loading library list...
Done
Processing chunk 2/5
Fetching
Chunk 2 processed in: 0:07:08.980883
Loading library list...
Done
Processing chunk 3/5
Fetching
Chunk 3 processed in: 0:07:19.454416
Loading library list...
Done
Processing chunk 4/5
Fetching
Chunk 4 processed in: 0:08:35.530407
Loading library list...
Done
Processing chunk 5/5
Fetching
Chunk 5 processed in: 0:04:56.490418
Data fetched in: 0:36:55.030664
Total rows fetched: 69600435
       cusip   datadate     ajexdi   prccd      trfd secstat
0  000361105 1986-01-02   3.375000  26.125  1.049973       A
1  000781104 1986-01-02   6.000000  12.750  1.085128       I
2  000872309 1986-01-02   0.600000   2.688       NaN       I
3  000886309 1986-01-02  10.285711  21.250  1.000000       I
4  001030105 1986-01-02   1.000000  15.250       NaN       I


In [3]:
# Replace a missing total-return factor (trfd) with 1 so it leaves the price unchanged.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which warns in pandas 2.x and does
# not modify the frame at all once Copy-on-Write is enabled.
final_prices_df['trfd'] = final_prices_df['trfd'].fillna(1)
# final_prices_df.dropna(subset=['prccd'], inplace=True)
# Adjusted price = close price * total-return factor / adjustment factor.
# Rows with a missing prccd or ajexdi end up with a NaN price_adj (rows are not dropped).
final_prices_df['price_adj'] = (final_prices_df['prccd'] * final_prices_df['trfd']) / final_prices_df['ajexdi']
# The raw inputs are no longer needed once price_adj exists.
final_prices_df = final_prices_df.drop(columns=['prccd', 'trfd', 'ajexdi'])
# Store cusip as a categorical to reduce memory usage for the repeated identifier strings.
final_prices_df['cusip'] = final_prices_df['cusip'].astype('category')
final_prices_df.to_pickle("raw_prices.pkl")

In [4]:
# Show the column dtypes and memory footprint of the adjusted price frame.
final_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69600435 entries, 0 to 69600434
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   cusip      category      
 1   datadate   datetime64[ns]
 2   secstat    object        
 3   price_adj  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(1)
memory usage: 1.7+ GB


In [6]:
# List the distinct secstat codes present in the fetched data.
# NOTE(review): 'A'/'I' presumably mean active/inactive -- confirm against the Compustat docs.
final_prices_df['secstat'].unique()

array(['A', 'I'], dtype=object)

In [7]:
# Display the rows whose secstat code is 'I' (the notebook shows a truncated view).
final_prices_df[final_prices_df['secstat'] == 'I']

Unnamed: 0,cusip,datadate,secstat,price_adj
1,000781104,1986-01-02,I,2.305897
2,000872309,1986-01-02,I,4.480000
3,000886309,1986-01-02,I,2.065973
4,001030105,1986-01-02,I,15.250000
5,001058205,1986-01-02,I,844.000000
...,...,...,...,...
69597953,G2426E104,2024-08-09,I,11.600000
69598626,25271C201,2024-08-09,I,14.630000
69599148,887133205,2024-08-09,I,0.087500
69599149,887133205,2024-08-09,I,0.110000


In [8]:
# (rows, columns) of the adjusted price frame. No rows are dropped after the fetch,
# so the row count should equal the "Total rows fetched" figure printed earlier.
final_prices_df.shape

(69600435, 4)