In [16]:
# Configuration cell: every name below is read by a later cell of this notebook.

# Data file (CSV loaded with pd.read_csv)
path = './../data/price_data_2005-2022.csv'

# Exchanges to keep - 1. NYSE, 2. AMEX, 3. NASDAQ
# A PERMNO is kept when at least one of its rows has an EXCHCD in exchanges_to_keep.
filter_exchanges = True
exchanges_to_keep = [1, 3]

# Filter financial stocks
# When enabled, every PERMNO that ever has a SICCD whose text starts with '6' is removed.
filter_financial = False

# Fill forward zero prices
# fill_forward_prices: zero PRC values become NaN and are forward-filled within each PERMNO.
# fix_negative_prices: PRC is replaced by its absolute value.
fill_forward_prices = True
fix_negative_prices = True

# Substitute code 'B' with returns
# Rows with RET == 'B' get PRC / previous PRC (same PERMNO) - 1.
substitute_code_B = True
# Substitute code 'C' with returns
# Rows with RET == 'C' get PRC / previous PRC (same PERMNO) - 1.
substitute_code_C = True
# Drop code 'B' and 'C'
# Runs after the substitution cells, so it only removes codes that were not substituted.
drop_code_B_and_C = False

# Keep only stocks that have been below threshold
# A PERMNO is kept when its minimum PRC is strictly less than threshold.
keep_only_below_threshold = True
threshold = 6

# Remove missing SHROUT
# Rows whose SHROUT is NaN are dropped.
remove_missing_shrout = True

# Convert SHROUT from thousands
# SHROUT is multiplied by 1000.
convert_shrout_from_thousands = True


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read in data
# Loads the CSV at `path` (set in the configuration cell) with pandas' default parsing.
# NOTE(review): the saved cell output echoes this statement, which looks like a pandas
# warning raised by read_csv — possibly mixed column types, since RET is later compared
# with the strings 'B' and 'C'. Confirm before relying on column dtypes.
price_data = pd.read_csv(path)

  price_data = pd.read_csv(path)


In [4]:
# Show the frame exactly as loaded, before any of the filtering cells below have run.
display(price_data)

Unnamed: 0,PERMNO,date,EXCHCD,SICCD,TICKER,TRDSTAT,CUSIP,DLSTCD,DLRET,PRC,RET,SHROUT
0,10001,2005/01/03,3.0,4920,EWST,A,36720410,,,6.670,-0.033333,2599.0
1,10001,2005/01/04,3.0,4920,EWST,A,36720410,,,6.510,-0.023988,2599.0
2,10001,2005/01/05,3.0,4920,EWST,A,36720410,,,6.700,0.029186,2599.0
3,10001,2005/01/06,3.0,4920,EWST,A,36720410,,,6.510,-0.028358,2599.0
4,10001,2005/01/07,3.0,4920,EWST,A,36720410,,,6.546,0.005530,2599.0
...,...,...,...,...,...,...,...,...,...,...,...,...
33089264,93436,2022/12/23,3.0,9999.0,TSLA,A,88160R10,,,123.150,-0.017551,3157752.0
33089265,93436,2022/12/27,3.0,9999.0,TSLA,A,88160R10,,,109.100,-0.114089,3157752.0
33089266,93436,2022/12/28,3.0,9999.0,TSLA,A,88160R10,,,112.710,0.033089,3157752.0
33089267,93436,2022/12/29,3.0,9999.0,TSLA,A,88160R10,,,121.820,0.080827,3157752.0


In [5]:
# Distinct PERMNOs that have at least one row with RET == 'C'.
# The value is not shown: it is not the last expression of the cell.
price_data.loc[price_data['RET'] == 'C', 'PERMNO'].unique()

# All rows of PERMNO 10028, kept in `p` for ad-hoc inspection.
p = price_data.loc[price_data['PERMNO'] == 10028]

In [6]:
if filter_exchanges:
    # Keep a PERMNO when at least one of its rows was listed on an exchange in
    # exchanges_to_keep; all rows of such a PERMNO are retained.
    #
    # isin() is used instead of selecting columns from a value_counts().unstack()
    # table: column selection raises KeyError when a requested exchange code never
    # occurs in the data, whereas isin() simply yields False for it. Rows with a
    # missing EXCHCD count as "not on a kept exchange" in both formulations.
    on_kept_exchange = price_data['EXCHCD'].isin(exchanges_to_keep)

    # Boolean Series indexed by PERMNO: True if any row of that PERMNO matched.
    exchange_filter = on_kept_exchange.groupby(price_data['PERMNO']).any()

    # Filter data
    kept_permnos = exchange_filter.index[exchange_filter]
    price_data = price_data[price_data['PERMNO'].isin(kept_permnos)]

In [7]:
if filter_financial:
    # Remove every PERMNO that was ever classified with a SICCD whose text
    # representation starts with '6'.
    # NOTE(review): this is a plain string-prefix test on str(SICCD), so any
    # code whose text begins with '6' matches regardless of its length — confirm
    # this is the intended definition of "financial".
    sic_starts_with_6 = price_data['SICCD'].astype(str).str.startswith('6')

    # Boolean Series indexed by PERMNO: True if any row of that PERMNO matched.
    in_finance = sic_starts_with_6.groupby(price_data['PERMNO']).any()

    # Filter data
    finance_permnos = in_finance.index[in_finance]
    price_data = price_data.loc[~price_data['PERMNO'].isin(finance_permnos)]

In [8]:
if fill_forward_prices:
    # Treat a zero price as missing, then carry each PERMNO's last known price
    # forward. Leading zeros/NaNs of a PERMNO stay NaN (nothing to carry forward).
    #
    # groupby(...).ffill() returns a Series aligned to the original row index.
    # The earlier groupby(...).apply(lambda ...) form returns a result whose index
    # is prefixed with PERMNO on pandas 2.x (group_keys defaults to True), which
    # cannot be assigned back as a column; it also ran a Python lambda per group.
    prc_without_zeros = price_data['PRC'].replace(0, np.nan)
    filled_prc = prc_without_zeros.groupby(price_data['PERMNO']).ffill()

    # assign() builds a new frame (PRC keeps its column position) instead of
    # writing into what may be a filtered slice produced by an earlier cell.
    price_data = price_data.assign(PRC=filled_prc)

In [9]:
if fix_negative_prices:
    # Replace every price by its magnitude so that no negative PRC remains.
    price_data['PRC'] = price_data['PRC'].abs()

In [10]:
if substitute_code_B:
    # Rows whose RET holds the code 'B' get a return recomputed from prices:
    # PRC divided by the previous row's PRC of the same PERMNO, minus one.
    # The lagged price lives in a standalone Series, so no helper column has to
    # be added to (and dropped from) price_data.
    lagged_prc = price_data.groupby('PERMNO')['PRC'].shift(1)

    code_b = price_data['RET'] == 'B'
    # The first row of a PERMNO has no previous price, so its result is NaN.
    price_data.loc[code_b, 'RET'] = price_data.loc[code_b, 'PRC'] / lagged_prc[code_b] - 1

In [11]:
if substitute_code_C:
    # Rows whose RET holds the code 'C' get a return recomputed from prices:
    # PRC divided by the previous row's PRC of the same PERMNO, minus one.
    # The lagged price lives in a standalone Series, so no helper column has to
    # be added to (and dropped from) price_data.
    lagged_prc = price_data.groupby('PERMNO')['PRC'].shift(1)

    code_c = price_data['RET'] == 'C'
    # The first row of a PERMNO has no previous price, so its result is NaN.
    price_data.loc[code_c, 'RET'] = price_data.loc[code_c, 'PRC'] / lagged_prc[code_c] - 1

In [12]:
if drop_code_B_and_C:
    # Discard every row whose RET is still one of the literal codes 'B' or 'C'.
    code_b = price_data['RET'].eq('B')
    code_c = price_data['RET'].eq('C')

    # Keep a row only when it is neither code.
    price_data = price_data.loc[~code_b & ~code_c]

In [13]:
if keep_only_below_threshold:
    # A PERMNO qualifies when its lowest PRC over the whole sample is strictly
    # less than `threshold`; all rows of a qualifying PERMNO are kept.
    lowest_prc = price_data.groupby('PERMNO')['PRC'].min()
    below_threshold = lowest_prc.lt(threshold)

    # Filter data
    qualifying_permnos = below_threshold.index[below_threshold]
    price_data = price_data.loc[price_data['PERMNO'].isin(qualifying_permnos)]

In [21]:
# Report how many rows have no SHROUT before (optionally) removing them.
print(price_data['SHROUT'].isna().sum())

if remove_missing_shrout:
    # Keep only the rows where SHROUT is present.
    price_data = price_data.loc[price_data['SHROUT'].notna()]

3400


In [22]:
if convert_shrout_from_thousands:
    # Scale SHROUT by 1000 (per the configuration comment it is stored in thousands).
    # Re-running this cell on the same frame multiplies the column again.
    price_data['SHROUT'] = price_data['SHROUT'].mul(1000)

In [14]:
# Export data
# Writes the cleaned frame twice under ./../data: as a CSV without the row index
# (index=False) and as a pickle of the same DataFrame object.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column and no visible cell
# drops it, so it presumably ends up in both output files — confirm that is intended.
price_data.to_csv('./../data/price_data_cleaned.csv', index=False)
price_data.to_pickle('./../data/price_data_cleaned.pkl')