## Load required datasets
1. `all_transactions_final`: output of `sec_data_merging.ipynb`
2. `all_beta_daily`: beta and return values from crsp data for all tickers 
3. `risk_free_rate_daily`: risk free rate from 2005 to 2021, downloaded from https://fred.stlouisfed.org/series/DTB3
    * This rate is the same for ALL tickers, joined on the transaction date
4. `unique_ticker_trans_8above`: unique list of tickers with more than 8 transactions from Form 4 merged data
    * This list is to create abnormal return calculations for each ticker, in order for computational efficiency
5. **Note that** `stock_price`: is the daily stock price for each ticker, loaded for each ticker in abnormal return calculation because the data files are stored as '<ticker>.csv'

In [None]:
import os
import glob
import math
import re
import pandas as pd
import numpy as np
from numba import njit
from dask import delayed, compute
from dask.distributed import Client
from dask.diagnostics import ProgressBar
from tqdm.notebook import tqdm

# -----------------------------
# 1. Setup and Data Loading
# -----------------------------

# Start a Dask client (using CPU threads)
client = Client()
# Explicitly set scheduler (threads) if needed
import dask
dask.config.set(scheduler='threads')

# Define data folder paths and global parameters
DATA_FOLDER = "."  # adjust as needed
STOCK_DATA_PATH = "daily_stock_data_by_ticker"
MAX_DAYS_BEFORE_TRANSACTION = 120
MAX_DAYS_AFTER_TRANSACTION = 120
DAYS_BEFORE_TRANSACTION = 6   # for CAR calculation (example windows)
DAYS_AFTER_TRANSACTION = 2
MIN_AR_VALUES = 3

def load_beta_data():
    """Load and concatenate split beta CSV files."""
    csv_files = glob.glob(os.path.join("daily_beta_split", "part_*.csv"))
    beta_dfs = []
    for file in csv_files:
        print(f"Processing beta file: {file}")
        df = pd.read_csv(file)
        df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')
        df.rename(columns={'RET': 'ret_beta'}, inplace=True)
        beta_dfs.append(df)
    if not beta_dfs:
        raise ValueError("No beta files found!")
    return pd.concat(beta_dfs, ignore_index=True)

def load_risk_free_rate():
    """Load risk free rate data."""
    rfr = pd.read_csv("risk_free_rate_daily.csv")
    rfr.rename(columns={'observation_date': 'date', 'DTB3': 'risk_free_rate'}, inplace=True)
    rfr['date'] = pd.to_datetime(rfr['date'], errors='coerce')
    return rfr

def load_transactions():
    """Load transactions data."""
    tx = pd.read_csv("all_transaction_final_filter_2.csv") 
    #tx = pd.read_csv("test.csv")
    tx['TRANS_DATE'] = pd.to_datetime(tx['TRANS_DATE'], errors='coerce')
    return tx

# Load global data
all_beta_daily = load_beta_data()
risk_free_rate_daily = load_risk_free_rate()
all_transactions_final = load_transactions()
print("Shape of all transactions", all_transactions_final.shape) # Expect (3546490, 25)

# -----------------------------
# 2. Ticker Cleaning
# -----------------------------
def clean_ticker(ticker):
    """
    Clean a ticker string:
      - Remove surrounding quotes and whitespace.
      - Remove trailing suffixes such as .OB or .PK.
      - Flag as weird if it contains digits, '??', or isn't all alphabets.
    Returns a tuple: (cleaned_ticker, is_weird)
    """
    if pd.isna(ticker) or not isinstance(ticker, str):
        return None, True
    cleaned = ticker.strip().strip('(').strip(')').strip('[').strip(']').strip('"').strip("'").strip()
    cleaned = (lambda s: (lambda parts: ''.join(a for a, b in zip(parts[0], parts[1]) if a == b) if len(parts) == 2 else s)(re.split(r'[,/; ]+', s)))(cleaned.split(':', 1)[1].strip() if ':' in cleaned else cleaned)
    cleaned = re.sub(r'\.(OB|PK)$', '', cleaned, flags=re.IGNORECASE)
    if re.search(r'\d', cleaned) or "??" in cleaned or not re.fullmatch(r'[A-Za-z]+', cleaned):
        return cleaned, True
    return cleaned, False

# Apply cleaning to the transactions ticker column
clean_results = all_transactions_final['ISSUERTRADINGSYMBOL'].apply(clean_ticker)
all_transactions_final['clean_ticker'] = clean_results.apply(lambda t: t[0])
all_transactions_final['is_weird_ticker'] = clean_results.apply(lambda t: t[1])

# Split transactions into clean and weird (you can log weird ones for review)
weird_transactions = all_transactions_final[all_transactions_final['is_weird_ticker']].copy()
clean_transactions = all_transactions_final[
    (~all_transactions_final['is_weird_ticker']) &
    (all_transactions_final['clean_ticker'].notna()) &
    (all_transactions_final['clean_ticker'] != '')
].copy()

unique_clean_tickers = clean_transactions['clean_ticker'].unique()

print("Total transactions:", all_transactions_final.shape[0])
print("Weird transactions:", weird_transactions.shape[0])
print("Clean transactions:", clean_transactions.shape[0])
print("Unique clean tickers count:", len(unique_clean_tickers))


Perhaps you already have a cluster running?
Hosting the HTTP server on port 53991 instead


Processing beta file: split_csv_parts/part_1.csv
Processing beta file: split_csv_parts/part_2.csv
Processing beta file: split_csv_parts/part_3.csv
Processing beta file: split_csv_parts/part_4.csv
Processing beta file: split_csv_parts/part_5.csv
Shape of all transactions (3546490, 25)
Total transactions: 3546490
Weird transactions: 25160
Clean transactions: 3521330
Unique clean tickers count: 18779


In [11]:
import os

unique_clean_tickers_found = []
tickers_not_found = []
for ticker in unique_clean_tickers:
    file_path = f"{STOCK_DATA_PATH}/{ticker}.csv"
    if os.path.exists(file_path):
        unique_clean_tickers_found.append(ticker)
    else:
        tickers_not_found.append(ticker)

print("Tickers not found: ", tickers_not_found)
print("Unique clean tickers with CRSP data found: ", len(unique_clean_tickers_found))


Tickers not found:  ['ISCO', 'DDOOD', 'VOIL', 'BRMH', 'VSYS', 'EFSI', 'MMAB', 'FROZ', 'ROKO', 'AGIN', 'KRED', 'GPCM', 'LBTY', 'AGTK', 'COTE', 'ERFB', 'MEEC', 'PSMH', 'SHPR', 'NONE', 'FLPC', 'BNGOF', 'HENC', 'CDEX', 'BKFG', 'SFDL', 'dcbf', 'ESWW', 'MINE', 'UDHI', 'FDNH', 'JAMN', 'TAUG', 'STCC', 'SWRL', 'NUIN', 'CNBX', 'BLGO', 'RDMP', 'SFEF', 'RIBS', 'RMMG', 'NJMC', 'NROM', 'none', 'CMGO', 'FARE', 'APVS', 'BKGM', 'FCRM', 'GTHP', 'AMNC', 'ALYI', 'SRGZ', 'INVU', 'NBRI', 'RBCL', 'PAYM', 'PHRX', 'HPTO', 'IBAL', 'DIXI', 'CYNX', 'EUSP', 'PFHO', 'ABCP', 'GLYE', 'USEL', 'DSNY', 'AMSS', 'EXGI', 'APHD', 'UVIC', 'SMDM', 'CHSCN', 'MDBX', 'SCRC', 'DJRT', 'CMXX', 'TURV', 'abcp', 'CYDY', 'ADMD', 'GRMC', 'IHRC', 'SVLT', 'TRUU', 'RUSH', 'XPWR', 'EFLO', 'MOG', 'IDAH', 'VYST', 'NVIC', 'PCYN', 'KTYB', 'IRNS', 'LCDX', 'TALN', 'CTIG', 'LSCG', 'BFGC', 'LWV', 'WELX', 'BMNM', 'GCMN', 'BICX', 'CSBB', 'prka', 'cnig', 'OSGIQ', 'PARF', 'GBLX', 'THNS', 'ECIG', 'FOHL', 'FMCB', 'LOGL', 'NXFI', 'PRLE', 'ELST', 'itnm', '

In [12]:
# -----------------------------
# 3. Helper Functions for Merging & Imputation
# -----------------------------
def categorical_impute(series):
    """Impute missing values by carrying forward if the surrounding values are identical."""
    s = series.copy()
    for i in range(1, len(s) - 1):
        if pd.isna(s.iloc[i]):
            prev_idx = i - 1
            next_idx = i + 1
            while next_idx < len(s) and pd.isna(s.iloc[next_idx]):
                next_idx += 1
            if prev_idx >= 0 and next_idx < len(s) and s.iloc[prev_idx] == s.iloc[next_idx]:
                s.iloc[i] = s.iloc[prev_idx]
    return s

def mean_impute_between_values(series):
    """Impute missing numeric values by taking the mean of the nearest non-NA values."""
    s = series.copy()
    for i in range(1, len(s) - 1):
        if pd.isna(s.iloc[i]):
            prev_idx = i - 1
            next_idx = i + 1
            while next_idx < len(s) and pd.isna(s.iloc[next_idx]):
                next_idx += 1
            if prev_idx >= 0 and next_idx < len(s) and not pd.isna(s.iloc[prev_idx]) and not pd.isna(s.iloc[next_idx]):
                s.iloc[i] = (s.iloc[prev_idx] + s.iloc[next_idx]) / 2
    return s

In [13]:
@delayed
def update_trans_ticker_and_create_ar_compare_optimized(ticker, permno, trans_df_permno, stock_price_df, 
                                                         beta_ticker_df, risk_free_rate_df, days_before, days_after, 
                                                         min_ar_values):
    """
    Build daily data (ticker_ar) for a given ticker & PERMNO, compute abnormal returns, and then compute 
    CAR over various windows using a vectorized approach.
    Returns (ticker_ar, trans_ticker_price).
    """
    import numpy as np
    import pandas as pd

    # Determine transaction date range
    start = pd.to_datetime(trans_df_permno['TRANS_DATE'].min()).normalize()
    end = pd.to_datetime(trans_df_permno['TRANS_DATE'].max()).normalize()
    start_new = start - pd.to_timedelta(days_before, unit='d')
    end_new = end + pd.to_timedelta(days_after, unit='d')

    # Create daily date DataFrame
    ticker_ar = pd.DataFrame({'date': pd.date_range(start=start_new, end=end_new)})
    
    # Merge in stock price, beta, and risk-free rate data (filtered by PERMNO)
    stock_price_filtered = stock_price_df[stock_price_df['PERMNO'] == permno]
    beta_ticker_filtered = beta_ticker_df[beta_ticker_df['PERMNO'] == permno]
    
    ticker_ar = ticker_ar.merge(
        stock_price_filtered[['date', 'TICKER', 'RET', 'RETX', 'VOL']].drop_duplicates(),
        on='date', how='left'
    )
    ticker_ar = ticker_ar.merge(
        beta_ticker_filtered[['DATE', 'PERMNO', 'ret_beta', 'alpha', 'b_mkt']],
        left_on='date', right_on='DATE', how='left'
    )
    ticker_ar = ticker_ar.merge(risk_free_rate_df, on='date', how='left')
    
    # Identification and conversion of returns
    ticker_ar['TICKER'] = ticker
    ticker_ar['PERMNO'] = permno

    # Convert percentage string to float
    ticker_ar['actual_ret'] = (
        pd.to_numeric(ticker_ar['ret_beta'].str.replace('%', ''), errors='coerce') / 100
    )

    # Impute missing values (assumes mean_impute_between_values is vectorized)
    ticker_ar['actual_ret'] = mean_impute_between_values(ticker_ar['actual_ret'])
    ticker_ar['b_mkt'] = mean_impute_between_values(ticker_ar['b_mkt'])
    ticker_ar['risk_free_rate'] = mean_impute_between_values(ticker_ar['risk_free_rate'])

    # Calculate expected and abnormal returns
    ticker_ar['expected_ret'] = ticker_ar['risk_free_rate'] + ticker_ar['b_mkt'] * (ticker_ar['actual_ret'] - ticker_ar['risk_free_rate'])
    ticker_ar['abnormal_ret'] = ticker_ar['actual_ret'] - ticker_ar['expected_ret']

    # Sort by date and compute cumulative abnormal returns
    ticker_ar = ticker_ar.sort_values('date').reset_index(drop=True)
    ticker_ar['cum_abnormal'] = ticker_ar['abnormal_ret'].cumsum()

    # Merge daily data with transaction data
    trans_ticker_price = trans_df_permno.merge(
        ticker_ar[['date', 'actual_ret', 'b_mkt', 'risk_free_rate', 'expected_ret', 'abnormal_ret']],
        left_on='TRANS_DATE', right_on='date', how='left'
    )

    # Prepare for vectorized CAR calculation:
    # Convert daily dates and cumulative abnormal returns to numpy arrays.
    daily_dates = ticker_ar['date'].values
    cum_abnormal = ticker_ar['cum_abnormal'].values

    # Helper function to compute CAR vectorized over all transaction dates.
    def vectorized_car(trans_dates, days_before, days_after, min_days):
        # Find indices for transaction dates in daily data
        positions = np.searchsorted(daily_dates, trans_dates)
        if days_after == 0: 
            start_idx = positions - days_before - 1  # start day of window
            end_idx   = positions                 # trans_date (window end)
        else:
            start_idx = positions - 1             # one day before trans_date
            end_idx   = positions + days_after    # end day of window
        
        # Ensure that both indices are within valid range
        end_idx = np.clip(end_idx, 0, len(cum_abnormal) - 1)
        start_idx = np.clip(start_idx, 0, len(cum_abnormal) - 1)

        # Calculate the window length as the difference in indices (this is a proxy for the number of days)
        window_length = end_idx - start_idx

        # Compute the cumulative abnormal return for the window
        car_n = cum_abnormal[end_idx] - cum_abnormal[start_idx]
        
        # If the window length is less than min_days, assign np.nan to that element.
        car_n = np.where(window_length < min_days, np.nan, car_n)
        return car_n

    # Convert transaction dates to numpy datetime64 array
    trans_dates = pd.to_datetime(trans_ticker_price['TRANS_DATE']).values

    # Define the different windows and their parameters:
    # Format: (days_before, days_after, min_days, column_name)
    window_params = [
        (5, 0, 4, 'CAR_5_before'),
        (0, 5, 4, 'CAR_5_after'),
        (30, 0, 25, 'CAR_30_before'),
        (0, 30, 25, 'CAR_30_after'),
        (60, 0, 45, 'CAR_60_before'),
        (0, 60, 45, 'CAR_60_after'),
        (120, 0, 100, 'CAR_120_before'),
        (0, 120, 100, 'CAR_120_after'),
    ]

    # Compute CAR for each window vectorized.
    for days_before_win, days_after_win, min_days, col_name in window_params:
        trans_ticker_price[col_name] = vectorized_car(trans_dates, days_before_win, days_after_win, min_days)

    return ticker_ar, trans_ticker_price


In [14]:
@delayed
def process_ticker(ticker, clean_transactions, stock_data_path, all_beta_daily, risk_free_rate_df,
                   days_before, days_after, min_ar_values):
    """
    Process a single ticker:
      1. Load its stock price data.
      2. Filter transactions.
      3. Merge transactions with stock data using merge_asof.
      4. Determine unique PERMNOs:
         - If one PERMNO, process normally.
         - If multiple, check if date ranges are disjoint:
             * If disjoint, process each PERMNO separately.
             * If overlapping, if the smallest group is ≤20% of total, use the dominant PERMNO; else, skip.
      5. Call update_trans_ticker_and_create_ar_compare.
    Returns a list of tuples (one per processed PERMNO).
    """
    try:
        stock_price = pd.read_csv(os.path.join(stock_data_path, f"{ticker}.csv"))
    except Exception as e:
        print(f"Ticker {ticker} price data not found: {e}")
        return []
    
    stock_price = stock_price.dropna(subset=['PRC'])
    stock_price['date'] = pd.to_datetime(stock_price['date'], errors='coerce')
    
    trans_ticker = clean_transactions[clean_transactions['ISSUERTRADINGSYMBOL'] == ticker]
    if trans_ticker.empty:
        print(f"No transactions found for ticker {ticker}")
        return []
    
    # Drop rows where TRANS_PRICEPERSHARE is null to avoid merge issues
    trans_ticker = trans_ticker.dropna(subset=['TRANS_PRICEPERSHARE'])
    
    # Filter stock price by transaction date range
    start = str(trans_ticker['TRANS_DATE'].min())[:10]
    end = str(trans_ticker['TRANS_DATE'].max())[:10]
    stock_price = stock_price[(stock_price['date'] >= start) & (stock_price['date'] <= end)]
    stock_price['PRC'] = stock_price['PRC'].abs()
    
    trans_ticker_price = pd.merge_asof(
        trans_ticker.sort_values('TRANS_PRICEPERSHARE'),
        stock_price[['PERMNO', 'date', 'VOL', 'PRC', 'RET', 'TICKER']].sort_values('PRC'),
        left_on='TRANS_PRICEPERSHARE', right_on='PRC',
        left_by='TRANS_DATE', right_by='date', direction='nearest'
    )
    
    # Impute missing PERMNO values
    trans_ticker_price['PERMNO'] = categorical_impute(trans_ticker_price['PERMNO'])
    
    # Set beta data for this ticker
    beta_ticker = all_beta_daily[all_beta_daily['TICKER'] == ticker]
    
    # Compute unique PERMNO counts
    permno_count_dict = trans_ticker_price['PERMNO'].value_counts().to_dict()
    permno_unique = len(permno_count_dict)
    
    # Log if multiple PERMNOs are found
    if permno_unique > 1:
        print(f"Ticker {ticker} has multiple PERMNOs: {permno_count_dict}")
    
    results = []
    if permno_unique == 1:
        permno = list(permno_count_dict.keys())[0]
        results.append(update_trans_ticker_and_create_ar_compare_optimized(
            ticker, permno,
            trans_ticker_price[trans_ticker_price['PERMNO'] == permno],
            stock_price, beta_ticker, risk_free_rate_df,
            days_before, days_after, min_ar_values
        ))
    elif permno_unique > 1:
        # Check if date ranges for each PERMNO are disjoint
        permno_ranges = trans_ticker_price.groupby('PERMNO')['TRANS_DATE'].agg(['min', 'max']).reset_index()
        sorted_ranges = permno_ranges.sort_values(by='min').reset_index(drop=True)
        is_disjoint = True
        for i in range(len(sorted_ranges) - 1):
            if sorted_ranges.loc[i, 'max'] >= sorted_ranges.loc[i + 1, 'min']:
                is_disjoint = False
                break
        if is_disjoint:
            for permno in permno_count_dict.keys():
                results.append(update_trans_ticker_and_create_ar_compare_optimized(
                    ticker, permno,
                    trans_ticker_price[trans_ticker_price['PERMNO'] == permno],
                    stock_price, beta_ticker, risk_free_rate_df,
                    days_before, days_after, min_ar_values
                ))
        else:
            permno_counts = trans_ticker_price['PERMNO'].value_counts()
            min_permno = permno_counts.idxmin()
            if permno_counts[min_permno] / len(trans_ticker_price) <= 0.20:
                max_permno = permno_counts.idxmax()
                results.append(update_trans_ticker_and_create_ar_compare_optimized(
                    ticker, max_permno,
                    trans_ticker_price[trans_ticker_price['PERMNO'] == max_permno],
                    stock_price, beta_ticker, risk_free_rate_df,
                    days_before, days_after, min_ar_values
                ))
            else:
                print(f"Ticker {ticker} has overlapping PERMNO date ranges exceeding threshold; skipping processing.")
                return []
    else:
        return []
    
    if not results:
        print(f"No results produced for ticker {ticker}")
    return results


In [15]:
import math
from dask import compute
from dask.diagnostics import ProgressBar
from tqdm.notebook import tqdm

# Set your desired batch size
BATCH_SIZE = 200
unique_ticker_list = list(unique_clean_tickers_found) # Replace from unique_clean_tickers
print(len(unique_ticker_list))
num_batches = math.ceil(len(unique_ticker_list) / BATCH_SIZE)

# This will hold all valid results across batches.
global_results = []

# Process tickers in batches
for batch_start in tqdm(range(0, len(unique_ticker_list), BATCH_SIZE),
                         total=num_batches, desc="Processing batches"):
    batch_tickers = unique_ticker_list[batch_start: batch_start + BATCH_SIZE]
    batch_delayed_tasks = []
    for ticker in batch_tickers:
        batch_delayed_tasks.append(
            process_ticker(
                ticker,
                clean_transactions,
                STOCK_DATA_PATH,
                all_beta_daily,
                risk_free_rate_daily,
                DAYS_BEFORE_TRANSACTION,
                DAYS_AFTER_TRANSACTION,
                MIN_AR_VALUES
            )
        )
    
    # Compute tasks for the current batch.
    with ProgressBar():
        batch_computed = compute(*batch_delayed_tasks)
    
    # Flatten the list of lists for this batch.
    batch_flattened = [
        item
        for sublist in batch_computed
        for item in (sublist if sublist is not None else [])
    ]
    
    # Check and print results for the batch.
    if not batch_flattened:
        print(f"Batch {batch_start // BATCH_SIZE + 1} produced no valid results.")
    else:
        print(f"Batch {batch_start // BATCH_SIZE + 1} produced {len(batch_flattened)} valid results.")
        global_results.extend(batch_flattened)

 # After processing all batches, check overall results, and concatenate dataframes
if not global_results:
    print("No valid results were produced overall.")

else:
    print("Processing complete. Total valid results:", len(global_results))
    
    print("--------- Start concatenating dataframes for all_trans_ar ---------")
    # Define partial metadeta for Dask DataFrame
    meta_all_trans_ar = pd.DataFrame({'trans_date': pd.Series(dtype='datetime64[ns]'),
                                      'RET': pd.Series(dtype='float64'),  # RET must be pre-defined to catch errors
                                      'PERMNO': pd.Series(dtype='int64'),
                                      'ticker': pd.Series(dtype='object'), 
                                      'PRC': pd.Series(dtype='float64') })

    # Create Dask Dataframe (alternative: pd.concat(dask.compute(*delayed_objs), axis=0))
    delayed_dfs = [res[1] for res in global_results if res is not None]
    #ddf = dd.from_map(lambda x: x.compute(), delayed_dfs, meta=meta_all_trans_ar) # meta is required 
    num_batches = math.ceil(len(delayed_dfs) / BATCH_SIZE)

    for i in range(num_batches):
        print(f"--------- Processing batch {i+1}/{num_batches}...")

        batch = delayed_dfs[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
        ddf_batch = dd.from_map(lambda x: x.compute(), batch, meta=meta_all_trans_ar)
        
        with ProgressBar():
            df_batch = ddf_batch.compute()
        
        output_path = os.path.join(f"trans_ar_batch_{i+1}.csv")
        df_batch.to_csv(output_path, index=False)
        print(f"Batch {i+1} written to {output_path} with shape {df_batch.shape}")
        
        '''print("--------- Start concatenating dataframes for all_ticker_ar_compare ---------")
        # Define partial metadeta for Dask DataFrame
        meta_all_ticker_ar_compare = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]'),
                                                'alpha': pd.Series(dtype='float64'),
                                                'b_mkt': pd.Series(dtype='float64')})
        
        # Create Dask Dataframe
        delayed_dfs = [res[0] for res in global_results if res is not None]
        ddf = dd.from_map(lambda x: x.compute(), delayed_dfs, meta=meta_all_ticker_ar_compare) # meta is required 
        with ProgressBar():
            all_ticker_ar_compare = ddf.compute()
        print("Shape of all_ticker_ar_compare", all_ticker_ar_compare.shape)'''

print("All batches processed.")
    


12298


Processing batches:   0%|          | 0/62 [00:00<?, ?it/s]

[                                        ] | 0% Completed | 121.54 ms



[                                        ] | 1% Completed | 11.79 smsTicker VRA has multiple PERMNOs: {12363.0: 1085, 80900.0: 17}
[#                                       ] | 4% Completed | 17.87 sTicker TWTR has multiple PERMNOs: {14295.0: 1661, 86298.0: 16}
[####                                    ] | 11% Completed | 33.48 sTicker APP has multiple PERMNOs: {20894: 137, 91136: 91}
[#####                                   ] | 13% Completed | 44.65 sTicker EXXI has multiple PERMNOs: {92215.0: 535, 16587.0: 7}
[#########                               ] | 22% Completed | 64.40 sTicker EXP has multiple PERMNOs: {80415.0: 374, 89983.0: 3}
[##########                              ] | 26% Completed | 74.07 sTicker GME has multiple PERMNOs: {89301.0: 322, 90379.0: 7}
[#############                           ] | 34% Completed | 93.24 sTicker MOVE has multiple PERMNOs: {87165.0: 428, 20848.0: 16}
[##################                      ] | 45% Completed | 118.19 sNo transactions found for tick



[##                                      ] | 6% Completed | 18.57 sTicker AAN has multiple PERMNOs: {10517.0: 286, 78049.0: 41, 20064.0: 12}
[########                                ] | 20% Completed | 51.68 sTicker PLL has multiple PERMNOs: {35051.0: 765, 21325.0: 20}
[############                            ] | 32% Completed | 80.68 sTicker ANGI has multiple PERMNOs: {16921.0: 489, 13106.0: 312}
[############                            ] | 32% Completed | 81.18 sTicker ANGI has overlapping PERMNO date ranges exceeding threshold; skipping processing.
[#################                       ] | 43% Completed | 108.79 sTicker LMCA has multiple PERMNOs: {13757.0: 394, 13008.0: 44}
[##################                      ] | 46% Completed | 118.40 sTicker OLED has multiple PERMNOs: {77610.0: 383, 90485.0: 6}
[#######################                 ] | 58% Completed | 148.27 sTicker WOOF has multiple PERMNOs: {89213.0: 635, 20341.0: 18}
[########################                ] | 62% C



[#                                       ] | 4% Completed | 15.82 smsNo transactions found for ticker FCE
[####                                    ] | 12% Completed | 31.71 sTicker VRX has multiple PERMNOs: {80307.0: 192, 68340.0: 159}
[#######                                 ] | 18% Completed | 47.78 sTicker BLUE has multiple PERMNOs: {13947.0: 1182, 91170.0: 38}
[##############                          ] | 35% Completed | 92.42 sTicker UBNK has multiple PERMNOs: {90824: 82, 90711: 75}
[###############                         ] | 38% Completed | 99.06 sTicker KAR has multiple PERMNOs: {93174.0: 434, 90197.0: 4}
[####################                    ] | 50% Completed | 132.05 sTicker GPOR has multiple PERMNOs: {91128: 116, 21219: 11}
[#########################               ] | 63% Completed | 163.05 sTicker PZG has multiple PERMNOs: {92254.0: 144, 15278.0: 89}
[##########################              ] | 66% Completed | 169.43 sTicker MAIN has multiple PERMNOs: {92309.0: 4996, 7638



[##                                      ] | 6% Completed | 17.88 smsTicker WM has multiple PERMNOs: {11955: 639, 81593: 1}
[###                                     ] | 8% Completed | 21.27 sTicker MRD has multiple PERMNOs: {14703.0: 135, 50666.0: 39}
[#####                                   ] | 14% Completed | 35.00 sTicker HVT has multiple PERMNOs: {10294.0: 276, 41217.0: 236}
Ticker HVT has overlapping PERMNO date ranges exceeding threshold; skipping processing.
[######                                  ] | 16% Completed | 40.71 sTicker PLD has multiple PERMNOs: {85592.0: 221, 80406.0: 80}
[#######                                 ] | 19% Completed | 49.19 sTicker SWI has multiple PERMNOs: {92931.0: 438, 18183.0: 97}
[#########                               ] | 24% Completed | 62.40 sTicker DOW has multiple PERMNOs: {20626.0: 253, 18428.0: 51}
[############                            ] | 31% Completed | 77.40 sTicker MTSI has multiple PERMNOs: {13326: 679, 11898: 12}
[################



[###                                     ] | 8% Completed | 22.77 smsTicker H has multiple PERMNOs: {93098.0: 1219, 91388.0: 114}
[###                                     ] | 9% Completed | 26.38 sTicker DDD has multiple PERMNOs: {78664.0: 1016, 90049.0: 137}
[#######                                 ] | 17% Completed | 45.18 sTicker CVT has multiple PERMNOs: {14063.0: 382, 72776.0: 6}
[#######                                 ] | 18% Completed | 48.60 sTicker PNK has multiple PERMNOs: {42140.0: 205, 16001.0: 124}
[########                                ] | 20% Completed | 53.42 sTicker FPI has multiple PERMNOs: {14566: 182, 91296: 4}
[###########                             ] | 28% Completed | 71.20 sTicker MATR has multiple PERMNOs: {87602.0: 456, 76110.0: 206}
[############                            ] | 32% Completed | 80.87 sTicker IT has multiple PERMNOs: {79698.0: 1638, 87030.0: 3}
[#############                           ] | 32% Completed | 83.03 sTicker MM has multiple PERMNOs:



[###                                     ] | 7% Completed | 20.76 smsTicker HCAC has multiple PERMNOs: {18558: 11, 14543: 2}
[###                                     ] | 9% Completed | 25.10 sTicker WEST has multiple PERMNOs: {81226.0: 71, 92295.0: 16}
[#####                                   ] | 14% Completed | 37.67 sTicker SR has multiple PERMNOs: {72494: 357, 12781: 28}
[#######                                 ] | 17% Completed | 43.65 sTicker FTD has multiple PERMNOs: {90554.0: 399, 14247.0: 31}
[#########                               ] | 23% Completed | 59.58 sTicker TSC has multiple PERMNOs: {13943: 262, 84911: 107}
[#########                               ] | 24% Completed | 60.81 sTicker SMRT has multiple PERMNOs: {77530.0: 370, 20599.0: 14}
[###########                             ] | 27% Completed | 69.62 sTicker DV has multiple PERMNOs: {76708.0: 2507, 20920.0: 17}
[############                            ] | 32% Completed | 80.52 sTicker TIVO has multiple PERMNOs: {87255:



[                                        ] | 1% Completed | 5.18 s msTicker BV has multiple PERMNOs: {13261.0: 319, 17867.0: 16}
[##                                      ] | 5% Completed | 16.12 sTicker IVZ has multiple PERMNOs: {19583.0: 421, 81910.0: 9}
[##                                      ] | 7% Completed | 18.88 sTicker KNX has multiple PERMNOs: {80987.0: 363, 12473.0: 110}
[###                                     ] | 7% Completed | 19.94 sTicker KNX has overlapping PERMNO date ranges exceeding threshold; skipping processing.
[####                                    ] | 10% Completed | 26.36 sTicker IR has multiple PERMNOs: {12431.0: 423, 16692.0: 47}
[#######################                 ] | 57% Completed | 147.15 sTicker FSL has multiple PERMNOs: {12787: 146, 90435: 26, 90251: 23}
[########################                ] | 61% Completed | 154.28 sTicker JCI has multiple PERMNOs: {42534.0: 267, 45356.0: 101}
[####################################    ] | 91% Completed | 226



[#                                       ] | 4% Completed | 15.07 smsTicker CNVR has multiple PERMNOs: {33690.0: 79, 88207.0: 11}Ticker BWC has multiple PERMNOs: {10220.0: 34, 90918.0: 14}

[#####                                   ] | 12% Completed | 34.53 sTicker ROSE has multiple PERMNOs: {91100.0: 173, 15970.0: 161}
[#####                                   ] | 13% Completed | 36.49 sTicker FUEL has multiple PERMNOs: {14164.0: 115, 84311.0: 54}
[######                                  ] | 15% Completed | 41.11 sTicker PTSX has multiple PERMNOs: {92223.0: 126, 84565.0: 53}
[######                                  ] | 16% Completed | 45.65 sTicker NE has multiple PERMNOs: {90537: 87, 21430: 26}
[############                            ] | 31% Completed | 80.48 sTicker DOC has multiple PERMNOs: {14036: 85, 87035: 4}
[#############                           ] | 34% Completed | 88.24 sTicker ACET has multiple PERMNOs: {10656: 247, 17272: 44}
[##################                      ] | 45



[#                                       ] | 4% Completed | 11.03 sTicker WSTC has multiple PERMNOs: {13830.0: 181, 84283.0: 147}
[###                                     ] | 8% Completed | 21.73 sTicker GIG has multiple PERMNOs: {13340: 60, 17245: 1}
[####                                    ] | 10% Completed | 26.62 sTicker DLPH has multiple PERMNOs: {13103: 308, 17040: 8}
[########                                ] | 20% Completed | 52.10 sTicker DD has multiple PERMNOs: {11703: 246, 16851: 9}
[#############                           ] | 34% Completed | 84.97 sTicker SWFT has multiple PERMNOs: {12473.0: 146, 76214.0: 10}
[##############                          ] | 36% Completed | 88.74 sTicker FAF has multiple PERMNOs: {93374.0: 121, 35124.0: 87}
Ticker FAF has overlapping PERMNO date ranges exceeding threshold; skipping processing.
[###############                         ] | 37% Completed | 93.20 sTicker WES has multiple PERMNOs: {92672.0: 146, 13738.0: 14, 10211.0: 12}
[##########



[###                                     ] | 8% Completed | 24.16 smsTicker CORI has multiple PERMNOs: {14594.0: 52, 88153.0: 28}
[#######                                 ] | 18% Completed | 50.03 sTicker MSG has multiple PERMNOs: {93233.0: 251, 15649.0: 77}
[###################                     ] | 49% Completed | 126.94 sTicker THRM has multiple PERMNOs: {91623: 199, 79248: 168}
[#####################                   ] | 54% Completed | 138.04 sTicker DMRC has multiple PERMNOs: {92808.0: 250, 87482.0: 48}
[##############################          ] | 76% Completed | 192.14 sTicker NTI has multiple PERMNOs: {13509: 43, 77488: 3}
[##############################          ] | 76% Completed | 194.28 sTicker KCG has multiple PERMNOs: {13998: 139, 87131: 47}
[################################        ] | 81% Completed | 207.99 sTicker RKT has multiple PERMNOs: {80361.0: 342, 19577.0: 11}
[###################################     ] | 89% Completed | 224.03 sTicker HCC has multiple PERMNOs: 



[####                                    ] | 10% Completed | 26.71 ssTicker Q has multiple PERMNOs: {13911: 155, 85032: 96}
[####                                    ] | 11% Completed | 29.45 sTicker LEA has multiple PERMNOs: {93101.0: 216, 80422.0: 56}
[######                                  ] | 15% Completed | 41.20 sTicker THRX has multiple PERMNOs: {90423: 414, 22240: 9}
Ticker TBIO has multiple PERMNOs: {88471.0: 140, 17899.0: 39}
[###########                             ] | 28% Completed | 72.90 sTicker UA has multiple PERMNOs: {90979.0: 1022, 15980.0: 39}
[###############                         ] | 38% Completed | 94.22 sTicker MWA has multiple PERMNOs: {91234.0: 512, 91639.0: 18}
[###############                         ] | 39% Completed | 96.60 sTicker CATS has multiple PERMNOs: {16670.0: 85, 79156.0: 30}
[#################                       ] | 43% Completed | 106.86 sTicker THOR has multiple PERMNOs: {76100: 311, 18321: 8}
[#######################                 ] | 57

2025-03-29 17:01:24,272 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/distributed/comm/tcp.py", line 226, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/distributed/worker.py", line 1269, in heartbeat
    response = await retry_operation(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/distributed/utils_comm.py", line 416, in retry_operation
    return await retry(
           ^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/distributed/utils_comm.py", line 395, in retry
    return await coro()
           ^^^^^

KeyboardInterrupt: 

In [3]:
import pandas as pd
import glob
import os

# Define the folder containing the CSV files
folder_path = "../transaction_ars"  #data_untracked/raw/abnormal_returns/transaction_ars

# Create a list of CSV file paths matching the pattern "trans_ar_batch_*.csv"
csv_files = glob.glob(os.path.join(folder_path, "trans_ar_batch_*.csv"))
print("Number of CSV files found:", len(csv_files))

# Use a generator expression instead of a list to reduce memory overhead
combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
print("Combined DataFrame shape:", combined_df.shape)

# Save the concatenated DataFrame as a new CSV file
output_file = "all_transactions_final_final.csv"

Number of CSV files found: 60


  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  combined_df = pd.concat((pd.re

Combined DataFrame shape: (3171001, 47)


In [5]:
print(combined_df.shape)
combined_df

(3171001, 47)


Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,...,expected_ret,abnormal_ret,CAR_5_before,CAR_5_after,CAR_30_before,CAR_30_after,CAR_60_before,CAR_60_after,CAR_120_before,CAR_120_after
0,2712880,0001181431-12-021750,"Common Stock, par value $0.001 per share",2012-04-02,,S,0,,4375833.0,19.00,...,,,,,,,,,,
1,2814539,0001140361-12-019790,"Common Stock, par value $0.001 per share",2012-04-02,,P,0,,500.0,19.00,...,,,,,,,,,,
2,2792629,0001140361-12-019786,"Common Stock, par value $0.001 per share",2012-04-02,,P,0,,1250.0,19.00,...,,,,,,,,,,
3,2413548,0001209191-13-000536,Common Stock,2012-12-31,,S,0,,6198.0,32.26,...,0.025951,0.002348,0.026987,0.054558,,0.413317,,1.023549,,1.659151
4,2288923,0001209191-14-010802,"Common Stock, par value $0.001 per share",2014-02-13,,P,0,,2500.0,35.50,...,0.045591,0.000005,0.002861,-0.001790,0.078698,-0.002513,0.200667,-0.088025,0.283658,-0.332968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3170996,1296509,0001325533-17-000015,Common Stock,2017-04-11,,S,0,,25235.0,11.50,...,,,,,,,,,,
3170997,1296508,0001325533-17-000015,Common Stock,2017-04-11,,S,0,,26100.0,11.50,...,,,,,,,,,,
3170998,1296507,0001325533-17-000015,Common Stock,2017-04-10,,S,0,,806.0,11.57,...,,,,,,,,,,
3170999,1296505,0001325533-17-000015,Common Stock,2017-04-10,,S,0,,1911.0,11.57,...,,,,,,,,,,


In [6]:
combined_df['CAR_5_before'].isna().sum()

193950

In [8]:
combined_df.to_csv(output_file, index=False)
print("Saved concatenated DataFrame to", output_file)

Saved concatenated DataFrame to all_transactions_final_final.csv


In [None]:
t = pd.read_csv(output_file)
t.shape #(3171001, 47)

  t = pd.read_csv(output_file)


TypeError: 'tuple' object is not callable

trans_ticker_price = df_batch[df_batch['ISSUERTRADINGSYMBOL'] == 'ESCA']
ticker_ar = all_ticker_ar_compare[(all_ticker_ar_compare['TICKER'] == 'ESCA')]

In [None]:
df_batch[df_batch['CAR_30_after'].isna()] #.sum()

Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,NUM_RPTOWNERCIK_;,RPTOWNERNAME_;,RPTOWNER_RELATIONSHIP_;,RPTOWNER_TITLE_#,clean_ticker,is_weird_ticker,PERMNO,date_x,VOL,PRC,RET,TICKER,date_y,actual_ret,b_mkt,risk_free_rate,expected_ret,abnormal_ret,CAR_5_before,CAR_5_after,CAR_30_before,CAR_30_after,CAR_60_before,CAR_60_after,CAR_120_before,CAR_120_after
787,2212256,0001225208-21-014176,Employee Stock Option (Right to Buy),2021-11-24,,M,0,,8903.0,0.00,D,0.0,D,,0.00,2021-11-29,2021-11-24,36104,,USB,1660311,1,,Officer,Vice Chair,USB,False,66157.0,2021-11-24,2862823.0,59.84,-0.003829,USB,2021-11-24,-0.003829,1.1425,0.06,-0.012925,0.009096,0.039199,,0.150918,,0.286097,,0.586275,
1628,5920282,0001225208-21-014176,"Common Stock, $0.01 par value",2021-11-24,,M,0,,8903.0,28.63,A,74232.0,D,,254892.89,2021-11-29,2021-11-24,36104,,USB,1660311,1,,Officer,Vice Chair,USB,False,66157.0,2021-11-24,2862823.0,59.84,-0.003829,USB,2021-11-24,-0.003829,1.1425,0.06,-0.012925,0.009096,0.039199,,0.150918,,0.286097,,0.586275,
8,7006,0000840489-21-000065,Options,2021-07-29,,M,0,,10000.0,0.00,D,0.0,D,,0.00,2021-07-30,2021-07-29,840489,"FIRSTCASH, INC",FCFS,1616549,1,Ramos Raul,Officer,SVP Latin American Operations,FCFS,False,76856.0,2021-07-29,68050.0,79.50,0.014807,FCFS,2021-07-29,0.014807,0.9006,0.06,0.019299,-0.004492,-0.028024,,-0.166381,,-0.280591,,-0.375024,
248,17208,0000840489-21-000065,Common Stock,2021-07-29,,M,0,,10000.0,38.00,A,54500.0,D,,380000.00,2021-07-30,2021-07-29,840489,"FIRSTCASH, INC",FCFS,1616549,1,Ramos Raul,Officer,SVP Latin American Operations,FCFS,False,76856.0,2021-07-29,68050.0,79.50,0.014807,FCFS,2021-07-29,0.014807,0.9006,0.06,0.019299,-0.004492,-0.028024,,-0.166381,,-0.280591,,-0.375024,
151,2894945,0001181431-12-060066,Common Stock,2012-11-19,,P,0,,50008.0,4.07,A,2341085.0,I,Held in Trust,203532.56,2012-11-21,2012-11-19,1102741,"STEC, INC.",STEC,1133785,1,MOSHAYEDI MANOUCH,DirectorOther,,STEC,False,88646.0,2012-11-19,745001.0,4.13,0.0,STEC,2012-11-19,0.000000,1.3792,0.09,-0.034128,0.034128,0.186745,0.120735,1.278663,,2.344930,,4.259942,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,5857638,0001437749-21-022061,Common Stock,2021-09-13,,S,0,,15000.0,11.00,D,9277722.0,I,By Acuitas,165000.00,2021-09-13,2021-09-09,1136174,"Ontrak, Inc.",OTRK,1797168;904534,2,"Acuitas Group Holdings, LLC;PEIZER TERREN S","TenPercentOwner;Director,Officer,TenPercentOwner",nan#Executive Chairman,OTRK,False,16670.0,2021-09-13,512187.0,11.11,0.012762,OTRK,2021-09-13,0.012762,0.5293,0.06,0.034997,-0.022235,-0.174246,-0.146051,0.055314,,0.705426,,1.556781,
30,5852974,0001437749-21-021729,Common Stock,2021-09-08,,S,0,,15000.0,11.12,D,9322722.0,I,By Acuitas,166800.00,2021-09-08,2021-09-03,1136174,"Ontrak, Inc.",OTRK,1797168;904534,2,"Acuitas Group Holdings, LLC;PEIZER TERREN S","TenPercentOwner;Director,Officer,TenPercentOwner",nan#Executive Chairman,OTRK,False,16670.0,2021-09-08,634442.0,11.22,-0.022648,OTRK,2021-09-08,-0.022648,0.5572,0.05,0.009521,-0.032169,-0.224643,-0.174246,0.337547,,1.002790,,1.689957,
31,5857637,0001437749-21-022061,Common Stock,2021-09-10,,S,0,,15000.0,11.14,D,9292722.0,I,By Acuitas,167100.00,2021-09-13,2021-09-09,1136174,"Ontrak, Inc.",OTRK,1797168;904534,2,"Acuitas Group Holdings, LLC;PEIZER TERREN S","TenPercentOwner;Director,Officer,TenPercentOwner",nan#Executive Chairman,OTRK,False,16670.0,2021-09-10,526840.0,10.97,-0.061591,OTRK,2021-09-10,-0.061591,0.5232,0.05,-0.008384,-0.053207,-0.217555,-0.201898,0.242730,,0.896584,,1.640489,
37,5857636,0001437749-21-022061,Common Stock,2021-09-09,,S,0,,15000.0,11.69,D,9307722.0,I,By Acuitas,175350.00,2021-09-13,2021-09-09,1136174,"Ontrak, Inc.",OTRK,1797168;904534,2,"Acuitas Group Holdings, LLC;PEIZER TERREN S","TenPercentOwner;Director,Officer,TenPercentOwner",nan#Executive Chairman,OTRK,False,16670.0,2021-09-09,420966.0,11.69,0.041889,OTRK,2021-09-09,0.041889,0.5284,0.04,0.040998,0.000891,-0.199567,-0.182224,0.323373,,0.979481,,1.710951,
