In [1]:
import pandas as pd
import os
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize
from scipy.stats import ttest_1samp
from scipy.stats import kurtosis, skew

In [2]:
# Define the input directory
input_directory = '/work/pi_atreya_chakraborty_umb_edu/Captsone/Data'

# Define the input file paths
clean_analyst_path_weekly = os.path.join(input_directory, 'clean_analyst_weekly.dta')
clean_analyst_path_monthly = os.path.join(input_directory, 'clean_analyst_monthly.dta')
clean_crsp_path = os.path.join(input_directory, 'clean_crsp.dta')

# Read the datasets
results_df_weekly = pd.read_stata(clean_analyst_path_weekly)
results_df_monthly = pd.read_stata(clean_analyst_path_monthly)
crsp_data = pd.read_stata(clean_crsp_path)

results_df_weekly = results_df_weekly.groupby(['cusip', 'estimid', 'year', 'month', 'week']).first().reset_index()
results_df_monthly = results_df_monthly.groupby(['cusip', 'estimid', 'year', 'month']).first().reset_index()

# Convert 'ireccd' to numeric, forcing errors to NaN (if any)
results_df_monthly['ireccd'] = pd.to_numeric(results_df_monthly['ireccd'], errors='coerce')

# Convert 'ireccd' to numeric, forcing errors to NaN (if any)
results_df_weekly['ireccd'] = pd.to_numeric(results_df_monthly['ireccd'], errors='coerce')

In [7]:
# Step 1: Group by estimid, year, month, and count unique firms (CUSIPs) per month
broker_month_count = results_df_monthly.groupby(['estimid', 'year', 'month'])['cusip'].nunique().reset_index(name='firm_count')

# Step 2: Check for at least 10 unique firms every month for 10 consecutive years
def has_10_firms_consecutive_months(data, num_years=10):
    # Sort by year and month
    data = data.sort_values(by=['year', 'month'])
    
    # Check for each rolling window of 120 consecutive months (10 years)
    for i in range(len(data) - num_years * 12 + 1):
        window = data.iloc[i:i + num_years * 12]
        
        # Check if there are at least 10 firms in every month of this window
        if all(window['firm_count'] >= 10):
            return True
    
    return False

# Step 3: Filter brokers with at least 10 firms every consecutive month for 10 years
brokers_with_10_years = broker_month_count.groupby('estimid').apply(lambda x: has_10_firms_consecutive_months(x, 10)).reset_index(name='qualifies')
brokers_with_10_years = brokers_with_10_years[brokers_with_10_years['qualifies']]  # Keep only brokers meeting the criteria
qualifying_brokers = brokers_with_10_years['estimid'].tolist()  # List of brokers that qualify


  brokers_with_10_years = broker_month_count.groupby('estimid').apply(lambda x: has_10_firms_consecutive_months(x, 10)).reset_index(name='qualifies')


In [8]:
print(qualifying_brokers)

['ADAMS', 'ARGUS', 'ATLANEQU', 'AVONDALE', 'BACHE', 'BAKER', 'BARRING', 'BEAR', 'BERN', 'BLAIR', 'BREAN', 'BRILEY', 'BTIG', 'BUCK', 'BURNS', 'CANACCOR', 'CANTORFZ', 'CAPELEUR', 'CLKA', 'CLUSA', 'CRAIG', 'CRUTTEN', 'DAVIDSON', 'EDWARDS', 'EVERCO', 'FAHN', 'FBOSTON', 'FIRSTALB', 'FRCLAYSC', 'FRIEDMAN', 'GABELLCO', 'GARTNER', 'GHUNTER', 'GKM', 'GOLDMAN', 'GUGGEL', 'HALLUM', 'HILLIARD', 'JANNEY', 'JEFFEREG', 'JOHNRICE', 'JOLSON', 'JPMORGAN', 'KAUFBRO', 'KEEFE', 'LADENBUR', 'LAWRENCE', 'LAZARD', 'LEERINK', 'LEGG', 'LEHMAN', 'LONGBOW', 'MACQUARI', 'MAXIM', 'MCDONALD', 'MCLEOD', 'MERRILL', 'MIDEST', 'MIZUSEC', 'MKEEGAN', 'MKMPARTN', 'MONTSEC', 'MORGAN', 'NEEDHAM', 'NORTHLAN', 'OLMSTEAD', 'OPPEN', 'PACCREST', 'PACGROW', 'PICKERIN', 'PIPER', 'POINT', 'RAYMOND', 'RBCDOMIN', 'ROBINSON', 'SANDLER', 'SCOAST', 'SCOTT', 'SIDOTI', 'STEPHENS', 'STERNE', 'STIFEL', 'SUMMCAP', 'SUSQUEH', 'TDSI', 'THEBENCH', 'UNTERBUR', 'VANKASPR', 'VIRGINIA', 'WEISEL', 'WHEAT', 'WMJM', 'WOLFE', 'WOODGUND']


In [5]:
# Step 4: Process `crsp_data` and merge it with `monthly_rec`
crsp_data = crsp_data.sort_values(by=['cusip', 'dlycaldt'])
crsp_data['year_month'] = crsp_data['dlycaldt'].dt.to_period('M')
crsp_data['monthly_avg_cap'] = crsp_data.groupby(['cusip', 'year_month'])['dlycap'].transform('mean')
crsp_data['prior_month_avg_cap'] = crsp_data.groupby('cusip')['monthly_avg_cap'].shift()

crsp_monthly = crsp_data.groupby(['cusip', 'year', 'month']).agg(
    beginning_price=('dlyprc', 'first'),
    ending_price=('dlyprc', 'last'),
    cusip9=('cusip9', 'first'),
    prior_month_avg_cap=('prior_month_avg_cap', 'first'),
    month_std=('dlyprc', 'std')
).reset_index()

crsp_monthly['month_return'] = (crsp_monthly['ending_price'] - crsp_monthly['beginning_price']) / crsp_monthly['beginning_price'] * 100
crsp_monthly['month_return_winsorized'] = winsorize(crsp_monthly['month_return'], limits=[0.01, 0.01])
crsp_monthly['month_std_winsorized'] = winsorize(crsp_monthly['month_std'], limits=[0.01, 0.01])

# Step 5: Assign periods based on the year
def assign_period(year):
    if 1992 <= year <= 1999:
        return 1
    elif 2000 <= year <= 2009:
        return 2
    elif 2010 <= year <= 2019:
        return 3
    elif 2020 <= year <= 2024:
        return 4
    else:
        return None

crsp_monthly['period'] = crsp_monthly['year'].apply(assign_period)

In [9]:
# Initialize an empty list to hold dataframes for each broker
broker_dfs = []

# Loop through each qualifying broker and run the analysis
for broker_id in qualifying_brokers:
    
    # Step 1: Filter results_df_monthly for the current broker
    broker_df = results_df_monthly[results_df_monthly['estimid'] == broker_id].copy()

    # Step 2: Recode 'ireccd' to imply more favorable recommendations
    broker_df['ireccd'] = 6 - broker_df['ireccd']

    # Step 3: Obtain the recommendations for each firm by the current broker (not the average)
    monthly_rec = broker_df.groupby(['cusip', 'year', 'month']).agg(
        avg_ireccd=('ireccd', 'first'),
        sic=('sic', 'first'),
        ff_5=('ff_5', 'first'),
        ff_10=('ff_10', 'first'),
        ff_17=('ff_17', 'first'),
        ff_48=('ff_48', 'first')
    ).reset_index()

    # Step 6: Merge `monthly_rec` with `crsp_monthly`
    rec_return = pd.merge(monthly_rec, crsp_monthly[['cusip', 'year', 'month', 'month_return', 'month_return_winsorized', 'month_std', 'month_std_winsorized', 'cusip9', 'prior_month_avg_cap']], 
                          on=['cusip', 'year', 'month'], 
                          how='left')

    # Step 7: Add the `broker` column (the `estimid` of the current broker)
    rec_return['broker'] = broker_id

    # Step 8: Append the dataframe to the list
    broker_dfs.append(rec_return)

# Step 9: Concatenate all broker dataframes into one final dataframe
final_df = pd.concat(broker_dfs, ignore_index=True)

# Step 10: Merge with additional datasets if needed and export
# File paths
ff_momentum_path = f"{input_directory}/FF_Momentum_Monthly.csv"
ff_str_path = f"{input_directory}/FF_STR_Monthly.csv"
ff3_path = f"{input_directory}/FF3_Monthly.csv"
ff5_path = f"{input_directory}/FF5_Monthly.csv"
q_factor_path = f"{input_directory}/q_factor.csv"

# Load additional datasets
ff_momentum = pd.read_csv(ff_momentum_path)
ff_str = pd.read_csv(ff_str_path)
ff3 = pd.read_csv(ff3_path)
ff5 = pd.read_csv(ff5_path)
q_factor = pd.read_csv(q_factor_path)

# Add year and month columns to each file
def add_year_month(df, col_name='Year-Month'):
    df['year'] = df[col_name].astype(str).str[:4].astype(int)
    df['month'] = df[col_name].astype(str).str[4:].astype(int)
    return df

ff_momentum = add_year_month(ff_momentum)
ff_str = add_year_month(ff_str)
ff3 = add_year_month(ff3)
ff5 = add_year_month(ff5)

# Merge the four datasets into final_df using lowercase year and month
final_df = pd.merge(final_df, ff_momentum, on=['year', 'month'], how='left')
final_df = pd.merge(final_df, ff_str, on=['year', 'month'], how='left')
final_df = pd.merge(final_df, ff3, on=['year', 'month'], how='left', suffixes=('', '_ff3'))
final_df = pd.merge(final_df, ff5, on=['year', 'month'], how='left', suffixes=('', '_ff5'))
final_df = pd.merge(final_df, q_factor, on=['year', 'month'], how='left', suffixes=('', '_q'))

# Export the final result
final_df.to_stata("rec_return_broker.dta")


/tmp/ipykernel_792221/3106360696.py:71: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    Year-Month_x   ->   Year_Month_x
    Mom      ->   Mom___
    Year-Month_y   ->   Year_Month_y
    Year-Month   ->   Year_Month
    Mkt-RF   ->   Mkt_RF
    Year-Month_ff5   ->   Year_Month_ff5
    Mkt-RF_ff5   ->   Mkt_RF_ff5

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  final_df.to_stata("rec_return_broker.dta")
