In [1]:
import pandas as pd
import os
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize
from scipy.stats import ttest_1samp


In [2]:
# Define the input directory
input_directory = '/work/pi_atreya_chakraborty_umb_edu/Captsone/Data'

# Define the input file paths
clean_analyst_path_weekly = os.path.join(input_directory, 'clean_analyst_weekly.dta')
clean_analyst_path_monthly = os.path.join(input_directory, 'clean_analyst_monthly.dta')
clean_crsp_path = os.path.join(input_directory, 'clean_crsp.dta')

# Read the datasets
results_df_weekly = pd.read_stata(clean_analyst_path_weekly)
results_df_monthly = pd.read_stata(clean_analyst_path_monthly)
crsp_data = pd.read_stata(clean_crsp_path)

results_df_weekly = results_df_weekly.groupby(['cusip', 'estimid', 'year', 'month', 'week']).first().reset_index()
results_df_monthly = results_df_monthly.groupby(['cusip', 'estimid', 'year', 'month']).first().reset_index()

In [3]:
# Convert 'ireccd' to numeric, forcing errors to NaN (if any)
results_df_monthly['ireccd'] = pd.to_numeric(results_df_monthly['ireccd'], errors='coerce')

# Now, recode 'ireccd' so that higher values imply a more favorable recommendation
results_df_monthly['ireccd'] = 6 - results_df_monthly['ireccd']

# After recoding, proceed with the calculation of the average 'ireccd' as described earlier
monthly_rec = results_df_monthly.groupby(['cusip', 'year', 'month']).agg(
    avg_ireccd=('ireccd', 'mean'),
    sic=('sic', 'first'),
    ff_5=('ff_5', 'first'),
    ff_10=('ff_10', 'first'),
    ff_17=('ff_17', 'first'),
    ff_48=('ff_48', 'first')
).reset_index()

In [4]:
# Sort stocks into tertile portfolios based on the average 'ireccd'
# Sort the crsp_data by 'cusip' and 'dlycaldt'
crsp_data = crsp_data.sort_values(by=['cusip', 'dlycaldt'])

# Create a 'year_month' column to group by year and month
crsp_data['year_month'] = crsp_data['dlycaldt'].dt.to_period('M')

# Calculate the average daily capitalization for each month
crsp_data['monthly_avg_cap'] = crsp_data.groupby(['cusip', 'year_month'])['dlycap'].transform('mean')

# Shift the monthly average capitalization by one period (one month) within each 'cusip'
crsp_data['prior_month_avg_cap'] = crsp_data.groupby('cusip')['monthly_avg_cap'].shift()

# Aggregate data to get the beginning and ending prices, prior month's average capitalization, and standard deviation
crsp_monthly = crsp_data.groupby(['cusip', 'year', 'month']).agg(
    beginning_price=('dlyprc', 'first'),
    ending_price=('dlyprc', 'last'),
    cusip9=('cusip9', 'first'),  # Retain the first 'cusip9' value in each group
    prior_month_avg_cap=('prior_month_avg_cap', 'first'),  # Use the prior month's average 'dlycap'
    month_std=('dlyprc', 'std')  # Calculate standard deviation for the month
).reset_index()

# Calculate the monthly return
crsp_monthly['month_return'] = (crsp_monthly['ending_price'] - crsp_monthly['beginning_price']) / crsp_monthly['beginning_price'] * 100

# Winsorize the 'month_return' and 'month_std' columns at the 1% and 99% levels
crsp_monthly['month_return_winsorized'] = winsorize(crsp_monthly['month_return'], limits=[0.01, 0.01])
crsp_monthly['month_std_winsorized'] = winsorize(crsp_monthly['month_std'], limits=[0.01, 0.01])

# Define the function to assign periods based on the year
def assign_period(year):
    if 1992 <= year <= 1999:
        return 1
    elif 2000 <= year <= 2009:
        return 2
    elif 2010 <= year <= 2019:
        return 3
    elif 2020 <= year <= 2024:
        return 4
    else:
        return None  # or another value indicating out of range

# Apply the function to create the 'period' column
crsp_monthly['period'] = crsp_monthly['year'].apply(assign_period)

In [5]:
# Merge monthly_rec with crsp_monthly based on 'cusip', 'year', and 'month'
rec_return = pd.merge(monthly_rec, crsp_monthly[['cusip', 'year', 'month', 'month_return', 'month_return_winsorized', 'month_std', 'month_std_winsorized', 'cusip9', 'prior_month_avg_cap']], 
                    on=['cusip', 'year', 'month'], 
                    how='left')

In [6]:
# File paths
ff_momentum_path = f"{input_directory}/FF_Momentum_Monthly.csv"
ff_str_path = f"{input_directory}/FF_STR_Monthly.csv"
ff3_path = f"{input_directory}/FF3_Monthly.csv"
ff5_path = f"{input_directory}/FF5_Monthly.csv"
q_factor_path = f"{input_directory}/q_factor.csv"

# Function to split Year-Month into separate year and month columns
def add_year_month(df, col_name='Year-Month'):
    df['year'] = df[col_name].astype(str).str[:4].astype(int)
    df['month'] = df[col_name].astype(str).str[4:].astype(int)
    return df

# Load the datasets
ff_momentum = pd.read_csv(ff_momentum_path)
ff_str = pd.read_csv(ff_str_path)
ff3 = pd.read_csv(ff3_path)
ff5 = pd.read_csv(ff5_path)
q_factor = pd.read_csv(q_factor_path)

# Add year and month columns to each file
ff_momentum = add_year_month(ff_momentum)
ff_str = add_year_month(ff_str)
ff3 = add_year_month(ff3)
ff5 = add_year_month(ff5)

# Ensure rec_return dataframe has year and month columns
if 'year' not in rec_return.columns or 'month' not in rec_return.columns:
    rec_return['year'] = rec_return['date'].astype(str).str[:4].astype(int)
    rec_return['month'] = rec_return['date'].astype(str).str[5:7].astype(int)

# Merge the four datasets into rec_return using lowercase year and month
rec_return = pd.merge(rec_return, ff_momentum, on=['year', 'month'], how='left')
rec_return = pd.merge(rec_return, ff_str, on=['year', 'month'], how='left')

# Merge FF3_Monthly with suffix for overlapping columns
rec_return = pd.merge(rec_return, ff3, on=['year', 'month'], how='left', suffixes=('', '_ff3'))

# Merge FF5_Monthly with suffix for overlapping columns
rec_return = pd.merge(rec_return, ff5, on=['year', 'month'], how='left', suffixes=('', '_ff5'))

# Merge q_factor_monthly with suffix for overlapping columns
rec_return = pd.merge(rec_return, q_factor, on=['year', 'month'], how='left', suffixes=('', '_q'))


In [7]:
rec_return.columns

Index(['cusip', 'year', 'month', 'avg_ireccd', 'sic', 'ff_5', 'ff_10', 'ff_17',
       'ff_48', 'month_return', 'month_return_winsorized', 'month_std',
       'month_std_winsorized', 'cusip9', 'prior_month_avg_cap', 'Year-Month_x',
       'Mom   ', 'Year-Month_y', 'ST_Rev', 'Year-Month', 'Mkt-RF', 'SMB',
       'HML', 'RF', 'Year-Month_ff5', 'Mkt-RF_ff5', 'SMB_ff5', 'HML_ff5',
       'RMW', 'CMA', 'RF_ff5', 'R_F', 'R_MKT', 'R_ME', 'R_IA', 'R_ROE',
       'R_EG'],
      dtype='object')

In [8]:
rec_return.to_stata("rec_return.dta")

/tmp/ipykernel_877356/322917081.py:1: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    Year-Month_x   ->   Year_Month_x
    Mom      ->   Mom___
    Year-Month_y   ->   Year_Month_y
    Year-Month   ->   Year_Month
    Mkt-RF   ->   Mkt_RF
    Year-Month_ff5   ->   Year_Month_ff5
    Mkt-RF_ff5   ->   Mkt_RF_ff5

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  rec_return.to_stata("rec_return.dta")
