In [23]:
import pandas as pd
import os
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize
from scipy.stats import ttest_1samp
from scipy.stats import kurtosis, skew

In [35]:
# Directory holding the cleaned analyst-recommendation and CRSP Stata files
input_directory = '/work/pi_atreya_chakraborty_umb_edu/Captsone/Data'

# Full paths of the three input files
clean_analyst_path_weekly = os.path.join(input_directory, 'clean_analyst_weekly.dta')
clean_analyst_path_monthly = os.path.join(input_directory, 'clean_analyst_monthly.dta')
clean_crsp_path = os.path.join(input_directory, 'clean_crsp.dta')

# Read the datasets
results_df_weekly = pd.read_stata(clean_analyst_path_weekly)
results_df_monthly = pd.read_stata(clean_analyst_path_monthly)
crsp_data = pd.read_stata(clean_crsp_path)

# Collapse to one row per security / estimator / period.
# NOTE: groupby(...).first() takes the first non-null value of each column
# within a group, so a collapsed row can combine values from several raw rows.
results_df_weekly = results_df_weekly.groupby(['cusip', 'estimid', 'year', 'month', 'week']).first().reset_index()
results_df_monthly = results_df_monthly.groupby(['cusip', 'estimid', 'year', 'month']).first().reset_index()

# Convert 'ireccd' to numeric, forcing unparseable values to NaN.
# Each frame must be converted from its OWN column: assigning the monthly
# column to the weekly frame would align on the row index and overwrite the
# weekly recommendations with unrelated monthly values.
results_df_monthly['ireccd'] = pd.to_numeric(results_df_monthly['ireccd'], errors='coerce')
results_df_weekly['ireccd'] = pd.to_numeric(results_df_weekly['ireccd'], errors='coerce')

In [36]:
import pandas as pd
from scipy.stats import skew, kurtosis

# Reverse the recommendation scale into a new column: transformed = 6 - ireccd
results_df_monthly['ireccd_transformed'] = results_df_monthly['ireccd'].rsub(6)

# Summary-statistics helper used for both the per-industry and the overall tables
def get_descriptive_stats(group):
    """Return descriptive statistics of a numeric Series as a labelled Series.

    The result holds, in this order: mean, median, std, min, max, count,
    skewness, kurtosis, Q1 (25th percentile) and Q3 (75th percentile).
    Skewness and kurtosis come from scipy with ``nan_policy='omit'``, so
    missing values are ignored by those two statistics.
    """
    # Quartiles first, so the dictionary below can be filled in output order
    lower_quartile, upper_quartile = (group.quantile(q) for q in (0.25, 0.75))

    # Statistics that are plain no-argument Series methods
    summary = {
        stat_name: getattr(group, stat_name)()
        for stat_name in ('mean', 'median', 'std', 'min', 'max', 'count')
    }

    # Shape statistics from scipy
    summary['skewness'] = skew(group, nan_policy='omit')
    summary['kurtosis'] = kurtosis(group, nan_policy='omit')

    summary['Q1'] = lower_quartile  # First quartile (25th percentile)
    summary['Q3'] = upper_quartile  # Third quartile (75th percentile)
    return pd.Series(summary)

# One row of descriptive statistics per ff_17 industry
stats_long = results_df_monthly.groupby('ff_17')['ireccd_transformed'].apply(get_descriptive_stats)
descriptive_stats_by_ff_17 = stats_long.unstack()

# The same statistics computed over the whole sample, shaped as a one-row frame
overall_stats = get_descriptive_stats(results_df_monthly['ireccd_transformed'])
overall_stats_df = overall_stats.to_frame().T
overall_stats_df.index = ['Overall']

# Industry rows first, with the 'Overall' row appended at the bottom
final_stats_df = pd.concat([descriptive_stats_by_ff_17, overall_stats_df])

# Persist the combined table as CSV
final_stats_df.to_csv("Industry_Descriptive_with_Quartiles.csv")


In [37]:
# Preview the first rows of the per-industry descriptive-statistics table
final_stats_df.head()

Unnamed: 0,mean,median,std,min,max,count,skewness,kurtosis,Q1,Q3
Cars,3.620951,4.0,0.958353,1.0,5.0,52447.0,-0.143646,-0.350782,3.0,4.0
Chems,3.685024,4.0,0.960687,1.0,5.0,183871.0,-0.293608,-0.188068,3.0,4.0
Clths,3.701181,4.0,0.933321,1.0,5.0,31427.0,-0.169168,-0.366749,3.0,4.0
Cnstr,3.67658,4.0,0.951167,1.0,5.0,87119.0,-0.192823,-0.353264,3.0,4.0
Cnsum,3.740731,4.0,0.93659,1.0,5.0,11949.0,-0.202912,-0.48533,3.0,5.0


In [27]:
# Recode 'ireccd' in place (new = 6 - old) so that higher values imply a more
# favorable recommendation.
# NOTE: this overwrites the column, so running this cell a second time without
# reloading the data flips the scale back.
results_df_monthly['ireccd'] = results_df_monthly['ireccd'].rsub(6)

# Average the recoded recommendation per security-month and carry along the
# first available industry classification of each group.
_classification_cols = ['sic', 'ff_5', 'ff_10', 'ff_17', 'ff_48']
_monthly_rec_spec = {'avg_ireccd': ('ireccd', 'mean')}
_monthly_rec_spec.update({col: (col, 'first') for col in _classification_cols})

monthly_rec = (
    results_df_monthly
    .groupby(['cusip', 'year', 'month'])
    .agg(**_monthly_rec_spec)
    .reset_index()
)

In [28]:
# Build security-month measures from the daily CRSP file: first/last price,
# prior month's average capitalization, price dispersion, and a simple
# first-to-last price return.

def _winsorize_valid(values, limits=(0.01, 0.01)):
    """Winsorize the non-missing entries of a Series and return a float Series.

    Missing values are left as NaN and are excluded when the tail cut-offs are
    determined. Passing NaNs straight to scipy's ``winsorize`` lets them sort
    into the upper tail and distort the upper limit, and assigning its
    masked-array result to a DataFrame column can degrade the column dtype.
    """
    result = values.astype(float)
    valid = result.notna()
    if valid.any():
        result.loc[valid] = np.asarray(winsorize(result.loc[valid].to_numpy(), limits=limits))
    return result

# Chronological order within each security, so 'first'/'last' and the shifts
# below refer to the earliest/latest trading day.
crsp_data = crsp_data.sort_values(by=['cusip', 'dlycaldt'])

# Calendar-month key derived from the trading date
crsp_data['year_month'] = crsp_data['dlycaldt'].dt.to_period('M')

# Average daily capitalization of each security-month, repeated on every daily row
crsp_data['monthly_avg_cap'] = crsp_data.groupby(['cusip', 'year_month'])['dlycap'].transform('mean')

# Daily-level column kept for backward compatibility. This is a one-ROW shift,
# so only the first trading row of a month holds the previous month's average;
# the month-level lag used for analysis is computed on crsp_monthly below.
crsp_data['prior_month_avg_cap'] = crsp_data.groupby('cusip')['monthly_avg_cap'].shift()

# Aggregate to one row per security-month. The month's own average cap is
# stored temporarily under 'prior_month_avg_cap' and lagged right after, which
# keeps the column in its original position.
crsp_monthly = crsp_data.groupby(['cusip', 'year', 'month']).agg(
    beginning_price=('dlyprc', 'first'),
    ending_price=('dlyprc', 'last'),
    cusip9=('cusip9', 'first'),  # Retain the first 'cusip9' value in each group
    prior_month_avg_cap=('monthly_avg_cap', 'last'),  # Own-month average; lagged below
    month_std=('dlyprc', 'std')  # Standard deviation of daily prices within the month
).reset_index()

# Lag by one month-row within each security. Aggregating the row-shifted daily
# column with 'first' is not safe: 'first' skips NaN, so for a security's
# first month it would return the CURRENT month's average instead of missing.
# groupby sorts its keys, so rows are ordered by cusip, year, month here
# (assumes 'year' and 'month' are numeric - TODO confirm).
crsp_monthly['prior_month_avg_cap'] = crsp_monthly.groupby('cusip')['prior_month_avg_cap'].shift()

# Percentage change from the first to the last available price of the month
crsp_monthly['month_return'] = (crsp_monthly['ending_price'] - crsp_monthly['beginning_price']) / crsp_monthly['beginning_price'] * 100

# Winsorize 'month_return' and 'month_std' at the 1% and 99% levels (NaN-safe)
crsp_monthly['month_return_winsorized'] = _winsorize_valid(crsp_monthly['month_return'], limits=(0.01, 0.01))
crsp_monthly['month_std_winsorized'] = _winsorize_valid(crsp_monthly['month_std'], limits=(0.01, 0.01))


# Map a calendar year to one of the four sample periods
def assign_period(year):
    """Return the period number for ``year``, or None when it is out of range.

    Periods: 1 = 1992-1999, 2 = 2000-2009, 3 = 2010-2019, 4 = 2020-2024
    (bounds inclusive).
    """
    period_bounds = (
        (1, 1992, 1999),
        (2, 2000, 2009),
        (3, 2010, 2019),
        (4, 2020, 2024),
    )
    for period_id, first_year, last_year in period_bounds:
        if first_year <= year <= last_year:
            return period_id
    # Year falls outside every defined period
    return None

# Label every security-month with its sample period (missing when the year is out of range)
crsp_monthly['period'] = crsp_monthly['year'].map(assign_period)

In [29]:
# Attach CRSP return, volatility and size measures to the monthly recommendation
# averages. Left join: security-months without a CRSP match keep missing values.
_merge_keys = ['cusip', 'year', 'month']
_crsp_measures = [
    'month_return',
    'month_return_winsorized',
    'month_std',
    'month_std_winsorized',
    'cusip9',
    'prior_month_avg_cap',
]
rec_return = monthly_rec.merge(
    crsp_monthly[_merge_keys + _crsp_measures],
    on=_merge_keys,
    how='left',
)

In [30]:
# Paths of the monthly factor files, all located in the input directory
ff_momentum_path, ff_str_path, ff3_path, ff5_path, q_factor_path = (
    f"{input_directory}/{csv_name}"
    for csv_name in (
        "FF_Momentum_Monthly.csv",
        "FF_STR_Monthly.csv",
        "FF3_Monthly.csv",
        "FF5_Monthly.csv",
        "q_factor.csv",
    )
)

# Derive integer 'year' and 'month' columns from a combined year-month column
def add_year_month(df, col_name='Year-Month'):
    """Add 'year' and 'month' integer columns to ``df`` in place and return it.

    The values of ``col_name`` are rendered as text; the first four characters
    become 'year' and everything after them becomes 'month'.
    """
    pieces = (('year', slice(None, 4)), ('month', slice(4, None)))
    for target_col, char_span in pieces:
        df[target_col] = df[col_name].astype(str).str[char_span].astype(int)
    return df

# Read every factor file into its own DataFrame
ff_momentum, ff_str, ff3, ff5, q_factor = (
    pd.read_csv(csv_path)
    for csv_path in (ff_momentum_path, ff_str_path, ff3_path, ff5_path, q_factor_path)
)

# Derive 'year'/'month' merge keys for the four Fama-French files.
# q_factor is not passed through add_year_month, so it is expected to already
# contain 'year' and 'month' columns.
ff_momentum, ff_str, ff3, ff5 = (
    add_year_month(factor_table) for factor_table in (ff_momentum, ff_str, ff3, ff5)
)

# Fallback: rebuild the merge keys from a 'date' column if rec_return lacks them
has_merge_keys = {'year', 'month'}.issubset(rec_return.columns)
if not has_merge_keys:
    date_text = rec_return['date'].astype(str)
    rec_return['year'] = date_text.str[:4].astype(int)
    rec_return['month'] = date_text.str[5:7].astype(int)

# Left-join each factor table on year/month, in this order. The first two
# merges use pandas' default suffixes; the later ones keep existing column
# names unchanged and tag the incoming duplicates with the listed suffix.
factor_merge_plan = (
    (ff_momentum, ('_x', '_y')),
    (ff_str, ('_x', '_y')),
    (ff3, ('', '_ff3')),
    (ff5, ('', '_ff5')),
    (q_factor, ('', '_q')),
)
for factor_table, suffix_pair in factor_merge_plan:
    rec_return = rec_return.merge(
        factor_table, on=['year', 'month'], how='left', suffixes=suffix_pair
    )


In [33]:
# Preview the merged recommendation / return / factor table
rec_return.head()

Unnamed: 0,cusip,year,month,avg_ireccd,sic,ff_5,ff_10,ff_17,ff_48,month_return,...,HML_ff5,RMW,CMA,RF_ff5,R_F,R_MKT,R_ME,R_IA,R_ROE,R_EG
0,3605,2005,12,2.0,,Other,Other,Other,Other,,...,0.2,0.22,0.23,0.32,0.316,-0.2427,-0.0336,0.8804,0.2287,1.093
1,3605,2006,1,2.0,,Other,Other,Other,Other,,...,1.08,-0.65,-0.45,0.35,0.3488,3.0346,6.4413,-0.6797,-0.2611,0.8783
2,3605,2006,2,2.0,,Other,Other,Other,Other,,...,-0.34,-0.51,1.91,0.34,0.3322,-0.2924,-0.395,2.4381,-0.9978,-1.0257
3,3605,2006,3,2.0,,Other,Other,Other,Other,,...,0.6,0.06,-0.4,0.37,0.3642,1.4637,3.1221,-1.3435,0.7652,-0.7877
4,30710,2014,10,4.666667,6799.0,Other,Other,Finan,Meals,17.72973,...,-1.8,-0.56,-0.1,0.0,0.0012,2.5115,3.6151,0.5283,1.6899,-0.1556


In [34]:
# Force both winsorized columns to a numeric dtype; unparseable values become NaN
for winsorized_col in ('month_return_winsorized', 'month_std_winsorized'):
    rec_return[winsorized_col] = pd.to_numeric(rec_return[winsorized_col], errors='coerce')

# Export to Stata format. pandas renames column names that are not valid Stata
# variable names and reports the replacements in a warning.
rec_return.to_stata("rec_return.dta")

/tmp/ipykernel_1455484/2651743339.py:5: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    Year-Month_x   ->   Year_Month_x
    Mom      ->   Mom___
    Year-Month_y   ->   Year_Month_y
    Year-Month   ->   Year_Month
    Mkt-RF   ->   Mkt_RF
    Year-Month_ff5   ->   Year_Month_ff5
    Mkt-RF_ff5   ->   Mkt_RF_ff5

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  rec_return.to_stata("rec_return_monthly_wins.dta")
