In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, ttest_1samp
import matplotlib.pyplot as plt
import warnings
from scipy.interpolate import PchipInterpolator
warnings.filterwarnings("ignore")

In [2]:
data_path = '../data/monthly/monthly.csv'
monthly_raw_data = pd.read_csv(data_path)

# 将 RETX、ewretd 转为数值型
monthly_raw_data['RETX'] = pd.to_numeric(monthly_raw_data['RETX'], errors='coerce')
monthly_raw_data['ewretd'] = pd.to_numeric(monthly_raw_data['ewretd'], errors='coerce')

# 用 (RETX - ewretd) 得到“超额收益”
monthly_raw_data['RETX'] = monthly_raw_data['RETX'] - monthly_raw_data['ewretd']

# 去除 NaN 以及RETX为C或者B的行
cleaned_data = monthly_raw_data.dropna(subset=['RETX'])
shape_before = monthly_raw_data.shape
cleaned_data = cleaned_data[~cleaned_data['RETX'].isin(['B','C'])]
shape_after = cleaned_data.shape
print(f"RETX为B或C的行有 {shape_before[0] - shape_after[0]} 行，已经被删除。")

# 转换日期
cleaned_data['date'] = pd.to_datetime(cleaned_data['date'])

# 保证每支股票 >= 85 个月的数据
stocks_with_enough_data = cleaned_data.groupby('PERMNO').filter(lambda x: len(x) >= 85)

# 按 (PERMNO, date) 排序
stocks_with_enough_data = stocks_with_enough_data.sort_values(['PERMNO','date']).reset_index(drop=True)
print(f"符合条件的股票数量: {stocks_with_enough_data['PERMNO'].nunique()}")

market_data = cleaned_data.groupby('date')['ewretd'].mean().reset_index()
#计算累计收益,累加计算不用累乘
market_data['cumulative_ewretd'] = market_data['ewretd'].cumsum()

RETX为B或C的行有 58583 行，已经被删除。
符合条件的股票数量: 5191


In [3]:
def calculate_cumulative_excess_returns(group):
    """
    对每只股票按日期顺序,按原文应为加和而非乘积
    """
    # group 已按日期排序
    group['cum_excess_return'] = group['RETX'].rolling(window=12, min_periods=12).sum()
    return group

stocks_with_enough_data = (
    stocks_with_enough_data
    .groupby('PERMNO', group_keys=False)
    .apply(calculate_cumulative_excess_returns)
)

In [3]:
import pandas as pd

# 定义起始和结束日期
start_date = pd.Timestamp('1931-01-01')
end_date = pd.Timestamp('1983-01-01')

# 49个1年的周期 (每年1月形成新的组合)
one_year_periods = pd.date_range(start=start_date, end=end_date, freq='AS')[:49]

# 10个5年的周期（每5年形成一次）
five_year_periods = [start_date + pd.DateOffset(years=5 * i) for i in range(10)]

# 16个3年的周期（每3年形成一次）
three_year_periods = [start_date + pd.DateOffset(years=3 * i) for i in range(16)]

# 24个2年的周期（1933年到1979年，只包含奇数年份）
two_year_odd_periods = [pd.Timestamp(f"{1933 + 2 * i}-01-01") for i in range(24)]

# 25个2年的周期（1932年到1980年，只包含偶数年份）
two_year_even_periods = [pd.Timestamp(f"{1932 + 2 * i}-01-01") for i in range(25)]

# 24个2年的周期 (deciles, 奇数年份)
two_year_deciles_odd_periods = [pd.Timestamp(f"{1933 + 2 * i}-01-01") for i in range(24)]

# 25个2年的周期 (deciles, 偶数年份)
two_year_deciles_even_periods = [pd.Timestamp(f"{1932 + 2 * i}-01-01") for i in range(25)]

# 输出结果检查
print("One-Year Periods:", one_year_periods)
print("\nFive-Year Periods:", five_year_periods)
print("\nThree-Year Periods:", three_year_periods)
print("\nTwo-Year Odd Periods:", two_year_odd_periods)
print("\nTwo-Year Even Periods:", two_year_even_periods)
print("\nTwo-Year Deciles Odd Periods:", two_year_deciles_odd_periods)
print("\nTwo-Year Deciles Even Periods:", two_year_deciles_even_periods)

# 合并所有的formation periods列表
all_formation_periods = (
    one_year_periods.tolist() +
    five_year_periods +
    three_year_periods +
    two_year_odd_periods +
    two_year_even_periods +
    two_year_deciles_odd_periods +
    two_year_deciles_even_periods
)

# 将所有日期排序
all_formation_periods = sorted(all_formation_periods)

# 打印合并后的formation periods
print("\nAll Formation Periods (Sorted):")
print(all_formation_periods)


One-Year Periods: DatetimeIndex(['1931-01-01', '1932-01-01', '1933-01-01', '1934-01-01',
               '1935-01-01', '1936-01-01', '1937-01-01', '1938-01-01',
               '1939-01-01', '1940-01-01', '1941-01-01', '1942-01-01',
               '1943-01-01', '1944-01-01', '1945-01-01', '1946-01-01',
               '1947-01-01', '1948-01-01', '1949-01-01', '1950-01-01',
               '1951-01-01', '1952-01-01', '1953-01-01', '1954-01-01',
               '1955-01-01', '1956-01-01', '1957-01-01', '1958-01-01',
               '1959-01-01', '1960-01-01', '1961-01-01', '1962-01-01',
               '1963-01-01', '1964-01-01', '1965-01-01', '1966-01-01',
               '1967-01-01', '1968-01-01', '1969-01-01', '1970-01-01',
               '1971-01-01', '1972-01-01', '1973-01-01', '1974-01-01',
               '1975-01-01', '1976-01-01', '1977-01-01', '1978-01-01',
               '1979-01-01'],
              dtype='datetime64[ns]', freq='AS-JAN')

Five-Year Periods: [Timestamp('1931-01-01 00:0

In [18]:
# 定义formation periods
five_year_periods = [pd.Timestamp(f"{1930 + 5 * i}-01-01") for i in range(10)]
three_year_periods = [pd.Timestamp(f"{1930 + 3 * i}-01-01") for i in range(16)]
two_year_odd_periods_35 = [pd.Timestamp(f"{1933 + 2 * i}-01-01") for i in range(24)]
two_year_even_periods_35 = [pd.Timestamp(f"{1932 + 2 * i}-01-01") for i in range(25)]
two_year_odd_periods_82 = [pd.Timestamp(f"{1933 + 2 * i}-01-01") for i in range(24)]
two_year_even_periods_82 = [pd.Timestamp(f"{1932 + 2 * i}-01-01") for i in range(25)]
one_year_periods = [pd.Timestamp(f"{1931 + i}-01-01") for i in range(49)]

# 将所有formation periods合并
portfolio_starts = {
    "10 five-year periods": (five_year_periods, 50),
    "16 three-year periods": (three_year_periods, 35),
    "24 two-year odd periods (35 stocks)": (two_year_odd_periods_35, 35),
    "25 two-year even periods (35 stocks)": (two_year_even_periods_35, 35),
    "24 two-year odd periods (82 stocks)": (two_year_odd_periods_82, 82),
    "25 two-year even periods (82 stocks)": (two_year_even_periods_82, 82),
    "49 one-year periods": (one_year_periods, 35)
}

# 要计算T统计量的月份
selected_months = [1, 12, 13, 18, 24, 25, 36, 60]

# 存储所有结果
all_results = []

# 遍历不同的formation periods
for period_name, (periods, num_stocks) in portfolio_starts.items():
    all_periods = []
    for start_dt in periods:
        # 计算formation date和lookback区间
        formation_date = start_dt - pd.offsets.MonthEnd(1)
        
        # 确定formation period长度
        if "five-year" in period_name:
            lookback_start = formation_date - pd.DateOffset(months=59)
        elif "three-year" in period_name:
            lookback_start = formation_date - pd.DateOffset(months=35)
        elif "two-year" in period_name:
            lookback_start = formation_date - pd.DateOffset(months=23)
        else:  # one-year periods
            lookback_start = formation_date - pd.DateOffset(months=11)

        lookback_end = formation_date

        # 提取formation period的数据
        lookback_df = stocks_with_enough_data[
            (stocks_with_enough_data['date'] >= lookback_start) &
            (stocks_with_enough_data['date'] <= lookback_end)
        ].copy()
        if lookback_df.empty:
            continue

        # 计算每只股票的累积收益
        sub = lookback_df[lookback_df['date'] <= formation_date]
        portfolio_data = sub.groupby('PERMNO', group_keys=False).tail(1).copy()

        # 根据cum_excess_return排序选取赢家和输家
        portfolio_data = portfolio_data.sort_values('cum_excess_return', ascending=False)
        winner_ids = portfolio_data.head(num_stocks)['PERMNO']
        loser_ids = portfolio_data.tail(num_stocks)['PERMNO']

        # 设置持有期 (60个月)
        hold_start = start_dt
        hold_end = start_dt + pd.DateOffset(months=60) - pd.Timedelta(days=1)

        # 提取赢家和输家组合的持有期数据
        w_hold = stocks_with_enough_data[
            (stocks_with_enough_data['PERMNO'].isin(winner_ids)) &
            (stocks_with_enough_data['date'] >= hold_start) &
            (stocks_with_enough_data['date'] <= hold_end)
        ].copy()

        l_hold = stocks_with_enough_data[
            (stocks_with_enough_data['PERMNO'].isin(loser_ids)) &
            (stocks_with_enough_data['date'] >= hold_start) &
            (stocks_with_enough_data['date'] <= hold_end)
        ].copy()

        # 计算平均月收益
        w_monthly = w_hold.groupby('date')['RETX'].mean().reset_index(name='avg_u_w')
        l_monthly = l_hold.groupby('date')['RETX'].mean().reset_index(name='avg_u_l')

        # 合并赢家和输家数据
        merged = pd.merge(w_monthly, l_monthly, on='date', how='inner')
        merged['CAR_w'] = merged['avg_u_w'].cumsum()
        merged['CAR_l'] = merged['avg_u_l'].cumsum()
        merged['test_period_start'] = start_dt

        all_periods.append(merged)

    # 合并所有测试期的数据
    all_periods_df = pd.concat(all_periods, ignore_index=True)

    # 计算Winner, Loser, Difference, T-Statistic
    winner_portfolio = all_periods_df['CAR_w'].mean()
    loser_portfolio = all_periods_df['CAR_l'].mean()
    difference_car = loser_portfolio - winner_portfolio

    # 计算多个时间点的T统计量
    t_statistics = {}
    for t_ in selected_months:
        sub = all_periods_df[all_periods_df.index < t_]
        w_vals = sub['CAR_w'].dropna()
        l_vals = sub['CAR_l'].dropna()
        if len(w_vals) > 1 and len(l_vals) > 1:
            t_stat, _ = ttest_ind(w_vals, l_vals, nan_policy='omit')
        else:
            t_stat = np.nan
        t_statistics[f"T-Statistic (Month {t_})"] = t_stat

    # 将结果存储到表格中
    final_table = {
        "Length of Formation Period": period_name,
        "Average No. of Stocks": num_stocks,
        "Winner Portfolio": winner_portfolio,
        "Loser Portfolio": loser_portfolio,
        "Difference in CAR": difference_car,
        **t_statistics
    }
    all_results.append(final_table)

# 将结果转换为DataFrame
final_df = pd.DataFrame(all_results)

# 展示表格
print(final_df.to_string(index=False))



          Length of Formation Period  Average No. of Stocks  Winner Portfolio  Loser Portfolio  Difference in CAR  T-Statistic (Month 1)  T-Statistic (Month 12)  T-Statistic (Month 13)  T-Statistic (Month 18)  T-Statistic (Month 24)  T-Statistic (Month 25)  T-Statistic (Month 36)  T-Statistic (Month 60)
                10 five-year periods                     50            -0.147           -0.114              0.033                    NaN                  -4.138                  -4.194                  -4.508                  -4.396                  -4.174                  -1.761                  -3.483
               16 three-year periods                     35            -0.150           -0.123              0.027                    NaN                  -1.163                  -1.108                  -0.813                  -0.217                  -0.036                   1.812                   2.623
 24 two-year odd periods (35 stocks)                     35            -0.099        