In [12]:
# Name of the column that the portfolio-sort cell ranks stocks on each month.
factor1 = 'free_cash_flow'

In [None]:
import numpy as np
import pandas as pd

def process_stock_data(stock_data, columns_to_process):
    """Spread quarterly indicator values over the months of each stock's quarter.

    For every requested column, each row receives the first non-null value
    observed for the same stock within the same calendar quarter. Quarters
    that have no observation at all are then forward-filled from earlier rows
    of the *same stock only*, so values never leak from one stock into the
    next one in the frame.

    Parameters
    ----------
    stock_data : pandas.DataFrame
        Panel with at least ``stock_code`` and ``trade_month`` columns.
        The frame is modified in place: ``trade_month`` is converted to
        datetime, a ``quarter`` (quarterly Period) column is added and the
        processed columns are overwritten. Forward filling follows row
        order, so rows should be in chronological order within each stock.
    columns_to_process : iterable of str
        Names of the indicator columns to process. Names that are not
        columns of ``stock_data`` are reported with a printed warning and
        skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``stock_data`` object, after the in-place changes above.
    """
    # Normalise the month column and derive the calendar quarter of each row.
    stock_data["trade_month"] = pd.to_datetime(stock_data["trade_month"])
    stock_data["quarter"] = stock_data["trade_month"].dt.to_period("Q")

    for column in columns_to_process:
        if column not in stock_data.columns:
            print(f"Warning: Column '{column}' not found in data!")
            continue

        # First non-null value per (stock, quarter), broadcast back to every
        # row of that group; groups without any observation stay NaN.
        quarterly_value = stock_data.groupby(["stock_code", "quarter"])[column].transform("first")

        # Fill empty quarters from the stock's own history. Grouping by stock
        # keeps the fill from crossing the boundary between two stocks.
        stock_data[column] = quarterly_value.groupby(stock_data["stock_code"]).ffill()

    return stock_data

# Example usage
# columns_to_process = ["ZSCORE", "factor2", "factor3"]  # Add other columns you want to process
# processed_data = process_stock_data(stock_data, columns_to_process)

# NOTE(review): machine-specific absolute path; the notebook only runs where this
# file exists. Consider moving it to a configurable data directory.
path = r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Fin-Econ\Project\FF3\处理后数据2.0.dta"
panel_data = pd.read_stata(path)
panel_data.info()

# Columns kept from the raw file, in the order they are written to the CSV.
labels = ["证券代码","交易月份","excess_return",'市场风险溢价因子流通市值加权','市值因子流通市值加权','账面市值比因子流通市值加权',"每股企业自由现金流量","EM"]

# Raw column name -> short name used by the rest of the notebook.
column_names = {
    '证券代码': 'stock_code',
    '交易月份': 'trade_month',
    '每股企业自由现金流量': 'free_cash_flow',
    '市场风险溢价因子流通市值加权': 'mkt',
    '市值因子流通市值加权': 'smb',
    '账面市值比因子流通市值加权': 'hml',
    'EM': 'em',
}

# Select and rename in one chained step. rename() returns a fresh frame, so the
# in-place column assignments made by process_stock_data no longer operate on a
# slice of the raw data (which raises pandas' chained-assignment warning).
panel_data = panel_data[labels].rename(columns=column_names)

# Indicator columns that process_stock_data converts to per-quarter values.
indicators = ['free_cash_flow', 'em']
panel_data = process_stock_data(panel_data, indicators)

# Round-trip through CSV; the following cells work on the re-read `data` frame.
panel_data.to_csv('panel_data.csv', index=False)
data = pd.read_csv('panel_data.csv')

# info() prints its report itself and returns None, so it is not wrapped in print().
data.info()
print(data.head())

# Rows without a return cannot enter the portfolio averages.
data = data.dropna(subset=['excess_return'])
data.isnull().sum()


In [None]:
# Monthly single-sort on `factor1`.
#
# For every month the stocks are ranked on `factor1` and split into
# `num_groups` equally sized portfolios (1 = lowest factor values). For each
# portfolio the cross-sectional mean of the excess return and of the three
# factor columns is recorded. One extra row per month, labelled
# `num_groups + 1`, holds the high-minus-low spread return.
print(data.shape)

num_groups = 10  # number of portfolios formed each month
factor_cols = ['mkt', 'smb', 'hml']
value_cols = ['excess_return'] + factor_cols

# One dict per (month, portfolio); turned into a DataFrame after the loop.
grouped_data = []

for trading_month, monthly_data in data.groupby('trade_month'):
    # Rows without a factor value cannot be ranked. sort_values() would place
    # them last, i.e. silently put them into the top portfolio, so drop them.
    monthly_data = (
        monthly_data
        .dropna(subset=[factor1])
        .sort_values(by=factor1)
        .reset_index(drop=True)
    )
    n_obs = len(monthly_data)
    if n_obs == 0:
        # Nothing to rank this month.
        continue

    # Rank position -> portfolio number in 1..num_groups. Integer arithmetic
    # avoids floating-point rounding at the bucket boundaries.
    monthly_data['factor1_quantile'] = np.arange(n_obs) * num_groups // n_obs + 1

    # Mean of every value column per portfolio. reindex() keeps a (NaN) row for
    # portfolios that received no stock when n_obs < num_groups.
    portfolio_means = (
        monthly_data
        .groupby('factor1_quantile')[value_cols]
        .mean()
        .reindex(range(1, num_groups + 1))
    )

    for q1, means in portfolio_means.iterrows():
        grouped_data.append(
            {
                'trading_month': trading_month,
                'factor1_quantile': int(q1),
                'avg_excess_return': means['excess_return'],
                'mkt': means['mkt'],
                'smb': means['smb'],
                'hml': means['hml']
            }
        )

    # High-minus-low spread portfolio. Only the return is differenced: the
    # spread return is later regressed on the factor values themselves.
    # NOTE(review): mkt/smb/hml look like market-wide factor series (identical
    # for all stocks within a month) - confirm. If so, differencing them
    # between two portfolios yields regressors that are ~0 in every month.
    spread_return = (
        portfolio_means.loc[num_groups, 'excess_return']
        - portfolio_means.loc[1, 'excess_return']
    )
    month_factors = monthly_data[factor_cols].mean()
    grouped_data.append({
        'trading_month': trading_month,
        'factor1_quantile': num_groups + 1,  # label of the high-minus-low row
        'avg_excess_return': spread_return,
        'mkt': month_factors['mkt'],
        'smb': month_factors['smb'],
        'hml': month_factors['hml']
    })

grouped_data_df = pd.DataFrame(grouped_data)
grouped_data_df.to_csv('grouped_data.csv', index=False)

In [None]:
# Reload the portfolio-level panel from 'grouped_data.csv' and preview its first rows.
grouped_data_df = pd.read_csv('grouped_data.csv')
grouped_data_df.head()

In [None]:
import pandas as pd
import statsmodels.api as sm


# Perform regression for each factor1_quantile
def regress_ff3(group):
    """Fit an OLS time-series regression of one portfolio's return on mkt, smb and hml.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single portfolio. Must contain the columns
        ``factor1_quantile``, ``avg_excess_return``, ``mkt``, ``smb`` and
        ``hml`` and at least one row.

    Returns
    -------
    dict
        The portfolio label (``factor1_quantile`` of the first row), the
        intercept (``alpha``) with its t-statistic and p-value, the three
        slope coefficients with their p-values, and the R-squared of the fit.
    """
    y = group['avg_excess_return']

    # has_constant='add' always appends the intercept column. The default
    # ('skip') omits it when a regressor happens to be constant in the sample
    # (e.g. a single-row group), and the 'const' lookups below would then
    # raise KeyError.
    X = sm.add_constant(group[['mkt', 'smb', 'hml']], has_constant='add')

    # Rows with a missing return or factor value are dropped before fitting
    # instead of propagating NaN into the estimates.
    model = sm.OLS(y, X, missing='drop').fit()

    return {
        "factor1_quantile": group['factor1_quantile'].iloc[0],
        't-stat (alpha)': model.tvalues['const'],
        'p-value (alpha)': model.pvalues['const'],
        "alpha": model.params['const'],
        'p-value (mkt)': model.pvalues['mkt'],
        "mkt_coef": model.params['mkt'],
        'p-value (smb)': model.pvalues['smb'],
        "smb_coef": model.params['smb'],
        'p-value (hml)' : model.pvalues['hml'],
        "hml_coef": model.params['hml'],
        "r_squared": model.rsquared
    }

# Run the regression once per portfolio and collect one result row each.
# The groups are iterated explicitly instead of using groupby.apply():
# regress_ff3 reads the grouping column from the group it receives, and
# passing the grouping column to apply() is deprecated in recent pandas.
results = pd.DataFrame(
    [regress_ff3(group) for _, group in grouped_data_df.groupby('factor1_quantile')]
).set_index('factor1_quantile', drop=False)

# The portfolio label is already a regular column, so the index is not written;
# otherwise the CSV would contain two columns named 'factor1_quantile'.
results.to_csv("FCF_AT_results.csv", index=False)