In [28]:
import os
import shutil
import pandas as pd
from scipy.stats import norm

# Input/output locations. The data root can be overridden through the
# DATA_DIR environment variable; otherwise the cluster default is used.
data_dir = os.getenv('DATA_DIR', '/gpfs/home/zh283/StockPredictionDNN/Data')
raw_csv, factor_xlsx, parquet_dir = (
    os.path.join(data_dir, leaf)
    for leaf in ('jkpfactor.csv.zip', 'factors_list.xlsx', 'parquet')
)

# Characteristic names: every non-missing entry of the 'abr_jkp' column
# in the factor list workbook, in file order.
char_df = pd.read_excel(factor_xlsx)
char_names = char_df['abr_jkp'].dropna().tolist()

# Column groups used throughout the pipeline.
# base_cols   - columns taken as-is from the raw file
# weight_cols - per-month weights computed below
# label_cols  - transformed versions of the lead return computed below
base_cols = [
    'me',
    'size_grp',
    'permno',
    'eom',
    'crsp_exchcd',
    'ret_exc_lead1m',
    'ret_exc',
    'ret',
]
weight_cols = [
    'w_ew',
    'w_vw',
]
label_cols = [
    'ret_pct',
    'ret_z',
    'ret_invn',
]

# 1) Read and filter raw data
# Only parse the columns this pipeline actually touches: everything else in
# the raw file would be thrown away by the column selection below anyway.
# 'crsp_shrcd' is needed for the share-code filter even though it is not
# kept afterwards. dict.fromkeys de-duplicates while preserving order.
# NOTE: a name missing from the file now fails inside read_csv (ValueError)
# instead of at the later column selection (KeyError).
needed_cols = list(dict.fromkeys(base_cols + char_names + ['crsp_shrcd']))
df = (
    pd.read_csv(raw_csv, usecols=needed_cols, parse_dates=['eom'])
    # rows without any of the three return fields are unusable
    .dropna(subset=['ret', 'ret_exc', 'ret_exc_lead1m'])
    # keep share codes 10/11 on exchange codes 1/2/3
    .query('crsp_shrcd in [10,11] and crsp_exchcd in [1,2,3]')
)
# Cast types and select columns
# eom is stored as an integer of the form YYYYMMDD from here on
df['eom'] = (df['eom'].dt.strftime('%Y%m%d').astype(int))
df[['permno', 'crsp_exchcd']] = (df[['permno', 'crsp_exchcd']].astype(int))
df = df[base_cols + char_names]
# deterministic row order and a clean 0..n-1 index for the steps below
df = df.sort_values(['eom', 'permno']).reset_index(drop=True)

# 2) Impute raw characteristics by monthly median
grp_raw = df.groupby('eom')[char_names]
# per-row median of each characteristic within the row's month
monthly_median = grp_raw.transform('median')
df[char_names] = df[char_names].fillna(monthly_median)
# Anything still missing had no median to fall back on; drop those rows
# and renumber the index.
df = df.dropna(subset=char_names)
df = df.reset_index(drop=True)

# 3) Compute cross-sectional weights and return stats
grp = df.groupby('eom')

# Weights are scaled so that each month's weights add up to 10000.
n_per_month = grp['permno'].transform('count')
df['w_ew'] = 10000.0 / n_per_month
me_per_month = grp['me'].transform('sum')
df['w_vw'] = 10000.0 * df['me'] / me_per_month

# Per-month moments of the one-month-ahead excess return
ret_series = grp['ret_exc_lead1m']
ret_mean = ret_series.transform('mean')
ret_std = ret_series.transform('std')

# Labels derived from the lead return, all computed within each month:
#   ret_pct  - average rank divided by (count + 1), so strictly inside (0, 1)
#   ret_z    - z-score
#   ret_invn - standard-normal quantile of ret_pct
ret_rank = ret_series.rank(method='average')
ret_count = ret_series.transform('count')
df['ret_pct'] = ret_rank / (ret_count + 1)
df['ret_z'] = (df['ret_exc_lead1m'] - ret_mean) / ret_std
df['ret_invn'] = norm.ppf(df['ret_pct'])

# 4) Compute characteristic transforms without suffix
# Each result keeps the original characteristic names as column names, so
# the variants are distinguished by which frame they live in.
grp_char = grp[char_names]
# within-month average rank divided by (count + 1)
df_pct = grp_char.rank(method='average').div(grp_char.transform('count') + 1)
# within-month z-score
df_z = (df[char_names] - grp_char.transform('mean')).div(
    grp_char.transform('std'))
# norm.ppf is vectorised, so evaluate it once on the whole 2-D array rather
# than once per column via DataFrame.apply. Index and columns are passed
# explicitly so the result is guaranteed to line up with df_pct instead of
# depending on how apply reassembles array-valued results.
df_invn = pd.DataFrame(
    norm.ppf(df_pct.to_numpy()),
    index=df_pct.index,
    columns=df_pct.columns,
)

# 5) Prepare static slice
# eom is an integer YYYYMMDD, so integer division by 10000 leaves the year.
df['year'] = df['eom'].floordiv(10000)
# Columns shared by every output variant: identifiers, weights, labels, year.
meta_cols = [*base_cols, *weight_cols, *label_cols, 'year']
df_static = df.loc[:, meta_cols]

# 6) Build full variant DataFrames (capture them in a dict)
# Every variant is the shared static columns followed by one version of the
# characteristic columns.
char_frames = {
    'raw': df[char_names],
    'pct': df_pct,
    'z': df_z,
    'invn': df_invn,
}
variant_dfs = {
    label: pd.concat([df_static, chars], axis=1)
    for label, chars in char_frames.items()
}

# 7) Write each variant with per-year partitioning
# Writer options are identical for every variant; partitioning on 'year'
# creates the year=XXXX folders automatically.
parquet_opts = {
    'partition_cols': ['year'],
    'engine': 'pyarrow',
    'index': False,
}
for variant, vdf in variant_dfs.items():
    out_dir = os.path.join(parquet_dir, variant)
    # Remove any previous output first so files from an earlier run are
    # not left behind next to the new ones.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    vdf.to_parquet(out_dir, **parquet_opts)