根据 AMT_CREDIT、AMT_ANNUITY、TERM 自动解出真实利率，保存到之前建立的 base 里面

In [None]:
import os
import pandas as pd
import numpy as np
from numpy_financial import irr  # 如果没有这个库，可以 pip install numpy-financial

# Load the application tables, the lgbm_CNT feature tables and the base id tables.
train_app, test_app = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/application_train.csv",
        "../data/application_test.csv",
    )
)

train_lgbm, test_lgbm = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../generated_feature/lgmb_CNT/lgbm_CNT_train.csv",
        "../generated_feature/lgmb_CNT/lgbm_CNT_test.csv",
    )
)

base_train, base_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../view data/base_train_ids.csv",
        "../view data/base_test_ids.csv",
    )
)


# ========== Build core features ==========
def compute_features(df):
    """Derive the loan term and a per-period interest rate for each row.

    ``TERM`` is ``AMT_CREDIT / AMT_ANNUITY`` rounded to an integer; it is 0
    when the ratio is missing or not finite (e.g. a zero annuity).
    ``INTEREST_RATE`` is the internal rate of return of the cash-flow series
    "receive ``AMT_CREDIT`` once, then pay ``AMT_ANNUITY`` for ``TERM``
    periods"; it is NaN when the inputs are missing or non-positive, or when
    the IRR computation fails numerically.

    Both columns are also written onto ``df`` itself.

    NOTE(review): because TERM is derived from credit / annuity, the payments
    sum to roughly the principal, so the resulting rate sits close to zero
    and can be slightly negative. Confirm this is the intended definition of
    the "real" rate.

    Args:
        df: DataFrame with ``SK_ID_CURR``, ``AMT_CREDIT`` and
            ``AMT_ANNUITY`` columns. Modified in place.

    Returns:
        A DataFrame with the columns ``SK_ID_CURR``, ``TERM`` and
        ``INTEREST_RATE``.
    """
    ratio = df["AMT_CREDIT"] / df["AMT_ANNUITY"]
    # A zero annuity yields +/-inf, which fillna() leaves untouched and which
    # cannot be cast to int; treat it like a missing ratio instead.
    ratio = ratio.replace([np.inf, -np.inf], np.nan)
    df["TERM"] = ratio.round().fillna(0).astype(int)

    def calc_rate(credit, annuity, term):
        """Return the IRR for one loan, or NaN when it cannot be computed."""
        if term <= 0 or pd.isna(credit) or pd.isna(annuity) or annuity <= 0:
            return np.nan
        # Cash flows: the borrowed credit (negative), then `term` annuity
        # payments (positive).
        cashflows = [-credit] + [annuity] * int(term)
        try:
            return irr(cashflows)
        except (ValueError, ArithmeticError, np.linalg.LinAlgError):
            # Numerical failure inside the root finder: no usable rate.
            return np.nan

    # Iterate over just the three needed columns instead of building a full
    # row Series for every record with DataFrame.apply(axis=1).
    rates = [
        calc_rate(credit, annuity, term)
        for credit, annuity, term in zip(
            df["AMT_CREDIT"], df["AMT_ANNUITY"], df["TERM"]
        )
    ]
    df["INTEREST_RATE"] = np.asarray(rates, dtype=float)
    return df[["SK_ID_CURR", "TERM", "INTEREST_RATE"]]

# ========== Compute features and attach lgbm_CNT ==========
train_feat = pd.merge(
    compute_features(train_app), train_lgbm, how="left", on="SK_ID_CURR"
)
test_feat = pd.merge(
    compute_features(test_app), test_lgbm, how="left", on="SK_ID_CURR"
)

# ========== Add the features to the base tables ==========
base_train = pd.merge(base_train, train_feat, how="left", on="SK_ID_CURR")
base_test = pd.merge(base_test, test_feat, how="left", on="SK_ID_CURR")




✅ base_train_ids.csv shape: (307511, 7)
✅ base_test_ids.csv  shape: (48744, 7)
   SK_ID_CURR  TERM  INTEREST_RATE  rate_goods  lgbm_CNT  rate_credit  \
0      100002    16  -3.323151e-03    0.062413        36     0.050405   
1      100003    36  -3.499234e-04    0.023980        60     0.018303   
2      100004    20  -2.220446e-16         NaN         0          NaN   
3      100006    11   7.303904e-03    0.029153        12     0.020657   
4      100007    23  -1.649606e-03    0.025260        36     0.025260   

   certainty  
0   0.753658  
1   0.998557  
2   0.999893  
3   0.997347  
4   0.850818  


In [5]:
# Save the enriched base tables.
for frame, csv_path in (
    (base_train, "../view data/CNT_train_ids.csv"),
    (base_test, "../view data/CNT_test_ids.csv"),
):
    frame.to_csv(csv_path, index=False)

# NOTE(review): the labels below name base_*_ids.csv, but the files written
# above are CNT_*_ids.csv -- confirm which name is intended.
print("✅ base_train_ids.csv shape:", base_train.shape)
print("✅ base_test_ids.csv  shape:", base_test.shape)

# Preview the first rows of the training table.
print(base_train.head())

✅ base_train_ids.csv shape: (307511, 7)
✅ base_test_ids.csv  shape: (48744, 7)
   SK_ID_CURR  TERM  INTEREST_RATE  rate_goods  lgbm_CNT  rate_credit  \
0      100002    16  -3.323151e-03    0.062413        36     0.050405   
1      100003    36  -3.499234e-04    0.023980        60     0.018303   
2      100004    20  -2.220446e-16         NaN         0          NaN   
3      100006    11   7.303904e-03    0.029153        12     0.020657   
4      100007    23  -1.649606e-03    0.025260        36     0.025260   

   certainty  
0   0.753658  
1   0.998557  
2   0.999893  
3   0.997347  
4   0.850818  


In [6]:
# Report the final dimensions of the train and test tables.
for frame in (base_train, base_test):
    print(frame.shape)

(307511, 7)
(48744, 7)
