In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# ========== Step 1: every column to load from the CSV ==========
cols = [
    'DCN','TRANDATE','SEQNUM','PERSONID','OWNER','ROLECODE1','ROLECODE2','ROLECODE3','ROLECODE4',
    'ADDRESS1','ADDRESS2','CITY','STATE','ZIPCODE','COUNTRY','PHONE','CNAME','CNUM',
    'CUSIP6','CUSIP2','TICKER','SECID','SECTOR','INDUSTRY','FORMTYPE','ACQDISP','OPTIONSELL',
    'OWNERSHIP','SHARESHELD','SHARESHELD_ADJ','SHARES','SHARES_ADJ','TPRICE','TPRICE_ADJ',
    'TRANCODE','SECTITLE','AMEND','CLEANSE','FDATE','CDATE','MAINTDATE','SECDATE','SIGDATE',
    'TRANDATE_AR','ACQDISP_AR','TPRICE_AR','TRANCODE_AR','gap_days','SEC_Business_Day',
    'SEC_Business_Day_Lag2','delay','id'
]

# ========== Step 2: per-column dtypes (to save memory) ==========
# Columns read as pandas 'category'.
category_cols = [
    'ROLECODE1','ROLECODE2','ROLECODE3','ROLECODE4','ACQDISP','OPTIONSELL','OWNERSHIP',
    'TRANCODE','SECTITLE','FORMTYPE','TICKER','STATE','COUNTRY','TRANCODE_AR','ACQDISP_AR'
]
# Columns handed to read_csv(parse_dates=...).
date_cols = [
    'TRANDATE','FDATE','CDATE','MAINTDATE','SECDATE','SIGDATE',
    'TRANDATE_AR','SEC_Business_Day','SEC_Business_Day_Lag2'
]
# Columns read as float32.
# NOTE(review): 'SEC_Business_Day' and 'SEC_Business_Day_Lag2' are listed here AND in
# date_cols. They are treated as dates below; confirm that is what the CSV holds and
# remove them from whichever list is wrong.
numeric_cols = [
    'SEQNUM','SHARESHELD','SHARESHELD_ADJ','SHARES','SHARES_ADJ',
    'TPRICE','TPRICE_ADJ','gap_days','delay','SEC_Business_Day','SEC_Business_Day_Lag2'
]

# dtype overrides for read_csv. Date columns are deliberately left out: a column cannot
# be both forced to float32 and parsed as a datetime, so any name that also appears in
# date_cols is skipped here and left to parse_dates.
dtype_dict = {col: 'category' for col in category_cols}
dtype_dict.update({col: 'float32' for col in numeric_cols if col not in date_cols})
dtype_dict.update({'id': 'int8'})  # label





In [None]:
# ========== Step 3: load the CSV ==========
print("📥 正在读取数据，请稍候...")

# All reader options in one place so they can be inspected before the load runs.
read_options = dict(
    usecols=cols,            # load only the columns named in `cols`
    dtype=dtype_dict,        # per-column dtype overrides
    parse_dates=date_cols,   # ask pandas to parse these columns as datetimes
    encoding='ISO-8859-1',   # or whichever encoding turned out to work for this file
    low_memory=False,
)
df = pd.read_csv("identify_delay.csv", **read_options)

print(df)



In [None]:
# ========== Step 4: integer-encode the categorical columns ==========
print("🔁 编码类别特征...")
for col in category_cols:
    # Fill missing values BEFORE casting to str. Casting first turns NaN into the
    # literal string 'nan', which leaves fillna() with nothing to fill, so the
    # intended 'missing' bucket would never be created.
    # Going through object dtype lets 'missing' be inserted even when the column is a
    # pandas Categorical that does not have that category.
    filled = df[col].astype(object).fillna('missing').astype(str)
    # A fresh encoder per column: codes are assigned in sorted order of the strings.
    df[col] = LabelEncoder().fit_transform(filled)



In [None]:
# ========== Step 5: derive calendar features from the date columns ==========
print("🕰️ 提取时间特征...")
date_parts = ("year", "month", "day", "weekday")
for col in date_cols:
    stamps = df[col]
    # One new column per calendar part, named "<column>_<part>".
    for part in date_parts:
        df[f"{col}_{part}"] = getattr(stamps.dt, part)
    # The raw datetime column is removed once its parts have been extracted.
    del df[col]



In [None]:
# ========== Step 6: choose features and label ==========
# Columns the author treats as unstructured text or identifiers; removed before modelling.
non_feature_cols = [
    'DCN','PERSONID','OWNER','ADDRESS1','ADDRESS2','CITY','ZIPCODE','PHONE','CNAME','CNUM',
    'CUSIP6','CUSIP2','SECID','SECTOR','INDUSTRY','AMEND','CLEANSE'
]
df = df.drop(columns=non_feature_cols)

# NOTE(review): 'delay' and 'gap_days' remain among the features. If the label `id`
# is derived from either of them this is target leakage — confirm against how the
# CSV was built.
X = df.drop(columns=['id'])
y = df['id']

# ========== Step 7: train/test split ==========
# 25% held out, stratified on the label, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# ---- Sanity checks on the split ----

# 1. Does either split contain only one class?
print("y_train counts:", np.bincount(y_train))
print("y_test counts:", np.bincount(y_test))

# 2. Do the training and test sets overlap?
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# np.intersect1d would flatten both matrices and report the scalar values they have
# in common, which says nothing about shared rows. Compare rows instead.
# (a) the same original row landing in both splits (index labels in common)
shared_index = X_train.index.intersection(X_test.index)
print("Shared index labels:", len(shared_index))

# (b) test rows whose feature values are identical to some training row. Each row is
# reduced to one hash (index excluded) so the comparison stays cheap on a wide frame.
train_row_hashes = pd.util.hash_pandas_object(X_train, index=False)
test_row_hashes = pd.util.hash_pandas_object(X_test, index=False)
duplicated_in_train = int(test_row_hashes.isin(train_row_hashes).sum())
print("Test rows identical to a train row:", duplicated_in_train)






In [None]:
# ========== Step 8: train XGBoost ==========
print("⚙️ 训练 XGBoost 模型...")

# Earlier trials (depth 12 / lr 0.01 / 100 trees with logloss, then depth 9 / lr 0.005 /
# 500 trees with auc) were superseded by the configuration below.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',            # better suited to binary classification
    'use_label_encoder': False,
    'tree_method': 'hist',           # or 'gpu_hist' when a GPU is available

    'max_depth': 10,                 # can be fairly deep
    'min_child_weight': 10,          # guards against overfitting
    'gamma': 0.8,                    # pruning threshold
    'subsample': 0.8,                # row sampling
    'colsample_bytree': 0.8,         # feature sampling

    'learning_rate': 0.1,            # moderate learning rate
    # NOTE(review): 3000 rounds at learning_rate 0.1, and fit() below is given no
    # eval_set / early stopping — check training time and overfitting.
    'n_estimators': 3000,

    'reg_alpha': 0.1,                # L1 regularisation
    'reg_lambda': 1.0,               # L2 regularisation

    'random_state': 42,
}
model = xgb.XGBClassifier(**xgb_params)

model.fit(X_train, y_train)

# ========== Step 9: evaluate on the held-out split ==========
y_pred = model.predict(X_test)
# Second column of predict_proba, i.e. the probability assigned to classes_[1].
y_prob = model.predict_proba(X_test)[:, 1]

print("📊 分类报告:")
report_text = classification_report(y_test, y_pred)
print(report_text)

auc_value = roc_auc_score(y_test, y_prob)
print("🎯 AUC 分数:", auc_value)

# ========== Step 10: feature importance ==========
# plot_importance returns the Axes it drew on; title and layout are applied through it.
importance_ax = xgb.plot_importance(model, max_num_features=20, height=0.5)
importance_ax.set_title("Feature Importance")
importance_ax.figure.tight_layout()
plt.show()