In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import xgboost as xgb
import matplotlib.pyplot as plt

# ========== Step 1: Load the CSV file ==========
# Columns requested from the CSV (handed to read_csv as `usecols`).
cols = [
    'DCN', 'TRANDATE', 'SEQNUM', 'PERSONID', 'OWNER',
    'ROLECODE1', 'ROLECODE2', 'ROLECODE3', 'ROLECODE4',
    'ADDRESS1', 'ADDRESS2', 'CITY', 'STATE', 'ZIPCODE', 'COUNTRY',
    'PHONE', 'CNAME', 'CNUM',
    'CUSIP6', 'CUSIP2', 'TICKER', 'SECID', 'SECTOR', 'INDUSTRY',
    'FORMTYPE', 'ACQDISP', 'OPTIONSELL',
    'OWNERSHIP', 'SHARESHELD', 'SHARESHELD_ADJ', 'SHARES', 'SHARES_ADJ',
    'TPRICE', 'TPRICE_ADJ',
    'TRANCODE', 'SECTITLE', 'AMEND', 'CLEANSE',
    'FDATE', 'CDATE', 'MAINTDATE', 'SECDATE', 'SIGDATE',
    'TRANDATE_AR', 'ACQDISP_AR', 'TPRICE_AR', 'TRANCODE_AR',
    'gap_days', 'SEC_Business_Day',
    'SEC_Business_Day_Lag2', 'delay', 'id',
]

# Column groups that determine how each field is typed / parsed on load.
# Read as pandas 'category' dtype.
category_cols = [
    'ROLECODE1', 'ROLECODE2', 'ROLECODE3', 'ROLECODE4',
    'ACQDISP', 'OPTIONSELL', 'OWNERSHIP',
    'TRANCODE', 'SECTITLE', 'FORMTYPE', 'TICKER',
    'STATE', 'COUNTRY', 'TRANCODE_AR', 'ACQDISP_AR',
]
# Parsed as datetimes by read_csv.
date_cols = [
    'TRANDATE', 'FDATE', 'CDATE', 'MAINTDATE', 'SECDATE', 'SIGDATE',
    'TRANDATE_AR', 'SEC_Business_Day', 'SEC_Business_Day_Lag2',
]
# Read as float32.
numeric_cols = [
    'SEQNUM',
    'SHARESHELD', 'SHARESHELD_ADJ',
    'SHARES', 'SHARES_ADJ',
    'TPRICE', 'TPRICE_ADJ',
    'gap_days', 'delay',
]

# Per-column dtype map: categoricals, then float32 numerics, then the label.
dtype_dict = {
    **dict.fromkeys(category_cols, 'category'),
    **dict.fromkeys(numeric_cols, 'float32'),
    'id': 'int8',  # label column
}

df = pd.read_csv(
    "identify_delay.csv",
    encoding='ISO-8859-1',
    usecols=cols,
    dtype=dtype_dict,
    parse_dates=date_cols,
    low_memory=False,
)

# ========== Step 2: Encode categorical features ==========
# Missing values are replaced BEFORE the cast to str. Casting first can turn
# NaN into the literal string 'nan' (on pandas versions that stringify NaN),
# which leaves a subsequent fillna('missing') with nothing to fill.
for col in category_cols:
    raw_values = df[col].astype(object)
    df[col] = raw_values.where(raw_values.notna(), 'missing').astype(str)
    # Each column gets its own encoder; string labels become integer codes.
    df[col] = LabelEncoder().fit_transform(df[col])

# ========== Step 3: Expand date columns into numeric parts ==========
# Every date column is replaced by year / month / day / weekday columns.
for col in date_cols:
    df[f"{col}_year"] = df[col].dt.year
    df[f"{col}_month"] = df[col].dt.month
    df[f"{col}_day"] = df[col].dt.day
    df[f"{col}_weekday"] = df[col].dt.weekday
    df.drop(columns=col, inplace=True)

# ========== Step 4: Drop columns that are not used as features ==========
df = df.drop(columns=[
    'DCN', 'PERSONID', 'OWNER', 'ADDRESS1', 'ADDRESS2', 'CITY', 'ZIPCODE', 'PHONE', 'CNAME', 'CNUM',
    'CUSIP6', 'CUSIP2', 'SECID', 'SECTOR', 'INDUSTRY', 'AMEND', 'CLEANSE'
])

# ========== Step 5: Fill remaining missing values ==========
# Any value still missing (including date parts derived from unparsed/missing
# dates) becomes 0.
df.fillna(0, inplace=True)

# ========== Step 6: Standardize numeric features ==========
# The scaler is fitted on the training rows only, so test-set statistics do
# not leak into the features. train_test_split picks rows purely from the
# sample count and random_state, so calling it on row positions with the SAME
# test_size / random_state as the X/y split further down yields exactly the
# rows that end up in the training set. Keep these arguments in sync with
# that split.
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(df[numeric_cols].iloc[train_rows])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# ========== Step 7: Build the feature matrix and label vector ==========
# 'id' is the label column; every other remaining column is a feature.
X = df.drop(columns=['id']).values.astype(np.float32)
y = df['id'].values.astype(np.float32)

# ========== Step 8: Build the PyTorch dataset and data loader ==========
class TabularDataset(Dataset):
    """In-memory dataset that pairs each feature row with its label.

    Args:
        X: Feature data accepted by ``torch.tensor``; indexed along its
            first axis.
        y: Label data accepted by ``torch.tensor``; stored as float32.
    """

    def __init__(self, X, y):
        labels = torch.tensor(y)
        self.X = torch.tensor(X)
        # Labels are kept as float32 regardless of the input dtype.
        self.y = labels.to(torch.float32)

    def __len__(self):
        """Return the number of samples (length of the label tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap the training split in a dataset and a shuffling mini-batch loader.
train_ds = TabularDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_ds, batch_size=256, shuffle=True)

# ========== Step 9: Define the attention-gated neural network ==========
class AttentionNet(nn.Module):
    """Single-hidden-layer network with an element-wise sigmoid gate.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden (and gate) layer.
    """

    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.attn = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return ``(gated_hidden, probability)`` for a batch ``x``.

        ``gated_hidden`` is the ReLU hidden activation multiplied element-wise
        by a sigmoid gate computed from that same activation; ``probability``
        is the sigmoid of the output layer with dimension 1 squeezed away.
        """
        hidden = torch.relu(self.fc1(x))
        gate = self.attn(hidden).sigmoid()
        gated = hidden * gate
        logits = self.fc2(gated).squeeze(1)
        return gated, logits.sigmoid()

# ========== Step 10: Train the attention model and extract features ==========
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The network takes one input per feature column of X.
model = AttentionNet(input_dim=X.shape[1])
model.to(device)

# Binary cross-entropy on the probabilities the model returns; Adam optimizer.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# NOTE: The triple-quoted string below is a bare expression statement, so the
# training loop written inside it never runs. It is a loss-only variant of the
# training loop that follows it in this file (no per-epoch metrics).
"""

# 训练模型
model.train()
for epoch in range(10):  # 训练10个epoch
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 0):
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        _, outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if i % 100 == 99:    # 每100个小批量打印一次损失值
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0
"""
from sklearn.metrics import classification_report, roc_auc_score

# Train for 10 epochs over train_loader. During each epoch the mean loss of
# the last 100 mini-batches is printed; at the end of each epoch a
# classification report and ROC AUC are printed.
# NOTE: these metrics are computed from predictions on the TRAINING batches,
# collected while the weights are still being updated; they are not a
# held-out evaluation.
for epoch in range(10):
    model.train()
    running_loss = 0.0
    # Collect one array per batch and concatenate once per epoch, rather than
    # growing Python lists one NumPy scalar at a time.
    label_batches = []
    prob_batches = []

    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        # The model returns (gated hidden features, probabilities); only the
        # probabilities are needed for the loss.
        _, outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # detach() keeps the stored probabilities out of the autograd graph.
        label_batches.append(labels.cpu().numpy())
        prob_batches.append(outputs.detach().cpu().numpy())

        if i % 100 == 99:    # report the mean loss every 100 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

    # Labels are cast to int so both arrays given to classification_report
    # share an integer dtype with the thresholded predictions.
    all_labels = np.concatenate(label_batches).astype(int)
    all_probs = np.concatenate(prob_batches)
    all_predictions = (all_probs > 0.5).astype(int)

    # Per-epoch metrics (on the training predictions gathered above).
    print(f"Epoch {epoch + 1} evaluation:")
    print(classification_report(all_labels, all_predictions, digits=4))
    auc_score = roc_auc_score(all_labels, all_probs)
    print(f"AUC: {auc_score:.4f}")



 


[1,   100] loss: 7.653
[1,   200] loss: 5.041
[1,   300] loss: 5.277
[1,   400] loss: 4.582
[1,   500] loss: 3.893
[1,   600] loss: 2.600
[1,   700] loss: 0.986
[1,   800] loss: 0.360
[1,   900] loss: 0.297
[1,  1000] loss: 0.240
[1,  1100] loss: 0.231
[1,  1200] loss: 0.265
[1,  1300] loss: 0.243
[1,  1400] loss: 0.235
[1,  1500] loss: 0.229
[1,  1600] loss: 0.218
[1,  1700] loss: 0.234
[1,  1800] loss: 0.222
[1,  1900] loss: 0.223
[1,  2000] loss: 0.227
[1,  2100] loss: 0.229
[1,  2200] loss: 0.225
[1,  2300] loss: 0.225
[1,  2400] loss: 0.207
[1,  2500] loss: 0.208
[1,  2600] loss: 0.208
[1,  2700] loss: 0.197
[1,  2800] loss: 0.209
[1,  2900] loss: 0.197
[1,  3000] loss: 0.213
[1,  3100] loss: 0.192
[1,  3200] loss: 0.188
[1,  3300] loss: 0.196
[1,  3400] loss: 0.190
[1,  3500] loss: 0.184
[1,  3600] loss: 0.219
[1,  3700] loss: 0.201
[1,  3800] loss: 0.209
[1,  3900] loss: 0.208
[1,  4000] loss: 0.212
[1,  4100] loss: 0.207
[1,  4200] loss: 0.191
[1,  4300] loss: 0.195
[1,  4400] 