In [29]:
import numpy as np
import pandas as pd
import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA, FactorAnalysis
# from umap import UMAP

SEED = 42
def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

seed_everything()


In [30]:
train_features = pd.read_csv("train_features.csv")
train_targets_scored = pd.read_csv("train_targets_scored.csv")
train_targets_nonscored = pd.read_csv("train_targets_nonscored.csv")

test_features = pd.read_csv("test_features.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [31]:
test_features

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.5500,-0.1644,...,0.0981,0.7978,-0.1430,-0.2067,-0.2303,-0.1193,0.0210,-0.0502,0.1510,-0.7750
1,id_001897cda,trt_cp,72,D1,-0.1829,0.2320,1.2080,-0.4522,-0.3652,-0.3319,...,-0.1190,-0.1852,-1.0310,-1.3670,-0.3690,-0.5382,0.0359,-0.4764,-1.3810,-0.7300
2,id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.1310,-1.4380,0.2455,...,-0.2261,0.3370,-1.3840,0.8604,-1.9530,-1.0140,0.8662,1.0160,0.4924,-0.1942
3,id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.2020,...,0.1260,0.1570,-0.1784,-1.1200,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
4,id_0027f1083,trt_cp,48,D1,-0.3979,-1.2680,1.9130,0.2057,-0.5864,-0.0166,...,0.4965,0.7578,-0.1580,1.0510,0.5742,1.0900,-0.2962,-0.5313,0.9931,1.8380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,id_3ed6c0c13,trt_cp,72,D2,0.4541,0.2749,0.8474,-0.5490,-0.3524,0.7886,...,-0.1440,0.1395,0.1707,0.3869,-1.3230,-0.4282,0.3413,0.0445,-0.1214,-1.6150
954,id_3ef499e63,trt_cp,72,D2,-1.1400,0.4532,-0.4450,-0.0268,0.1831,-5.0180,...,-1.4650,-2.4840,-4.0950,-0.9907,-4.8580,-3.2600,-1.9690,-4.6570,-1.1170,-3.1420
955,id_3ef85167e,trt_cp,72,D2,0.2417,0.0796,0.4954,0.7824,0.7526,1.4580,...,-0.3425,0.4852,-1.6720,0.4376,-1.2310,-2.2870,-0.2977,0.2256,-3.9210,-0.7036
956,id_3f055749b,trt_cp,48,D1,-0.0521,-0.0140,-0.3340,-0.5663,0.0744,2.6880,...,-0.5446,-0.5403,-0.1951,0.7078,-0.6992,0.5541,-0.3041,0.0200,-0.3771,0.3169


In [32]:

train_features = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)
train_targets_scored = train_targets_scored.loc[train_features.index].reset_index(drop=True)
train_targets_nonscored = train_targets_nonscored.loc[train_features.index].reset_index(drop=True)

In [33]:
train_features.isnull().sum()  # 看各列NaN的数量
# train_features.isnull().any(axis=1).sum()  # 多少行含有NaN
train_features = train_features.dropna(axis=0).reset_index(drop=True)
train_targets_scored = train_targets_scored.loc[train_features.index].reset_index(drop=True)
train_targets_nonscored = train_targets_nonscored.loc[train_features.index].reset_index(drop=True)

In [34]:
feature_cols = [col for col in train_features.columns if col.startswith("g-") or col.startswith("c-")]
X = train_features[feature_cols].values  # shape: [n_samples, n_features]

# 评分标签
y_scored = train_targets_scored.drop("sig_id", axis=1).values  # shape: [n_samples, 206]
# 非评分标签
y_nonscored = train_targets_nonscored.drop("sig_id", axis=1).values  # shape: [n_samples, 402] (数目可根据具体数据变)

In [35]:
X

array([[ 1.062 ,  0.5577, -0.2479, ...,  0.2139,  0.3801,  0.4176],
       [ 0.0743,  0.4087,  0.2991, ...,  0.1241,  0.6077,  0.7371],
       [ 0.628 ,  0.5817,  1.554 , ..., -0.2187, -1.408 ,  0.6931],
       ...,
       [-0.6972, -0.8153,  0.3242, ...,  0.4056, -0.3553, -0.8193],
       [-0.9304, -0.6327,  0.0197, ..., -0.5964,  0.9993, -0.5478],
       [-1.095 , -0.8086, -0.8571, ..., -0.0534,  0.464 ,  1.062 ]])

In [36]:
X.shape

(877, 872)

In [37]:
y_scored

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
qt = QuantileTransformer(n_quantiles=100, random_state=SEED, output_distribution='normal')
X_qt = qt.fit_transform(X)  


pca = PCA(n_components=50, random_state=SEED)
X_pca = pca.fit_transform(X_qt)  # shape [n_samples, 50]

In [39]:
X_final = np.concatenate([X_qt, X_pca], axis=1)  # shape [n_samples, original_dim + 50]

In [40]:
X_final.shape

(877, 922)

In [41]:
X_final

array([[ 1.1425129 ,  0.87262822, -0.38239122, ..., -0.5593062 ,
         0.01486416, -2.25217504],
       [ 0.13870258,  0.64378185,  0.24281529, ..., -0.64170544,
         0.53680972, -1.68745914],
       [ 0.82018079,  0.92736248,  1.31690966, ...,  2.66631344,
        -0.53244963,  1.04591216],
       ...,
       [-1.07940408, -0.89633557,  0.27231685, ...,  0.55847106,
        -1.41403368, -0.2887176 ],
       [-1.39857542, -0.69368274, -0.06841782, ..., -1.42376667,
         0.43874196, -0.54341053],
       [-1.64183315, -0.88132494, -1.21090611, ...,  0.45852192,
        -1.8917069 , -1.71052224]])

# nonscored prediction

In [42]:
class MoADataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return {
            "features": torch.tensor(self.X[idx], dtype=torch.float32),
            "labels": torch.tensor(self.y[idx], dtype=torch.float32)
        }

dataset_ns = MoADataset(X_final, y_nonscored)
loader_ns = DataLoader(dataset_ns, batch_size=256, shuffle=True, num_workers=2)


In [43]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=1024, dropout=0.2):
        super().__init__()
        self.bn0 = nn.BatchNorm1d(input_dim)
        self.dense1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dense2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.dense_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.bn0(x)
        x = F.relu(self.bn1(self.dense1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.dense2(x)))
        x = self.dropout(x)
        x = self.dense_out(x)
        return x


In [44]:
input_dim = X_final.shape[1]
output_dim_ns = y_nonscored.shape[1]  # 402
model_ns = SimpleNN(input_dim, output_dim_ns, hidden_dim=2048, dropout=0.3)
model_ns.cuda()  # 如果GPU可用

criterion = nn.BCEWithLogitsLoss()  
optimizer = torch.optim.Adam(model_ns.parameters(), lr=5e-3, weight_decay=1e-5)
# 你也可以用OneCycleLR或ReduceLROnPlateau之类调度器

epochs = 40
for epoch in range(epochs):
    model_ns.train()
    total_loss = 0
    for batch in loader_ns:
        features = batch["features"].cuda()
        labels = batch["labels"].cuda()
        
        optimizer.zero_grad()
        logits = model_ns(features)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * len(features)
    print(f"Epoch {epoch+1}, Loss = {total_loss / len(dataset_ns):.5f}")


Epoch 1, Loss = 0.22763
Epoch 2, Loss = 0.00654
Epoch 3, Loss = 0.00754
Epoch 4, Loss = 0.00735
Epoch 5, Loss = 0.00669
Epoch 6, Loss = 0.00611
Epoch 7, Loss = 0.00536
Epoch 8, Loss = 0.00470
Epoch 9, Loss = 0.00448
Epoch 10, Loss = 0.00436
Epoch 11, Loss = 0.00403
Epoch 12, Loss = 0.00378
Epoch 13, Loss = 0.00357
Epoch 14, Loss = 0.00353
Epoch 15, Loss = 0.00335
Epoch 16, Loss = 0.00318
Epoch 17, Loss = 0.00309
Epoch 18, Loss = 0.00293
Epoch 19, Loss = 0.00284
Epoch 20, Loss = 0.00270
Epoch 21, Loss = 0.00258
Epoch 22, Loss = 0.00247
Epoch 23, Loss = 0.00236
Epoch 24, Loss = 0.00218
Epoch 25, Loss = 0.00211
Epoch 26, Loss = 0.00200
Epoch 27, Loss = 0.00194
Epoch 28, Loss = 0.00185
Epoch 29, Loss = 0.00171
Epoch 30, Loss = 0.00145
Epoch 31, Loss = 0.00141
Epoch 32, Loss = 0.00115
Epoch 33, Loss = 0.00108
Epoch 34, Loss = 0.00092
Epoch 35, Loss = 0.00075
Epoch 36, Loss = 0.00063
Epoch 37, Loss = 0.00053
Epoch 38, Loss = 0.00045
Epoch 39, Loss = 0.00041
Epoch 40, Loss = 0.00044


In [45]:
model_ns.eval()
with torch.no_grad():
    loader_all = DataLoader(dataset_ns, batch_size=256, shuffle=False)
    preds_ns = []
    for batch in loader_all:
        f = batch["features"].cuda()
        logit = model_ns(f)
        prob = torch.sigmoid(logit)  # 将logits映射到0~1
        preds_ns.append(prob.cpu().numpy())
    meta_ns = np.concatenate(preds_ns, axis=0)  # shape [n_samples, 402]


In [46]:
qt_meta_ns = QuantileTransformer(n_quantiles=100, random_state=SEED, output_distribution='normal')
meta_ns_transformed = qt_meta_ns.fit_transform(meta_ns)

# Scored Prediction

In [47]:
# 拼接成新的特征
X_stage2 = np.concatenate([X_final, meta_ns_transformed], axis=1)
print(X_stage2.shape)  # [n_samples, 原始X_final维数 + 402(或该数目) ]


(877, 1324)


In [48]:
dataset_s2 = MoADataset(X_stage2, y_scored)
loader_s2 = DataLoader(dataset_s2, batch_size=256, shuffle=True, num_workers=2)


In [49]:
output_dim_scored = y_scored.shape[1]  # 206
model_s2 = SimpleNN(X_stage2.shape[1], output_dim_scored, hidden_dim=2048, dropout=0.3).cuda()

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_s2.parameters(), lr=5e-3, weight_decay=1e-5)

epochs = 57
for epoch in range(epochs):
    model_s2.train()
    total_loss = 0
    for batch in loader_s2:
        f = batch["features"].cuda()
        labels = batch["labels"].cuda()
        optimizer.zero_grad()
        logits = model_s2(f)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(f)
    print(f"Stage2 Epoch {epoch+1}, Loss = {total_loss / len(dataset_s2):.5f}")


Stage2 Epoch 1, Loss = 0.23521
Stage2 Epoch 2, Loss = 0.04252
Stage2 Epoch 3, Loss = 0.05069
Stage2 Epoch 4, Loss = 0.05024
Stage2 Epoch 5, Loss = 0.04642
Stage2 Epoch 6, Loss = 0.04098
Stage2 Epoch 7, Loss = 0.03504
Stage2 Epoch 8, Loss = 0.03056
Stage2 Epoch 9, Loss = 0.02700
Stage2 Epoch 10, Loss = 0.02383
Stage2 Epoch 11, Loss = 0.02203
Stage2 Epoch 12, Loss = 0.02032
Stage2 Epoch 13, Loss = 0.01896
Stage2 Epoch 14, Loss = 0.01832
Stage2 Epoch 15, Loss = 0.01773
Stage2 Epoch 16, Loss = 0.01755
Stage2 Epoch 17, Loss = 0.01695
Stage2 Epoch 18, Loss = 0.01637
Stage2 Epoch 19, Loss = 0.01580
Stage2 Epoch 20, Loss = 0.01534
Stage2 Epoch 21, Loss = 0.01459
Stage2 Epoch 22, Loss = 0.01382
Stage2 Epoch 23, Loss = 0.01315
Stage2 Epoch 24, Loss = 0.01254
Stage2 Epoch 25, Loss = 0.01168
Stage2 Epoch 26, Loss = 0.01080
Stage2 Epoch 27, Loss = 0.01007
Stage2 Epoch 28, Loss = 0.00952
Stage2 Epoch 29, Loss = 0.00867
Stage2 Epoch 30, Loss = 0.00922
Stage2 Epoch 31, Loss = 0.00792
Stage2 Epoch 32, 

In [50]:
model_s2.eval()
with torch.no_grad():
    loader_all = DataLoader(dataset_s2, batch_size=256, shuffle=False)
    preds_s2 = []
    for batch in loader_all:
        f = batch["features"].cuda()
        logit = model_s2(f)
        prob = torch.sigmoid(logit)
        preds_s2.append(prob.cpu().numpy())
    meta_scored = np.concatenate(preds_s2, axis=0)

# 可以进行裁剪:
meta_scored = np.clip(meta_scored, 0.0005, 0.9995)

# 再做变换
qt_meta_scored = QuantileTransformer(n_quantiles=100, random_state=SEED, output_distribution='normal')
meta_scored_transformed = qt_meta_scored.fit_transform(meta_scored)


# Stage 3

In [51]:
X_stage3 = meta_scored_transformed  # shape [n_samples, 206]

dataset_s3 = MoADataset(X_stage3, y_scored)
loader_s3 = DataLoader(dataset_s3, batch_size=256, shuffle=True)

model_s3 = SimpleNN(X_stage3.shape[1], output_dim_scored, hidden_dim=1024, dropout=0.3).cuda()
optimizer = torch.optim.Adam(model_s3.parameters(), lr=5e-3, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()

epochs = 40
for epoch in range(epochs):
    model_s3.train()
    total_loss = 0
    for batch in loader_s3:
        f = batch["features"].cuda()
        labels = batch["labels"].cuda()
        optimizer.zero_grad()
        logits = model_s3(f)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()*len(f)
    print(f"Stage3 Epoch {epoch+1}, Loss = {total_loss/len(dataset_s3):.5f}")

Stage3 Epoch 1, Loss = 0.27112
Stage3 Epoch 2, Loss = 0.02408
Stage3 Epoch 3, Loss = 0.02566
Stage3 Epoch 4, Loss = 0.02348
Stage3 Epoch 5, Loss = 0.02106
Stage3 Epoch 6, Loss = 0.01819
Stage3 Epoch 7, Loss = 0.01567
Stage3 Epoch 8, Loss = 0.01371
Stage3 Epoch 9, Loss = 0.01225
Stage3 Epoch 10, Loss = 0.01134
Stage3 Epoch 11, Loss = 0.01015
Stage3 Epoch 12, Loss = 0.00940
Stage3 Epoch 13, Loss = 0.00847
Stage3 Epoch 14, Loss = 0.00775
Stage3 Epoch 15, Loss = 0.00704
Stage3 Epoch 16, Loss = 0.00649
Stage3 Epoch 17, Loss = 0.00584
Stage3 Epoch 18, Loss = 0.00534
Stage3 Epoch 19, Loss = 0.00488
Stage3 Epoch 20, Loss = 0.00448
Stage3 Epoch 21, Loss = 0.00408
Stage3 Epoch 22, Loss = 0.00365
Stage3 Epoch 23, Loss = 0.00328
Stage3 Epoch 24, Loss = 0.00304
Stage3 Epoch 25, Loss = 0.00269
Stage3 Epoch 26, Loss = 0.00240
Stage3 Epoch 27, Loss = 0.00223
Stage3 Epoch 28, Loss = 0.00201
Stage3 Epoch 29, Loss = 0.00187
Stage3 Epoch 30, Loss = 0.00166
Stage3 Epoch 31, Loss = 0.00159
Stage3 Epoch 32, 

In [52]:
test_features.isnull().sum()  # 查看每个列 NaN 的数量
# 或者
test_features.isnull().any(axis=1).sum()  # 有多少行包含NaN


1

In [53]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer.fit(X)  # X是训练特征(无NaN或你准备保留的NaN)
X_imputed = imputer.transform(X)

# 再做后续QuantileTransformer/PCA
qt.fit(X_imputed)
X_qt = qt.transform(X_imputed)
pca.fit(X_qt)
X_pca = pca.transform(X_qt)

# 到了测试集：
X_test = test_features[feature_cols].values
X_test_imputed = imputer.transform(X_test)   # 先填充
X_test_qt = qt.transform(X_test_imputed)     # 再做QuantileTransformer
X_test_pca = pca.transform(X_test_qt)        # 再PCA
X_test_final = np.concatenate([X_test_qt, X_test_pca], axis=1)


# 2) 第一阶段非评分预测:
with torch.no_grad():
    ds_test_ns = MoADataset(X_test_final, np.zeros((len(X_test_final), output_dim_ns)))
    loader_test_ns = DataLoader(ds_test_ns, batch_size=256, shuffle=False)
    preds_ns_test = []
    for batch in loader_test_ns:
        f = batch["features"].cuda()
        logit = model_ns(f)
        prob = torch.sigmoid(logit)
        preds_ns_test.append(prob.cpu().numpy())
    meta_ns_test = np.concatenate(preds_ns_test, axis=0)

meta_ns_test_transformed = qt_meta_ns.transform(meta_ns_test)

# 3) 第二阶段评分预测:
X_test_s2 = np.concatenate([X_test_final, meta_ns_test_transformed], axis=1)
with torch.no_grad():
    ds_test_s2 = MoADataset(X_test_s2, np.zeros((len(X_test_s2), output_dim_scored)))
    loader_test_s2 = DataLoader(ds_test_s2, batch_size=256, shuffle=False)
    preds_s2_test = []
    for batch in loader_test_s2:
        f = batch["features"].cuda()
        logit = model_s2(f)
        prob = torch.sigmoid(logit)
        preds_s2_test.append(prob.cpu().numpy())
    meta_scored_test = np.concatenate(preds_s2_test, axis=0)
meta_scored_test = np.clip(meta_scored_test, 0.0005, 0.9995)
meta_scored_test_transformed = qt_meta_scored.transform(meta_scored_test)

# 4) 第三阶段最终预测:
with torch.no_grad():
    ds_test_s3 = MoADataset(meta_scored_test_transformed, np.zeros((len(X_test_s2), output_dim_scored)))
    loader_test_s3 = DataLoader(ds_test_s3, batch_size=256, shuffle=False)
    preds_s3_test = []
    for batch in loader_test_s3:
        f = batch["features"].cuda()
        logit = model_s3(f)
        prob = torch.sigmoid(logit)
        preds_s3_test.append(prob.cpu().numpy())
    final_preds = np.concatenate(preds_s3_test, axis=0)  # shape: [3982, 206] (测试集行数, MoA数量)


In [54]:
final_preds

array([[9.90546141e-07, 6.21463778e-06, 1.47959679e-06, ...,
        2.24587825e-06, 9.27587553e-06, 2.78992366e-06],
       [3.25582550e-05, 4.75103967e-04, 2.83110239e-05, ...,
        1.28190062e-04, 1.10012123e-04, 1.01850164e-04],
       [2.96579492e-05, 1.09634618e-03, 4.31172957e-05, ...,
        1.25993931e-04, 3.74426018e-04, 1.32093614e-04],
       ...,
       [6.72205761e-06, 4.26802981e-05, 7.29385374e-06, ...,
        1.62332781e-05, 7.95672458e-05, 2.95783364e-04],
       [1.47610976e-07, 3.30699208e-07, 1.80742944e-07, ...,
        7.73932788e-06, 5.55860561e-06, 2.66071470e-06],
       [3.71792958e-05, 1.86386483e-03, 4.51875203e-05, ...,
        6.17869155e-05, 4.41343553e-04, 2.48436088e-04]], dtype=float32)