In [228]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import optuna
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [229]:
dataset_df = pd.read_csv('train.csv')
dataset_df['EJ'] = dataset_df['EJ'].replace({'A':0, 'B':1})
dataset_df = dataset_df.iloc[:,1:]

In [230]:
# 计算特征的均值、方差、标准差
mean_values = dataset_df.mean()
var_values = dataset_df.var()
std_values = dataset_df.std()
# 填充缺失值为特征的均值
dataset_df = dataset_df.fillna(mean_values)
max_values = dataset_df.max()
min_values = dataset_df.min()

In [231]:
# 分割特征和类别
X = dataset_df.iloc[:, :-1]  # 特征
y = dataset_df.iloc[:, -1]   # 类别
# X = X.dropna()  # 删除包含缺失值的行
y = y[X.index]  # 保持与特征对应的类别

# 对非数值类型特征进行独热编码
X_encoded = pd.get_dummies(X)

# 初始化特征选择器
k = 50  # 选择前k个重要特征
selector = SelectKBest(score_func=mutual_info_classif, k=k)

# 特征选择
X_selected = selector.fit_transform(X_encoded, y)

# 获取选择的特征索引
selected_feature_indices = selector.get_support(indices=True)

# 获取选择的特征名称
selected_features = X_encoded.columns[selected_feature_indices]
# selected_features = selected_features[:-2].append(pd.Index(['EJ']))
# 输出选择的特征
print(selected_features)

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR',
       'BZ', 'CC', 'CD ', 'CF', 'CR', 'CS', 'CW ', 'DA', 'DE', 'DF', 'DH',
       'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB', 'EE', 'EG', 'EH', 'EJ', 'EL',
       'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 'FS', 'GB', 'GE', 'GF',
       'GH', 'GI', 'GL'],
      dtype='object')


In [232]:
X_train, y_train = dataset_df.loc[:, selected_features], dataset_df.iloc[:, -1]
var_values = X_train.var()
std_values = X_train.std()

In [233]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X = np.array(X_train, dtype=np.float32)
y = np.array(y_train, dtype=np.float32).reshape(-1, 1)

# 转换为 PyTorch 的 Tensor 数据类型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [234]:
# 定义神经网络模型
seed = 2
torch.manual_seed(seed)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.sigmoid(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        y = x
        x = self.sigmoid(self.fc4(x))
        return x, y

# 初始化神经网络
net_model = NeuralNetwork()

# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(net_model.parameters(), lr=0.001)

# 将数据转换为 DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 训练神经网络
num_epochs = 200
for epoch in range(num_epochs):
    net_model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs, layer = net_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# 测试神经网络
net_model.eval()
with torch.no_grad():
    y_pred, layer = net_model(X_test)
    y_pred_class = (y_pred >= 0.5).float()
    accuracy = (y_pred_class == y_test).float().mean()
    print(f"Test Accuracy: {accuracy.item():.4f}")

Epoch [1/200], Loss: 0.5722
Epoch [2/200], Loss: 0.4687
Epoch [3/200], Loss: 0.4467
Epoch [4/200], Loss: 0.4591
Epoch [5/200], Loss: 0.4532
Epoch [6/200], Loss: 0.4594
Epoch [7/200], Loss: 0.4555
Epoch [8/200], Loss: 0.4602
Epoch [9/200], Loss: 0.4380
Epoch [10/200], Loss: 0.4310
Epoch [11/200], Loss: 0.4202
Epoch [12/200], Loss: 0.4017
Epoch [13/200], Loss: 0.3797
Epoch [14/200], Loss: 0.3451
Epoch [15/200], Loss: 0.3095
Epoch [16/200], Loss: 0.2811
Epoch [17/200], Loss: 0.2587
Epoch [18/200], Loss: 0.2457
Epoch [19/200], Loss: 0.2319
Epoch [20/200], Loss: 0.2266
Epoch [21/200], Loss: 0.2174
Epoch [22/200], Loss: 0.2135
Epoch [23/200], Loss: 0.2073
Epoch [24/200], Loss: 0.2101
Epoch [25/200], Loss: 0.1963
Epoch [26/200], Loss: 0.2043
Epoch [27/200], Loss: 0.1980
Epoch [28/200], Loss: 0.1890
Epoch [29/200], Loss: 0.1893
Epoch [30/200], Loss: 0.1891
Epoch [31/200], Loss: 0.1855
Epoch [32/200], Loss: 0.1803
Epoch [33/200], Loss: 0.1869
Epoch [34/200], Loss: 0.1753
Epoch [35/200], Loss: 0

In [235]:
params_lgb = {
    'objective': 'binary',  # 二分类目标函数
    'metric': 'binary_logloss',  # 二分类损失函数
    'boosting_type': 'gbdt',  # 提升类型，可选参数：'gbdt', 'dart', 'goss'
    'num_leaves': 28,  # 叶子节点数量
    'learning_rate': 0.1,  # 学习率
    'feature_fraction': 0.85,  # 特征采样比例
    'bagging_fraction': 0.7,  # 数据采样比例
    'bagging_freq': 4,  # 数据采样频率
    'random_state': 42,
    'min_child_samples': 8,
    'verbose':-1, 
    'random_state':12
}
params_cat = {
    'iterations': 626,
    'learning_rate': 0.01,
    'depth': 4,
    'l2_leaf_reg': 0.27,
    'border_count': 66,
    'random_state': 12,
    'verbose': False
}
params_xgb = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.0356,
    'max_depth': 7,
    'subsample': 0.95,
    'colsample_bytree': 0.92,
    'gamma': 9e-08,
    'random_state': 12
}
params_RF = {
    'n_estimators': 300,
    'max_depth': 6,
    'min_samples_split': 2,
    'min_samples_leaf': 3,
    'max_features': 'auto',
    'random_state': 12
}
params_net = {
    'hidden_layer_sizes': (200,50), 
    'activation': 'relu', 
    'solver': 'adam', 
    'alpha': 0.004524225053160557, 
    'batch_size': 'auto', 
    'learning_rate': 'constant', 
    'learning_rate_init': 0.0032695885785495216, 
    'power_t': 0.41229208337868917, 
    'max_iter': 391, 
    'shuffle': True, 
    'random_state': None, 
    'tol': 0.0002505082147304683, 
    'verbose': False, 
    'warm_start': False, 
    'momentum': 0.8862529058691623, 
    'nesterovs_momentum': True, 
    'early_stopping': True, 
    'validation_fraction': 0.1250369526043429,      
    'beta_1': 0.8337047263985674, 
    'beta_2': 0.9945669606589083, 
    'epsilon': 1.479694727954582e-09, 
    'n_iter_no_change': 19, 
    'max_fun': 16800
}

In [236]:
X_train, y_train = dataset_df.loc[:, selected_features], dataset_df.iloc[:, -1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train = np.array(X_train, dtype=np.float32)

# 转换为 PyTorch 的 Tensor 数据类型
X_train = torch.tensor(X_train, dtype=torch.float32)

In [237]:
_, X_train = net_model(X_train)
X_train = pd.DataFrame(X_train.detach().numpy())

In [238]:
%%time

Models = []
best_acu = 0
best_threshold = 0
# 定义交叉验证的折数
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# 定义模型列表
models = [
    lgb.LGBMClassifier(**params_lgb),
    #cb.CatBoostClassifier(**params_cat),
    xgb.XGBClassifier(**params_xgb),
    RandomForestClassifier(**params_RF),
    # MLPClassifier(**params_net)
    # SVC(probability=True, random_state=42)  # 设置 probability=True 以输出概率
]

for train_index, valid_index in kf.split(X_train):
    
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    # 训练模型
    for model in models:
        model.fit(X_tr, y_tr)

    # 预测结果的概率值
    y_pred_proba_list = []
    for model in models:
#         if type(model).__name__ == 'MLPClassifier':
#             scaler = StandardScaler()
#             features_to_scale = X_val.columns
#             X_val[features_to_scale] = scaler.fit_transform(X_val[features_to_scale])
        y_pred_proba = model.predict_proba(X_val)[:, 1]  # 取第一列的概率值（正类的概率）
        y_pred_proba_list.append(y_pred_proba)
        
#     for i in range(len(y_pred_proba_list)):
#         thresholds = np.linspace(0, 1, 100)
#         acu = 0
#         thred = 0
#         for threshold in thresholds:
#             y_pred = np.where(y_pred_proba_list[i] >= threshold, 1, 0)
#             accuracy = np.mean(y_pred == y_val)
#             if accuracy > acu:
#                 acu = accuracy
#                 thred = threshold
#                 print('acu=', acu)
#         y_pred_proba_list[i] = [map_values(j, thred=thred) for j in y_pred_proba_list[i]]
            

    # 模型融合（平均概率值）
    y_pred_ensemble_proba = np.mean(y_pred_proba_list, axis=0)
    
    #选取最佳阈值
    thresholds = np.linspace(0, 1, 100)
    temp_acu = 0
    temp_thred = 0
    for threshold in thresholds:
        # 计算最佳阈值
        y_pred_ensemble = np.where(y_pred_ensemble_proba >= threshold, 1, 0)
        accuracy = np.mean(y_pred_ensemble == y_val)
        if accuracy >= temp_acu:
            temp_acu = accuracy
            temp_thred = threshold
            temp_y_pred = y_pred_ensemble
    print(classification_report(y_val, temp_y_pred))
    
    # 保存最优模型和最优阈值
    if temp_acu >= best_acu:
        Models = models
        best_acu = temp_acu
        best_threshold = temp_thred
print(best_acu)
print(best_threshold)

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       101
           1       0.74      0.61      0.67        23

    accuracy                           0.89       124
   macro avg       0.83      0.78      0.80       124
weighted avg       0.88      0.89      0.88       124

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        98
           1       1.00      0.96      0.98        26

    accuracy                           0.99       124
   macro avg       0.99      0.98      0.99       124
weighted avg       0.99      0.99      0.99       124

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       107
           1       1.00      1.00      1.00        16

    accuracy                           1.00       123
   macro avg       1.00      1.00      1.00       123
weighted avg       1.00      1.00      1.00       123

              preci

In [249]:
test_df = pd.read_csv('test3.csv',index_col="Id")
# 填充缺失值为特征的均值
for i in range(len(test_df['EJ'])):
    if test_df['EJ'][i] != 'A' and test_df['EJ'][i] != 'B':
        test_df['EJ'][i] = 'A'
test_df['EJ'] = test_df['EJ'].replace({'A':0, 'B':1})
test_df = test_df.fillna(mean_values)
test_df.replace([np.inf], [np.nan], inplace=True)
test_df = test_df.fillna(max_values)
test_df.replace([-np.inf], [np.nan], inplace=True)
test_df = test_df.fillna(min_values)
test_df = test_df.loc[:, selected_features]

In [250]:
X_train, y_train = dataset_df.loc[:, selected_features], dataset_df.iloc[:, -1]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
test_df = scaler.transform(test_df)
# test_df = scaler.transform(test_df)
# test_df = (test_df - X_train.mean()) / X_train.var() + X_train.mean()

In [251]:
test_df = np.array(test_df, dtype=np.float32)

# 转换为 PyTorch 的 Tensor 数据类型
test_df = torch.tensor(test_df, dtype=torch.float32)
net_model.eval()
_, test_df = net_model(test_df)
test_df = pd.DataFrame(test_df.detach().numpy())

In [252]:
y_pred = []
for model in Models:
    y_pred_proba = model.predict_proba(test_df)[:, 1]  # 取第一列的概率值（正类的概率）
    y_pred.append(y_pred_proba)

# 模型融合（平均概率值）
y_pred = np.mean(y_pred, axis=0)
for i in range(len(y_pred)):
    print(y_pred[i], y_train[i])

0.9909859436075742 1
0.011915291943998181 0
0.019583076357815414 0
0.011915485125482501 0
0.9192141076579627 1
0.00901717936197727 0
0.013694717241554384 0
0.009018110652202241 0
0.009297493872659962 0
0.009017183957869754 0
0.8894745471496269 1
0.013960055875697958 0
0.013694717241554384 0
0.8756462064233949 1
0.017592845718902714 0
0.01679364197943872 0
0.023941209159791598 0
0.02590245371724774 0
0.01985395778385414 0
0.00929746211273547 0
0.013694717241554384 0
0.01443795474963539 0
0.016714005842316335 0
0.011915455646412249 0
0.014594862958506141 0
0.009017179659356288 0
0.00901718393724817 0
0.011915469516618843 0
0.013269210785987718 0
0.013694717241554384 0
0.013694717241554384 0
0.9868605575935661 1
0.8833726952977751 1
0.011771212946326602 0
0.00901718393724817 0
0.05464695504892151 0
0.9906611101815955 1
0.009017183950074099 0
0.06213674677952521 0
0.013694717241554384 0
0.00901718393724817 0
0.9909859635137422 1
0.009319068859152521 0
0.014486499949565154 0
0.0090171839372

In [246]:
def map_values(value,thred=best_threshold):
    """根据阈值映射"""
    if value <= thred:
        return 0.5 * (value / thred)
    else:
        return 0.5 + 0.5 * ((value - thred) / 0.65)
y_pred = np.vectorize(map_values)(y_pred)
pred = np.stack((1-y_pred, y_pred), axis=1)
print(pred)

[[0.50456333 0.49543667]
 [0.9942084  0.0057916 ]
 [0.99048137 0.00951863]
 ...
 [0.99536103 0.00463897]
 [0.99334349 0.00665651]
 [0.99561707 0.00438293]]


In [253]:
test = pd.read_csv('test3.csv')
submission = pd.DataFrame(test["Id"], columns = ["Id"]);
submission["class_0"] = 1 - y_pred
submission["class_1"] = y_pred

submission.to_csv('submission.csv', index = None);
submission_df = pd.read_csv('submission.csv')

In [254]:
submission_df

Unnamed: 0,Id,class_0,class_1
0,000ff2bfdfe9,0.009014,0.990986
1,007255e47698,0.988085,0.011915
2,013f2bd269f5,0.980417,0.019583
3,043ac50845d5,0.988085,0.011915
4,044fb8a146ec,0.080786,0.919214
...,...,...,...
612,fd3dafe738fd,0.985458,0.014542
613,fd895603f071,0.953510,0.046490
614,fd8ef6377f76,0.990456,0.009544
615,fe1942975e40,0.986305,0.013695
