In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import optuna
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [20]:
dataset_df = pd.read_csv('train.csv')
dataset_df['EJ'] = dataset_df['EJ'].replace({'A':0, 'B':1})
dataset_df = dataset_df.iloc[:,1:]

In [38]:
# 计算特征的均值
mean_values = dataset_df.mean()
# 填充缺失值为特征的均值
dataset_df = dataset_df.fillna(mean_values)
max_values = dataset_df.max()
min_values = dataset_df.min()

In [21]:
feature_columns = dataset_df.columns[:-1]  # 假设最后一列为目标变量，不需要处理

# 针对每个特征列，将无穷大值替换为前 97% 大的值
for column in feature_columns:
    # 计算前 97% 大的值
    percentile = 0.98
    threshold = dataset_df[column].quantile(percentile)
    
    # 将无穷大值替换为前 97% 大的值
    dataset_df[column] = dataset_df[column].replace(np.inf, threshold)

In [22]:
# 计算特征的均值
mean_values = dataset_df.mean()
# 填充缺失值为特征的均值
dataset_df = dataset_df.fillna(mean_values)

In [23]:
# 分割特征和类别
X = dataset_df.iloc[:, :-1]  # 特征
y = dataset_df.iloc[:, -1]   # 类别
X = X.dropna()  # 删除包含缺失值的行
y = y[X.index]  # 保持与特征对应的类别

# 对非数值类型特征进行独热编码
X_encoded = pd.get_dummies(X)

# 初始化特征选择器
k = 50  # 选择前k个重要特征
selector = SelectKBest(score_func=mutual_info_classif, k=k)

# 特征选择
X_selected = selector.fit_transform(X_encoded, y)

# 获取选择的特征索引
selected_feature_indices = selector.get_support(indices=True)


# 获取选择的特征名称
selected_features = X_encoded.columns[selected_feature_indices]
# selected_features = selected_features[:-2].append(pd.Index(['EJ']))
# 输出选择的特征
print(selected_features)

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR',
       'BZ', 'CC', 'CD ', 'CF', 'CL', 'CR', 'CS', 'CW ', 'DA', 'DE', 'DF',
       'DH', 'DI', 'DN', 'DU', 'DV', 'DY', 'EB', 'EE', 'EG', 'EH', 'EJ', 'EL',
       'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 'FS', 'GB', 'GE', 'GF',
       'GH', 'GI', 'GL'],
      dtype='object')


In [24]:
X_train, y_train = dataset_df.loc[:, selected_features], dataset_df.iloc[:, -1]

In [31]:
params_net = {
    'hidden_layer_sizes': (200,50), 
    'activation': 'relu', 
    'solver': 'adam', 
    'alpha': 0.004524225053160557, 
    'batch_size': 'auto', 
    'learning_rate': 'constant', 
    'learning_rate_init': 0.0032695885785495216, 
    'power_t': 0.41229208337868917, 
    'max_iter': 391, 
    'shuffle': True, 
    'random_state': None, 
    'tol': 0.0002505082147304683, 
    'verbose': False, 
    'warm_start': False, 
    'momentum': 0.8862529058691623, 
    'nesterovs_momentum': True, 
    'early_stopping': True, 
    'validation_fraction': 0.1250369526043429, 
    'beta_1': 0.8337047263985674, 
    'beta_2': 0.9945669606589083, 
    'epsilon': 1.479694727954582e-09, 
    'n_iter_no_change': 19, 
    'max_fun': 16800
}

In [33]:
%%time

Models = []
best_acu = 0
best_threshold = 0
# 定义交叉验证的折数
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# 定义模型列表
models = [
    # lgb.LGBMClassifier(**params_lgb),
    # cb.CatBoostClassifier(**params_cat),
    # xgb.XGBClassifier(**params_xgb),
    # RandomForestClassifier(**params_RF),
    MLPClassifier(**params_net)
    # SVC(probability=True, random_state=42)  # 设置 probability=True 以输出概率
]

for train_index, valid_index in kf.split(X_train):
    
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    # 创建 StandardScaler 对象
    scaler = StandardScaler()
    # 对 DataFrame 进行标准化
    X_tr = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
    X_val = pd.DataFrame(scaler.fit_transform(X_val), columns=X_tr.columns)
    
    # 训练模型
    for model in models:
        model.fit(X_tr, y_tr)

    # 预测结果的概率值
    y_pred_proba_list = []
    for model in models:
#         if type(model).__name__ == 'MLPClassifier':
#             scaler = StandardScaler()
#             features_to_scale = X_val.columns
#             X_val[features_to_scale] = scaler.fit_transform(X_val[features_to_scale])
        
        
        y_pred_proba = model.predict_proba(X_val)[:, 1]  # 取第一列的概率值（正类的概率）
        y_pred_proba_list.append(y_pred_proba)
        
#     for i in range(len(y_pred_proba_list)):
#         thresholds = np.linspace(0, 1, 100)
#         acu = 0
#         thred = 0
#         for threshold in thresholds:
#             y_pred = np.where(y_pred_proba_list[i] >= threshold, 1, 0)
#             accuracy = np.mean(y_pred == y_val)
#             if accuracy > acu:
#                 acu = accuracy
#                 thred = threshold
#                 print('acu=', acu)
#         y_pred_proba_list[i] = [map_values(j, thred=thred) for j in y_pred_proba_list[i]]
            

    # 模型融合（平均概率值）
    y_pred_ensemble_proba = np.mean(y_pred_proba_list, axis=0)
    
    #选取最佳阈值
    thresholds = np.linspace(0, 1, 100)
    temp_acu = 0
    temp_thred = 0
    for threshold in thresholds:
        # 计算最佳阈值
        y_pred_ensemble = np.where(y_pred_ensemble_proba >= threshold, 1, 0)
        accuracy = np.mean(y_pred_ensemble == y_val)
        if accuracy >= temp_acu:
            temp_acu = accuracy
            temp_thred = threshold
            temp_y_pred = y_pred_ensemble
    print(classification_report(y_val, temp_y_pred))
    print(roc_auc_score(y_val, y_pred_ensemble_proba))
    
    # 保存最优模型和最优阈值
    if temp_acu >= best_acu:
        Models = models
        best_acu = temp_acu
        best_threshold = temp_thred
print(best_acu)
print(best_threshold)



              precision    recall  f1-score   support

           0       0.94      0.97      0.96       101
           1       0.85      0.74      0.79        23

    accuracy                           0.93       124
   macro avg       0.90      0.85      0.87       124
weighted avg       0.93      0.93      0.93       124

0.9233749461902712




              precision    recall  f1-score   support

           0       0.92      0.97      0.95        98
           1       0.86      0.69      0.77        26

    accuracy                           0.91       124
   macro avg       0.89      0.83      0.86       124
weighted avg       0.91      0.91      0.91       124

0.9631083202511774




              precision    recall  f1-score   support

           0       0.94      0.96      0.95       107
           1       0.71      0.62      0.67        16

    accuracy                           0.92       123
   macro avg       0.83      0.79      0.81       123
weighted avg       0.91      0.92      0.92       123

0.898948598130841




              precision    recall  f1-score   support

           0       0.88      0.98      0.93       101
           1       0.80      0.36      0.50        22

    accuracy                           0.87       123
   macro avg       0.84      0.67      0.71       123
weighted avg       0.86      0.87      0.85       123

0.8654365436543654




              precision    recall  f1-score   support

           0       0.94      0.96      0.95       102
           1       0.79      0.71      0.75        21

    accuracy                           0.92       123
   macro avg       0.87      0.84      0.85       123
weighted avg       0.92      0.92      0.92       123

0.9215686274509803
0.9274193548387096
0.6565656565656566
CPU times: total: 266 ms
Wall time: 1.19 s




In [40]:
test_df = pd.read_csv('test.csv',index_col="Id")
# 填充缺失值为特征的均值
for i in range(len(test_df['EJ'])):
    if test_df['EJ'][i] != 'A' and test_df['EJ'][i] != 'B':
        test_df['EJ'][i] = 'A'
test_df['EJ'] = test_df['EJ'].replace({'A':0, 'B':1})
test_df = test_df.fillna(mean_values)
test_df.replace([np.inf], [np.nan], inplace=True)
test_df = test_df.fillna(max_values)
test_df.replace([-np.inf], [np.nan], inplace=True)
test_df = test_df.fillna(min_values)
test_df = test_df.loc[:, selected_features]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['EJ'][i] = 'A'


In [41]:
y_pred = []
for model in Models:
    y_pred_proba = model.predict_proba(test_df)[:, 1]  # 取第一列的概率值（正类的概率）
    y_pred.append(y_pred_proba)

# 模型融合（平均概率值）
y_pred = np.mean(y_pred, axis=0)
print(y_pred)

[0.00000000e+00 8.45284998e-39 1.92091643e-01 1.92091643e-01
 1.92091643e-01]


In [42]:
def map_values(value,thred=best_threshold):
    """根据阈值映射"""
    if value <= thred:
        return 0.5 * (value / thred)
    else:
        return 0.5 + 0.5 * ((value - thred) / 0.65)
y_pred = np.vectorize(map_values)(y_pred)
pred = np.stack((1-y_pred, y_pred), axis=1)
print(pred)

[[1.00000000e+00 0.00000000e+00]
 [1.00000000e+00 6.43717037e-39]
 [8.53714826e-01 1.46285174e-01]
 [8.53714826e-01 1.46285174e-01]
 [8.53714826e-01 1.46285174e-01]]


In [46]:
test = pd.read_csv('test.csv')
submission = pd.DataFrame(test["Id"], columns = ["Id"]);
submission["class_0"] = 1 - y_pred
submission["class_1"] = y_pred

submission.to_csv('submission.csv', index = None);
submission_df = pd.read_csv('submission.csv')

In [47]:
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,1.0,0.0
1,010ebe33f668,1.0,6.43717e-39
2,02fa521e1838,0.853715,0.1462852
3,040e15f562a2,0.853715,0.1462852
4,046e85c7cc7f,0.853715,0.1462852
