In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import lightgbm as lgb
import catboost as cb
import xgboost as xgb

In [2]:
dataset_df = pd.read_csv('train.csv')
dataset_df['EJ'] = dataset_df['EJ'].replace({'A':0, 'B':1})
dataset_df = dataset_df.iloc[:,1:]

In [3]:
feature_columns = dataset_df.columns[:-1]  # 假设最后一列为目标变量，不需要处理

# 针对每个特征列，将无穷大值替换为前 97% 大的值
for column in feature_columns:
    # 计算前 97% 大的值
    percentile = 0.98
    threshold = dataset_df[column].quantile(percentile)
    
    # 将无穷大值替换为前 97% 大的值
    dataset_df[column] = dataset_df[column].replace(np.inf, threshold)

In [4]:
# 计算特征的均值
mean_values = dataset_df.mean()
# 填充缺失值为特征的均值
dataset_df = dataset_df.fillna(mean_values)

In [5]:
# 分割特征和类别
X = dataset_df.iloc[:, :-1]  # 特征
y = dataset_df.iloc[:, -1]   # 类别
X = X.dropna()  # 删除包含缺失值的行
y = y[X.index]  # 保持与特征对应的类别

# 对非数值类型特征进行独热编码
X_encoded = pd.get_dummies(X)

# 初始化特征选择器
k = 50  # 选择前k个重要特征
selector = SelectKBest(score_func=mutual_info_classif, k=k)

# 特征选择
X_selected = selector.fit_transform(X_encoded, y)

# 获取选择的特征索引
selected_feature_indices = selector.get_support(indices=True)

# 获取选择的特征名称
selected_features = X_encoded.columns[selected_feature_indices]
# selected_features = selected_features[:-2].append(pd.Index(['EJ']))
# 输出选择的特征
print(selected_features)

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'BC', 'BD ', 'BN', 'BP', 'BQ',
       'BR', 'CC', 'CD ', 'CF', 'CR', 'CS', 'CW ', 'DA', 'DE', 'DF', 'DH',
       'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB', 'EE', 'EG', 'EH', 'EJ', 'EL',
       'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 'FS', 'GB', 'GE', 'GF',
       'GH', 'GI', 'GL'],
      dtype='object')


In [6]:
X_train, y_train = dataset_df.loc[:, selected_features], dataset_df.iloc[:, -1]

In [22]:
Models = []
best_acu = 0
best_threshold = 0
# 定义交叉验证的折数
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# 定义模型列表
models = [
    lgb.LGBMClassifier(verbose=-1, random_state=42),
    cb.CatBoostClassifier(logging_level='Silent', random_seed=42),
    xgb.XGBClassifier(verbosity=0, seed=42),
    RandomForestClassifier(random_state=42),
    SVC(probability=True, random_state=42)  # 设置 probability=True 以输出概率
]

for train_index, valid_index in kf.split(X_train):
    
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    # 训练模型
    for model in models:
        model.fit(X_tr, y_tr)

    # 预测结果的概率值
    y_pred_proba_list = []
    for model in models:
        y_pred_proba = model.predict_proba(X_val)[:, 1]  # 取第一列的概率值（正类的概率）
        y_pred_proba_list.append(y_pred_proba)

    # 模型融合（平均概率值）
    y_pred_ensemble_proba = np.mean(y_pred_proba_list, axis=0)
    
    #选取最佳阈值
    thresholds = np.linspace(0, 1, 100)
    temp_acu = 0
    temp_thred = 0
    for threshold in thresholds:
        # 计算最佳阈值
        y_pred_ensemble = np.where(y_pred_ensemble_proba >= threshold, 1, 0)
        accuracy = np.mean(y_pred_ensemble == y_val)
        if accuracy >= temp_acu:
            temp_acu = accuracy
            temp_thred = threshold
            temp_y_pred = y_pred_ensemble
    print(classification_report(y_val, temp_y_pred))
    
    # 保存最优模型和最优阈值
    if temp_acu >= best_acu:
        Models = models
        best_acu = temp_acu
        best_threshold = temp_thred
print(best_acu)
print(best_threshold)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       101
           1       0.95      0.91      0.93        23

    accuracy                           0.98       124
   macro avg       0.97      0.95      0.96       124
weighted avg       0.98      0.98      0.98       124

              precision    recall  f1-score   support

           0       0.97      0.98      0.97        98
           1       0.92      0.88      0.90        26

    accuracy                           0.96       124
   macro avg       0.94      0.93      0.94       124
weighted avg       0.96      0.96      0.96       124

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       107
           1       0.82      0.88      0.85        16

    accuracy                           0.96       123
   macro avg       0.90      0.92      0.91       123
weighted avg       0.96      0.96      0.96       123

              preci

In [23]:
test_df = pd.read_csv('test.csv')
test_df = test_df.loc[:, selected_features]
test_df['EJ'] = test_df['EJ'].replace({'A':0, 'B':1})

In [24]:
y_pred = []
for model in Models:
    y_pred_proba = model.predict_proba(test_df)[:, 1]  # 取第一列的概率值（正类的概率）
    y_pred.append(y_pred_proba)

# 模型融合（平均概率值）
y_pred = np.mean(y_pred, axis=0)
print(y_pred)

[0.39831778 0.39831778 0.39831778 0.39831778 0.39831778]


In [25]:
def map_values(value,thred=best_threshold):
    """根据阈值映射"""
    if value <= thred:
        return 0.5 * (value / thred)
    else:
        return 0.5 + 0.5 * ((value - thred) / 0.65)
y_pred = np.vectorize(map_values)(y_pred)
pred = np.stack((1-y_pred, y_pred), axis=1)
print(pred)

[[0.42670194 0.57329806]
 [0.42670194 0.57329806]
 [0.42670194 0.57329806]
 [0.42670194 0.57329806]
 [0.42670194 0.57329806]]


In [26]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission[['class_0', 'class_1']] = pred
sample_submission.to_csv('submission.csv', index=False)