In [17]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, roc_curve
import warnings
from sklearn.preprocessing import LabelEncoder
import gc
import os
import time
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import features_selection as fs
import xgboost as xgb

# Select fonts with CJK glyph coverage so Chinese text in figures is not rendered as garbled boxes.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
# NOTE(review): sns.set() runs after the rcParams assignment above and is given font='SimHei';
# presumably it replaces the font list set on the previous line — confirm the effective font.
sns.set(font='SimHei', font_scale=0.8)
# Suppresses every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

In [18]:
# Read the training features, training labels, test features and the submission template, in that order.
train, label, test, sub = map(
    pd.read_csv,
    (
        '../data/train.csv',
        '../data/train_label.csv',
        '../data/test.csv',
        '../data/submission.csv',
    ),
)

In [19]:
# Build the model inputs with the project-local `features_selection` module (imported as `fs`).
# NOTE(review): the exact contents of the four returned values are defined in that module — confirm there.
train_x,train_y,test_x,features  = fs.process_features(train, test, label)

In [20]:

# Fix NumPy's global random seed so results are reproducible.
np.random.seed(42)

# train_x (DataFrame or array) and train_y (Series or array) must already exist at this point.

# Unwrap a Series so that train_y is a plain array.
train_y = train_y.values if isinstance(train_y, pd.Series) else train_y

# Feature count, plus the column names (or generated placeholder names for non-DataFrame input).
n_features = train_x.shape[1]
feature_names = (
    train_x.columns.tolist()
    if isinstance(train_x, pd.DataFrame)
    else [f'特征{i+1}' for i in range(n_features)]
)


# 1.基于lgb的stacking

In [21]:
def predict_with_model(model_path, X):
    """
    Predict with a pickled stacking model.

    model_path: path of the pickle file; the unpickled object is indexed with
        the keys 'base_models' and 'meta_model'.
    X: feature data to score (DataFrame or array).

    Returns:
    The meta-model's predictions for X.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open(model_path, 'rb') as fh:
        bundle = pickle.load(fh)

    learners = bundle['base_models']
    stacker = bundle['meta_model']

    # First level: one meta-feature column per base model.
    stacked = np.zeros((X.shape[0], len(learners)))
    for col, learner in enumerate(learners):
        stacked[:, col] = learner.predict(X, num_iteration=learner.best_iteration)

    # Second level: the meta-model scores the stacked base-model outputs.
    result = stacker.predict(stacked, num_iteration=stacker.best_iteration)
    print('预测成功！')

    return result



In [22]:
# Score the test set with the pickled stacking model and store the result as a new column of `sub`.
pre1 = predict_with_model('./models/stacking_model_with_params.pkl', test_x)
sub['stacking_by_lgb'] = pre1

预测成功！


# 2.基于lgb和xgb的stacking

In [23]:
def predict_with_model(model_path, X):
    """Predict with a pickled stacking model whose base models are LightGBM and/or XGBoost.

    Parameters:
    model_path: path of the pickle file. The unpickled object is indexed with
        'base_models' (an iterable of dicts with keys 'model' and 'type') and
        'meta_model'.
    X: feature data to score (DataFrame or array).

    Returns:
    The meta-model's predictions for X.

    Raises:
    ValueError: if a base model's 'type' is neither 'lgb' nor 'xgb'.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open(model_path, 'rb') as f:
        stacking_model = pickle.load(f)

    base_models = stacking_model['base_models']
    meta_model = stacking_model['meta_model']

    # First level: one meta-feature column per base model.
    meta_features = np.zeros((X.shape[0], len(base_models)))
    dmatrix = None  # built lazily, once, and shared by every XGBoost base model
    for i, model_info in enumerate(base_models):
        model = model_info['model']
        model_type = model_info['type']

        if model_type == 'lgb':
            meta_features[:, i] = model.predict(X, num_iteration=model.best_iteration)
        elif model_type == 'xgb':
            if dmatrix is None:
                dmatrix = xgb.DMatrix(X)
            # NOTE(review): the XGBoost 10-fold loader in this notebook passes
            # `best_ntree_limit` here rather than `best_iteration`; confirm which one
            # matches how the meta-model's training features were produced. Also verify
            # that the installed XGBoost version still accepts `ntree_limit`.
            meta_features[:, i] = model.predict(dmatrix, ntree_limit=model.best_iteration)
        else:
            # An unknown type used to be skipped silently, leaving an all-zero column
            # that the meta-model would then score as if it were a real prediction.
            raise ValueError(
                f"Unsupported base model type {model_type!r} at index {i}; expected 'lgb' or 'xgb'"
            )

    # Second level: the meta-model scores the stacked base-model outputs.
    predictions = meta_model.predict(meta_features, num_iteration=meta_model.best_iteration)
    print('预测成功！')

    return predictions

In [24]:
# Score the test set with the pickled mixed stacking model and store the result as a new column of `sub`.
pre2 = predict_with_model('./models/stacking_model_mixed.pkl', test_x)
sub['stacking_by_lgb_and_xgb'] = pre2

预测成功！


# 3.基于lgb的十折交叉验证

In [25]:
# Load a pickled model and predict on the given data.
def load_model_and_predict(model_path, test_data):
    """
    Load a pickled model and use it to score `test_data`.

    Parameters:
    model_path: path of the pickled model file.
    test_data: feature data to score.

    Returns:
    The model's predictions for `test_data`, or None when the model could not
    be loaded (the failure reason is printed).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    try:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        print(f"模型加载成功: {model_path}")
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return None

    # Score the argument that was passed in. The previous version read the
    # notebook-global `test_x` here and silently ignored `test_data`.
    test_pred = model.predict(test_data, num_iteration=model.best_iteration)
    print('预测成功！')

    return test_pred

In [26]:
# Load the pickled model and score the test set.
model_path = './models/lgb_10fold_model.pkl'  # path of the saved model
pre3 = load_model_and_predict(model_path, test_x)
    

# load_model_and_predict returns None when loading fails; in that case the column below is assigned None.
sub['lgb_10fold_pred'] = pre3

模型加载成功: ./models/lgb_10fold_model.pkl
预测成功！


# 4.基于xgb的十折交叉验证

In [27]:
# Load a pickled XGBoost model and predict on the given data.
def load_model_and_predict(model_path, test_data):
    """
    Unpickle a model and score `test_data` through an XGBoost DMatrix.

    Parameters:
    model_path: path of the pickled model file.
    test_data: feature data to score; it is wrapped in `xgb.DMatrix` before prediction.

    Returns:
    The model's predictions, or None when the model could not be loaded
    (the failure reason is printed).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    try:
        with open(model_path, 'rb') as fh:
            booster = pickle.load(fh)
        print(f"模型加载成功: {model_path}")
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return None

    features_dm = xgb.DMatrix(test_data)

    # Limit prediction to the tree count recorded on the model as `best_ntree_limit`.
    scores = booster.predict(features_dm, ntree_limit=booster.best_ntree_limit)
    print('预测成功！')
    return scores

In [28]:
# Load the pickled model and score the test set.
model_path = './models/xgb_10fold_model.pkl'  # path of the saved model
pre4 = load_model_and_predict(model_path, test_x)
    
# load_model_and_predict returns None when loading fails; in that case the column below is assigned None.
sub['xgb_10fold_pred'] = pre4
    

模型加载成功: ./models/xgb_10fold_model.pkl
预测成功！


In [29]:
sub.head()

Unnamed: 0,ID,label,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred
0,1,0.1,0.591386,0.067679,0.255563,0.106641
1,4,0.1,0.129605,0.302932,0.184471,0.277002
2,6,0.1,0.352608,0.2399,0.257298,0.250648
3,8,0.1,0.181499,0.547579,0.326018,0.262511
4,10,0.1,0.460039,0.245512,0.235715,0.257917


# 5.基于lgb和xgb的stacking （元模型逻辑回归）

In [30]:
# Prediction function for the stacking model with a logistic-regression meta-model.
def predict_with_model(model_path, X):
    """Predict with a pickled stacking model that has a scaler and a probabilistic meta-model.

    Parameters:
    model_path: path of the pickle file. The unpickled object is indexed with
        'base_models' (an iterable of dicts with keys 'model' and 'type'),
        'meta_model' and 'scaler'.
    X: feature data to score (DataFrame or array).

    Returns:
    Column 1 of `meta_model.predict_proba(...)` computed on the scaled meta-features.

    Raises:
    ValueError: if a base model's 'type' is neither 'lgb' nor 'xgb'.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open(model_path, 'rb') as f:
        stacking_model = pickle.load(f)

    base_models = stacking_model['base_models']
    meta_model = stacking_model['meta_model']
    scaler = stacking_model['scaler']  # scaler stored alongside the models

    # First level: one meta-feature column per base model.
    meta_features = np.zeros((X.shape[0], len(base_models)))
    dmatrix = None  # built lazily, once, and shared by every XGBoost base model
    for i, model_info in enumerate(base_models):
        model = model_info['model']
        model_type = model_info['type']

        if model_type == 'lgb':
            meta_features[:, i] = model.predict(X, num_iteration=model.best_iteration)
        elif model_type == 'xgb':
            if dmatrix is None:
                dmatrix = xgb.DMatrix(X)
            # NOTE(review): the XGBoost 10-fold loader in this notebook passes
            # `best_ntree_limit` here rather than `best_iteration`; confirm which one
            # matches how the meta-model's training features were produced.
            meta_features[:, i] = model.predict(dmatrix, ntree_limit=model.best_iteration)
        else:
            # An unknown type used to be skipped silently, leaving an all-zero column
            # that would then be scaled and scored as if it were a real prediction.
            raise ValueError(
                f"Unsupported base model type {model_type!r} at index {i}; expected 'lgb' or 'xgb'"
            )

    # Second level: scale the meta-features, then take the probability in column 1.
    meta_features_scaled = scaler.transform(meta_features)
    predictions = meta_model.predict_proba(meta_features_scaled)[:, 1]

    return predictions

# def predict_with_model(model_path, X):
#     with open(model_path, 'rb') as f:
#         stacking_model = pickle.load(f)
    
#     base_models = stacking_model['base_models']
#     meta_model = stacking_model['meta_model']
#     scaler = stacking_model['scaler']  # 加载标准化器
    
#     # 生成元特征
#     meta_features = np.zeros((X.shape[0], len(base_models)))
#     for i, model_info in enumerate(base_models):
#         model = model_info['model']
#         model_type = model_info['type']
        
#         if model_type == 'lgb':
#             meta_features[:, i] = model.predict(X, num_iteration=model.best_iteration)
#         elif model_type == 'xgb':
#             dmatrix = xgb.DMatrix(X)
#             meta_features[:, i] = model.predict(dmatrix, ntree_limit=model.best_iteration)
    
#     # 检查scaler是否已拟合，若未拟合则用当前元特征拟合（仅建议临时使用）
#     # 更好的做法是确保保存模型时scaler已用训练数据拟合
#     if not hasattr(scaler, 'n_features_in_'):
#         print("警告：scaler未找到拟合信息，正在使用当前数据拟合...")
#         scaler.fit(meta_features)
    
#     # 逻辑回归预测（标准化元特征）
#     meta_features_scaled = scaler.transform(meta_features)
#     predictions = meta_model.predict_proba(meta_features_scaled)[:, 1]  # 取正例概率
    
#     return predictions


In [31]:
# Run the prediction and store the result as a new column of `sub`.
pred5 = predict_with_model('./models/stacking_model_logistic.pkl', test_x)
sub['stacking_by_lgb_and_xgb_logistic'] =pred5

In [32]:
sub.head()

Unnamed: 0,ID,label,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred,stacking_by_lgb_and_xgb_logistic
0,1,0.1,0.591386,0.067679,0.255563,0.106641,0.214623
1,4,0.1,0.129605,0.302932,0.184471,0.277002,0.052701
2,6,0.1,0.352608,0.2399,0.257298,0.250648,0.430809
3,8,0.1,0.181499,0.547579,0.326018,0.262511,0.023174
4,10,0.1,0.460039,0.245512,0.235715,0.257917,0.301299


# 6.基于lgb的voting

In [33]:
# Voting prediction over a list of base models.
def voting_predict(models, X, voting_type='soft'):
    """
    Combine the predictions of several base models by voting.

    Parameters:
    models: iterable of dicts with keys 'model' and 'type'. Type 'lgb' is scored
        through the LightGBM call path; any other type through the XGBoost one.
    X: feature data to score (DataFrame or array).
    voting_type: 'hard' takes the majority of the per-model labels obtained by
        thresholding each score at 0.5; any other value returns the unweighted
        mean of the per-model scores.

    Returns:
    Array of majority labels for 'hard', array of averaged scores otherwise.
    """
    # Built lazily so that LightGBM-only ensembles never construct a DMatrix.
    # xgb.DMatrix is given X directly: the previous code called `X.values` exactly
    # when X was NOT a DataFrame, which raises AttributeError for a NumPy array.
    xgb_X = None

    # Score every base model once; both voting modes start from these scores.
    per_model_scores = []
    for model_info in models:
        model = model_info['model']
        model_type = model_info['type']

        if model_type == 'lgb':
            scores = model.predict(X, num_iteration=model.best_iteration)
        else:  # xgb
            if xgb_X is None:
                xgb_X = xgb.DMatrix(X)
            scores = model.predict(xgb_X, ntree_limit=model.best_iteration)

        per_model_scores.append(scores)

    if voting_type == 'hard':
        # Rows are samples, columns are models; bincount().argmax() picks the most
        # frequent label and resolves ties in favour of the smaller label.
        votes = np.array([(scores >= 0.5).astype(int) for scores in per_model_scores]).T
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=1, arr=votes)

    # Soft voting: plain average of the model scores.
    return np.mean(per_model_scores, axis=0)

In [34]:
def predict_with_voting_model(model_path, X, voting_type='soft'):
    """Unpickle a voting ensemble and delegate the scoring of X to `voting_predict`.

    The unpickled object is indexed with the key 'base_models'; `voting_type`
    is passed through unchanged.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open(model_path, 'rb') as fh:
        ensemble = pickle.load(fh)

    return voting_predict(ensemble['base_models'], X, voting_type)

# Score the test set (soft voting by default).
pred6 = predict_with_voting_model('./models/voting_ensemble_model.pkl', test_x)
print("预测成功")

预测成功


In [35]:
# Store the voting predictions as a new column of `sub` (no file is written here).
sub['voting_by_lgb'] = pred6

# 7. 基于xgb和lgb的voting

In [36]:
# Voting prediction over a list of base models.
def voting_predict(models, X, voting_type='soft'):
    """
    Combine the predictions of several base models by voting.

    Parameters:
    models: iterable of dicts with keys 'model' and 'type'. Type 'lgb' is scored
        through the LightGBM call path; any other type through the XGBoost one.
    X: feature data to score (DataFrame or array).
    voting_type: 'hard' takes the majority of the per-model labels obtained by
        thresholding each score at 0.5; any other value returns the unweighted
        mean of the per-model scores.

    Returns:
    Array of majority labels for 'hard', array of averaged scores otherwise.
    """
    # Built lazily so that LightGBM-only ensembles never construct a DMatrix.
    # xgb.DMatrix is given X directly: the previous code called `X.values` exactly
    # when X was NOT a DataFrame, which raises AttributeError for a NumPy array.
    xgb_X = None

    # Score every base model once; both voting modes start from these scores.
    per_model_scores = []
    for model_info in models:
        model = model_info['model']
        model_type = model_info['type']

        if model_type == 'lgb':
            scores = model.predict(X, num_iteration=model.best_iteration)
        else:  # xgb
            if xgb_X is None:
                xgb_X = xgb.DMatrix(X)
            scores = model.predict(xgb_X, ntree_limit=model.best_iteration)

        per_model_scores.append(scores)

    if voting_type == 'hard':
        # Rows are samples, columns are models; bincount().argmax() picks the most
        # frequent label and resolves ties in favour of the smaller label.
        votes = np.array([(scores >= 0.5).astype(int) for scores in per_model_scores]).T
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=1, arr=votes)

    # Soft voting: plain average of the model scores.
    return np.mean(per_model_scores, axis=0)

In [37]:
# Load the model and predict.
def predict_with_voting_model(model_path, X, voting_type='soft'):
    """Unpickle the voting ensemble at `model_path` and score X via `voting_predict`."""
    with open(model_path, 'rb') as f:
        voting_model = pickle.load(f)
    
    base_models = voting_model['base_models']
    return voting_predict(base_models, X, voting_type)

# Score the test set.
# NOTE(review): this loads the same './models/voting_ensemble_model.pkl' file as the
# earlier cell that produces `pred6`, and the `sub.head()` output shown in this notebook
# has identical values in 'voting_by_lgb' and 'voting_by_lgb_and_xgb' — confirm whether
# a different model file was intended for one of the two.
pred7 = predict_with_voting_model('./models/voting_ensemble_model.pkl', test_x)
print("预测成功！")

# Store the predictions as a new column of `sub` (no file is written here).
sub['voting_by_lgb_and_xgb'] = pred7

预测成功！


In [38]:
sub.head()

Unnamed: 0,ID,label,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred,stacking_by_lgb_and_xgb_logistic,voting_by_lgb,voting_by_lgb_and_xgb
0,1,0.1,0.591386,0.067679,0.255563,0.106641,0.214623,0.117681,0.117681
1,4,0.1,0.129605,0.302932,0.184471,0.277002,0.052701,0.165264,0.165264
2,6,0.1,0.352608,0.2399,0.257298,0.250648,0.430809,0.209773,0.209773
3,8,0.1,0.181499,0.547579,0.326018,0.262511,0.023174,0.236325,0.236325
4,10,0.1,0.460039,0.245512,0.235715,0.257917,0.301299,0.173463,0.173463


In [39]:
sub = sub.drop(columns = ['label'])

In [40]:
sub.shape

(40000, 8)

In [41]:
list_co = list(sub.columns[1:])

In [42]:
list_co

['stacking_by_lgb',
 'stacking_by_lgb_and_xgb',
 'lgb_10fold_pred',
 'xgb_10fold_pred',
 'stacking_by_lgb_and_xgb_logistic',
 'voting_by_lgb',
 'voting_by_lgb_and_xgb']

### 各模型的阈值(依模型调整) //方案一

In [43]:
sub1 = sub.copy()

In [44]:
# Per-model decision thresholds (test values). Keys are the 1-based position of each
# model column in `list_co`, as strings; the thresholding loop looks them up via str(idx).
threshold = {'1':0.5,
             '2':0.5,
             '3':0.5,
             '4':0.5,
             '5':0.5,
             '6':0.5,
             '7':0.5
            }

In [45]:
# Binarise each model's score column: 1 where the score exceeds that model's threshold, else 0.
for idx, co in enumerate(list_co, 1):
    cutoff = threshold[str(idx)]
    sub1[co] = (sub[co] > cutoff).astype('int64')

In [46]:
sub1.head()

Unnamed: 0,ID,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred,stacking_by_lgb_and_xgb_logistic,voting_by_lgb,voting_by_lgb_and_xgb
0,1,1,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0
3,8,0,1,0,0,0,0,0
4,10,0,0,0,0,0,0,0


### 投票

In [33]:
# Majority vote across the per-model columns (every column after the first).
# DataFrame.mode(axis=1) returns a DataFrame with one column per tied mode, so select
# its first column explicitly: a single Series is then assigned even when some row has
# a tie, instead of relying on the result happening to have exactly one column.
sub1['label'] = sub1.iloc[:,1:].mode(axis = 1).iloc[:, 0]

In [34]:
sub1 = sub1[['ID','label']]

In [35]:
sub1.to_csv('./result/sub_by_voting.csv',index = False)

In [36]:
sub1.head()

Unnamed: 0,ID,label
0,1,0
1,4,0
2,6,0
3,8,0
4,10,0


### 各模型的auc(依模型调整) //方案二

In [37]:
# Per-model AUC scores (test values, to be adjusted per model). Keys are the 1-based
# position of each model column in `list_co`, as strings. They are normalised into
# blending weights further down.
auc_score = {'1':0.1,
             '2':0.6,
             '3':0.5,
             '4':0.4,
             '5':0.2,
             '6':0.9,
             '7':0.3
            }

In [38]:
# Total of the seven AUC scores; used as the denominator when normalising the weights.
sum_auc_score = 0
for i in range(1, 8):
    sum_auc_score = sum_auc_score + auc_score[str(i)]

In [39]:
auc_weight = [auc_score[str(i)]/sum_auc_score for i in range(1,8)]

In [40]:
sub2 = sub.copy()

In [41]:
sub2['label'] = 0

In [42]:
# Accumulate the weighted blend: each model column contributes its score times its AUC weight.
for pos, co in enumerate(list_co):
    sub2['label'] += auc_weight[pos]*sub[co]

In [43]:
sub2 = sub2[['ID','label']]

In [44]:
sub2['label'] = sub2['label'].apply(lambda x:'%.4f'%x)

In [45]:
# NOTE(review): 'label' was converted to 4-decimal strings in the preceding cell, so this
# sort compares strings rather than numbers. It agrees with numeric order only while every
# value formats to the same width and sign — confirm that always holds for the blended scores.
sub2 = sub2.sort_values(by = 'label',ascending = False)

In [46]:
sub2.to_csv('./result/sub_by_auc.csv', index = False)

### 各模型的F1(依模型调整) //方案三

In [47]:
# Per-model F1 scores (test values, to be adjusted per model). Keys are the 1-based
# position of each model column in `list_co`, as strings. They are normalised into
# blending weights further down.
F1_score =  {'1':0.8,
             '2':0.5,
             '3':0.5,
             '4':0.2,
             '5':0.5,
             '6':0.5,
             '7':0.1
            }

In [48]:
# Total of the seven F1 scores; used as the denominator when normalising the weights.
sum_F1_score = 0
for i in range(1, 8):
    sum_F1_score = sum_F1_score + F1_score[str(i)]

In [49]:
F1_weight = [F1_score[str(i)]/sum_F1_score for i in range(1,8)]

In [50]:
sub3 = sub.copy()

In [51]:
sub3['label'] = 0

In [52]:
# Accumulate the weighted blend: each model column contributes its score times its F1 weight.
for pos, co in enumerate(list_co):
    sub3['label'] += F1_weight[pos]*sub[co]

In [53]:
sub3 = sub3[['ID','label']]

In [54]:
sub3['label'] = sub3['label'].apply(lambda x:'%.4f'%x)

In [55]:
# NOTE(review): 'label' was converted to 4-decimal strings in the preceding cell, so this
# sort compares strings rather than numbers. It agrees with numeric order only while every
# value formats to the same width and sign — confirm that always holds for the blended scores.
sub3 = sub3.sort_values(by = 'label',ascending = False)

In [56]:
sub3.to_csv('./result/sub_by_f1.csv', index = False)

### 各模型的Ks(依模型调整) //方案四

In [57]:
# Per-model KS scores (test values, to be adjusted per model). Keys are the 1-based
# position of each model column in `list_co`, as strings. They are normalised into
# blending weights further down.
Ks_score =  {'1':0.8,
             '2':0.3,
             '3':0.5,
             '4':0.2,
             '5':0.5,
             '6':0.9,
             '7':0.8
            }

In [58]:
# Total of the seven KS scores; used as the denominator when normalising the weights.
sum_Ks_score = 0
for i in range(1, 8):
    sum_Ks_score = sum_Ks_score + Ks_score[str(i)]

In [59]:
Ks_weight = [Ks_score[str(i)]/sum_Ks_score for i in range(1,8)]

In [60]:
sub4 = sub.copy()

In [61]:
sub4['label'] = 0

In [62]:
# Accumulate the weighted blend: each model column contributes its score times its KS weight.
for pos, co in enumerate(list_co):
    sub4['label'] += Ks_weight[pos]*sub[co]

In [63]:
sub4 = sub4[['ID','label']]

In [64]:
sub4['label'] = sub4['label'].apply(lambda x:'%.4f'%x)

In [65]:
# NOTE(review): 'label' was converted to 4-decimal strings in the preceding cell, so this
# sort compares strings rather than numbers. It agrees with numeric order only while every
# value formats to the same width and sign — confirm that always holds for the blended scores.
sub4 = sub4.sort_values(by = 'label',ascending = False)

In [66]:
sub4.to_csv('./result/sub_by_ks.csv', index = False)

In [67]:
sub1.head()

Unnamed: 0,ID,label
0,1,0
1,4,0
2,6,0
3,8,0
4,10,0


In [68]:
sub2.head()

Unnamed: 0,ID,label
21997,54863,0.8616
34848,87143,0.8485
24327,60770,0.8445
37932,94798,0.8369
34130,85377,0.8131


In [69]:
sub3.head()

Unnamed: 0,ID,label
21997,54863,0.8677
34848,87143,0.8626
24327,60770,0.856
37932,94798,0.8538
37910,94737,0.8404


In [70]:
sub4.head()

Unnamed: 0,ID,label
21997,54863,0.8645
34848,87143,0.8546
24327,60770,0.8464
37932,94798,0.8412
34130,85377,0.8258
