In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, roc_curve
import warnings
from sklearn.preprocessing import LabelEncoder
import gc
import os
import time
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Use CJK-capable fonts so Chinese text in figures renders instead of garbled glyphs
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
sns.set(font='SimHei', font_scale=0.8)
# Suppresses every warning for the rest of the session, including deprecation notices
warnings.filterwarnings('ignore')

In [2]:
import xgboost as xgb

In [3]:
# Load the raw inputs; all paths are relative to the notebook's working directory.
train = pd.read_csv('../data/train.csv')
label = pd.read_csv('../data/train_label.csv')
test = pd.read_csv('../data/test.csv')
sub = pd.read_csv('../data/submission.csv')

In [4]:
# Labelled copy of the training rows; not referenced again in the visible part of the notebook.
df_source = train.merge(label,on='ID',how='left')


# Test rows get the sentinel label -1 so they can be separated from train rows after feature engineering.
test['label'] = -1
# NOTE: `train` is rebound here, so re-running this cell merges `label` a second time.
train = train.merge(label,on='ID',how='left')
# Stack train and test so the features below are computed over both.
data = pd.concat([train, test])


In [5]:
#提取时间特征

def get_time_fe(df):
    """Add integer ``day`` and ``hour`` columns parsed from the ``date`` column.

    Characters 8-9 of each ``date`` string are read as the day and
    characters 11-12 as the hour (presumably a 'YYYY-MM-DD HH:...' layout —
    confirm against the data).

    Args:
        df: DataFrame with a string ``date`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``day`` and ``hour`` added.
    """
    stamps = df.date
    df['day'] = stamps.map(lambda stamp: int(stamp[8:10]))
    df['hour'] = stamps.map(lambda stamp: int(stamp[11:13]))
    return df





#对时间分箱
def getSeg(x):
    """Map an hour of day to a coarse time-of-day bin.

    Bins (bounds inclusive): 0-3 -> 1, 4-12 -> 2, 13-18 -> 3, 19-23 -> 1.
    Any value outside these ranges, including non-integer hours that fall
    between two bins, yields ``None``.

    NOTE(review): 19-23 shares bin 1 with 0-3, presumably a deliberate
    wrap-around "night" bucket — confirm it was not meant to be 4.
    """
    # (first hour, last hour, bin id)
    hour_bins = ((0, 3, 1), (4, 12, 2), (13, 18, 3), (19, 23, 1))
    for first, last, seg in hour_bins:
        if first <= x <= last:
            return seg
    return None




#count统计特征
# Names of the generated cross columns; filled in by get_cross_fe and reused by later cells.
cross_feature = []
def get_cross_fe(df):
    """Add string cross features for every pair (B2|B3) x (C1, C2, C3, D1, A1, A2, A3).

    Each new column ``cross_<a>_and_<b>`` holds ``"<a value>_<b value>"``.
    Its name is recorded in the module-level ``cross_feature`` list exactly
    once, so calling the function again does not grow the list with
    duplicates.

    Args:
        df: DataFrame containing all the source columns. It is modified in place.

    Returns:
        The same DataFrame object with the 14 cross columns added.
    """
    first_feature = ['B2', 'B3']
    second_feature = ['C1', 'C2', 'C3', 'D1', 'A1', 'A2', 'A3']
    for feat_1 in first_feature:
        for feat_2 in second_feature:
            col_name = "cross_" + feat_1 + "_and_" + feat_2
            # Guard against duplicate registration when the function is called more than once.
            if col_name not in cross_feature:
                cross_feature.append(col_name)
            # .values sidesteps index alignment; `data` may carry duplicate index labels after concat.
            df[col_name] = df[feat_1].astype(str).values + '_' + df[feat_2].astype(str).values
    return df



#获取nunique特征
def get_nunique_1_fe(df):
    """Add distinct-count features between ``A2`` and each of hour, E1, E14, B2, B3.

    For every partner column ``c`` two columns are left-joined onto ``df``:
    ``A2_<c>_nuq_num`` (distinct ``c`` values per ``A2``) and
    ``<c>_A2_nuq_num`` (distinct ``A2`` values per ``c``).

    Returns:
        A new DataFrame (the result of the merges) with the added columns.
    """
    for partner in ('hour', 'E1', 'E14', 'B2', 'B3'):
        # Both lookup tables are computed before either merge touches df.
        per_a2 = (
            df.groupby('A2')[partner].nunique()
            .rename("A2_%s_nuq_num" % partner)
            .reset_index()
        )
        per_partner = (
            df.groupby(partner)['A2'].nunique()
            .rename("%s_A2_nuq_num" % partner)
            .reset_index()
        )
        df = df.merge(per_a2, how='left', on=['A2'])
        df = df.merge(per_partner, how='left', on=[partner])
    return df
def get_nunique_2_fe(df):
    """Add distinct-count features between ``hour`` and each of E1, E14.

    For every partner column ``c`` this adds ``hour_<c>_nuq_num`` (distinct
    ``c`` values per hour) and ``<c>_hour_nuq_num`` (distinct hours per ``c``
    value), joined back with left merges.

    Returns:
        A new DataFrame with the four added columns.
    """
    for partner in ('E1', 'E14'):
        by_hour = df.groupby('hour')[partner].nunique().rename("hour_%s_nuq_num" % partner).reset_index()
        by_partner = df.groupby(partner)['hour'].nunique().rename("%s_hour_nuq_num" % partner).reset_index()
        df = df.merge(by_hour, how='left', on=['hour']).merge(by_partner, how='left', on=[partner])
    return df

# def get_nunique_3_fe(df):
#     adid_nuq = ['B2','B3']
#     for feat in adid_nuq:
#         gp1 = df.groupby('A3')[feat].nunique().reset_index().rename(columns={feat: "A3_%s_nuq_num" % feat})
#         gp2 = df.groupby(feat)['A3'].nunique().reset_index().rename(columns={'A3': "%s_A3_nuq_num" % feat})
#         df = pd.merge(df, gp1, how='left', on=['A3'])
#         df = pd.merge(df, gp2, how='left', on=[feat])
#     return df

def get_nunique_4_fe(df):
    """Add distinct-count features between ``A1`` and each of B2, B3.

    Adds ``A1_<c>_nuq_num`` (distinct ``c`` values per ``A1``) and
    ``<c>_A1_nuq_num`` (distinct ``A1`` values per ``c``) for c in (B2, B3).
    The only visible call to this function is commented out.

    Returns:
        A new DataFrame with the four added columns.
    """
    anchor = 'A1'
    for other in ['B2', 'B3']:
        forward_name = "A1_%s_nuq_num" % other
        backward_name = "%s_A1_nuq_num" % other
        forward = df.groupby(anchor)[other].nunique().rename(forward_name).reset_index()
        backward = df.groupby(other)[anchor].nunique().rename(backward_name).reset_index()
        df = pd.merge(df, forward, how='left', on=[anchor])
        df = pd.merge(df, backward, how='left', on=[other])
    return df




# Run the feature builders over the combined train+test frame.
data = get_time_fe(data)
# data['hour_seg'] = data['hour'].apply(lambda x: getSeg(x))
data = get_cross_fe(data)
data = get_nunique_1_fe(data)
data = get_nunique_2_fe(data)
# data = get_nunique_3_fe(data)
# data = get_nunique_4_fe(data)



# Number of cross columns registered by get_cross_fe (not displayed: it is not the last expression of the cell).
len(cross_feature)




# Label-encode the raw categorical columns and the string cross columns.
cate_feature = ['A1','A2','A3','B1','B2','B3','C1','C2','C3','E2','E3','E5','E7','E9','E10','E13','E16','E17','E19','E21','E22']
# cross_feature = cross_feature[:15]
cate_features = cate_feature+cross_feature
for item in cate_features:
    # A fresh encoder is fitted per column on train+test together; the fitted encoder is not kept.
    data[item] = LabelEncoder().fit_transform(data[item])


# In[14]:


def feature_count(data, features=None):
    """Add a frequency column counting rows per combination of ``features``.

    The new column is named ``count_<f1>_<f2>...`` and holds, for each row,
    how many rows share that row's values in ``features``. If a column with
    that name already exists it is replaced. The input frame is not mutated.

    Args:
        data: Source DataFrame.
        features: List of column names to group by; must not be empty.

    Returns:
        A new DataFrame: ``data`` left-merged with the per-group counts.

    Raises:
        ValueError: If ``features`` is empty or omitted.
    """
    features = list(features) if features is not None else []
    if not features:
        raise ValueError("feature_count() needs at least one column name in `features`")

    new_feature = '_'.join(['count'] + features)
    # Drop a stale copy of the column, if any, without touching the caller's frame.
    if new_feature in data.columns:
        data = data.drop(columns=new_feature)

    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    return data.merge(temp, 'left', on=features)

# Add a frequency feature for every cross column that has more than 5 distinct values;
# columns with fewer are only reported.
for i in cross_feature:
    n = data[i].nunique()
    if n > 5:
        data = feature_count(data, [i])
    else:
        print(i, ':', n)


# In[15]:


# ratio: share of all rows (integer percent, truncated) taken by each row's category value
label_feature =[ 'A2', 'A3','hour']
# Explicit copy: adding 'cnt' below must not write into a slice of `data`.
data_temp = data[label_feature].copy()
df_feature = pd.DataFrame()
data_temp['cnt'] = 1
print('Begin ratio clcik...')
col_type = label_feature.copy()
n = len(col_type)
for i in range(n):
    col_name = "ratio_click_of_" + col_type[i]
    # value_counts gives rows per category; map it back onto each row and scale to a percentage.
    df_feature[col_name] = (
                    data_temp[col_type[i]].map(data_temp[col_type[i]].value_counts()) / len(data) * 100).astype(int)
# Drop any ratio columns left over from a previous run so re-executing the cell does not duplicate them.
data = pd.concat([data.drop(columns=df_feature.columns, errors='ignore'), df_feature], axis=1)
print('The end')




# Split the combined frame back apart using the -1 sentinel label assigned to test rows.
train_df = data[data['label'] != -1]
test_df = data[data['label'] == -1]


# # Drop the fields that are not needed



from sklearn.model_selection import train_test_split, StratifiedKFold





## get train feature
# Identifier, date, target, D2 and the raw cross columns are excluded from the model inputs.
del_feature = ['ID','day','date','label','D2']+cross_feature
features = [i for i in train_df.columns if i not in del_feature]




train_x = train_df[features]
train_y = train_df['label'].values
# NOTE: rebinds `test` (previously the raw test CSV) to the test feature matrix.
test = test_df[features]





# Fix the random seed so results are reproducible
np.random.seed(42)

# train_x (DataFrame or array) and train_y (Series or array) are expected to exist at this point;
# make sure both have been built correctly before running the code below.

# Make sure train_y is a 1-D array (it is already an ndarray from `.values` above, so this branch is not taken)
if isinstance(train_y, pd.Series):
    train_y = train_y.values

# Number of features and their names
if isinstance(train_x, pd.DataFrame):
    n_features = train_x.shape[1]
    feature_names = train_x.columns.tolist()
else:
    # Array input has no column names, so generate placeholder names.
    n_features = train_x.shape[1]
    feature_names = [f'特征{i+1}' for i in range(n_features)]


Begin ratio clcik...
The end


In [6]:
# Alias for the test feature matrix (`test` was rebound to test_df[features] in the previous cell).
test_x = test

# 1.基于lgb的stacking

In [7]:
def predict_with_model(model_path, X):
    """Predict with a pickled stacking ensemble.

    The pickle must hold a dict with ``'base_models'`` (a sequence of models)
    and ``'meta_model'``. Each base model's prediction on ``X`` becomes one
    meta-feature column, and the meta-model predicts from those columns.
    Every model is called as ``predict(..., num_iteration=best_iteration)``
    (presumably LightGBM boosters — confirm against the training notebook).

    Args:
        model_path: Path to the ``.pkl`` file.
        X: Feature data (DataFrame or array) exposing ``.shape``.

    Returns:
        The meta-model's predictions for ``X``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    with open(model_path, 'rb') as handle:
        bundle = pickle.load(handle)

    first_level = bundle['base_models']
    second_level = bundle['meta_model']

    # One column of level-1 predictions per base model.
    n_rows = X.shape[0]
    stacked = np.zeros((n_rows, len(first_level)))
    for column, booster in enumerate(first_level):
        stacked[:, column] = booster.predict(X, num_iteration=booster.best_iteration)

    # Level-2 prediction on the stacked columns.
    result = second_level.predict(stacked, num_iteration=second_level.best_iteration)
    print('预测成功！')

    return result



In [8]:
# Score the test set with the LightGBM-only stacking model and store it as a submission column.
pre1 = predict_with_model('./models/stacking_model_with_params.pkl', test_x)
sub['stacking_by_lgb'] = pre1

预测成功！


# 2.基于lgb和xgb的stacking

In [9]:
def predict_with_model(model_path, X):
    """Predict with a pickled stacking ensemble of LightGBM and XGBoost base models.

    The pickle must hold a dict with ``'base_models'`` and ``'meta_model'``.
    Each base-model entry is a dict with keys ``'model'`` and ``'type'``
    (``'lgb'`` or ``'xgb'``). Base predictions form the meta-feature columns
    that the meta-model predicts from.

    Args:
        model_path: Path to the ``.pkl`` file.
        X: Feature data (DataFrame or array) exposing ``.shape``.

    Returns:
        The meta-model's predictions for ``X``.

    Raises:
        ValueError: If a base model has a ``'type'`` other than 'lgb' or 'xgb'
            (previously its column was silently left as zeros).
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    with open(model_path, 'rb') as f:
        stacking_model = pickle.load(f)

    base_models = stacking_model['base_models']
    meta_model = stacking_model['meta_model']

    # Build the meta-features: one column per base model.
    meta_features = np.zeros((X.shape[0], len(base_models)))
    dmatrix = None  # created on first use and shared by all XGBoost base models
    for i, model_info in enumerate(base_models):
        model = model_info['model']
        model_type = model_info['type']

        if model_type == 'lgb':
            meta_features[:, i] = model.predict(X, num_iteration=model.best_iteration)
        elif model_type == 'xgb':
            if dmatrix is None:
                dmatrix = xgb.DMatrix(X)
            # NOTE(review): `ntree_limit` was deprecated in XGBoost 1.4 and removed in 2.0;
            # switch to `iteration_range` when upgrading. Kept as-is so the tree count matches
            # whatever was used to build the meta-features at training time — confirm.
            meta_features[:, i] = model.predict(dmatrix, ntree_limit=model.best_iteration)
        else:
            raise ValueError(f"Unsupported base model type: {model_type!r}")

    # Meta-model prediction.
    predictions = meta_model.predict(meta_features, num_iteration=meta_model.best_iteration)
    print('预测成功！')

    return predictions



In [10]:
# Score the test set with the mixed LightGBM + XGBoost stacking model.
pre2 = predict_with_model('./models/stacking_model_mixed.pkl', test_x)
sub['stacking_by_lgb_and_xgb'] = pre2

预测成功！


# 3.基于lgb的十折交叉验证

In [11]:
# 加载模型并对测试集进行预测
def load_model_and_predict(model_path, test_data):
    """Load a pickled model and predict on the given test data.

    The model is called as ``predict(test_data, num_iteration=best_iteration)``
    (presumably a LightGBM booster — confirm against the training notebook).

    Args:
        model_path: Path of the pickled model.
        test_data: Test feature data to score.

    Returns:
        The model's predictions for ``test_data``, or ``None`` if the model
        could not be loaded (the failure is printed, not raised).
    """
    # Load the model.
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    try:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        print(f"模型加载成功: {model_path}")
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return None

    # Predict on the argument that was passed in, not on the notebook-global `test_x`.
    test_pred = model.predict(test_data, num_iteration=model.best_iteration)
    print('预测成功！')

    return test_pred

In [12]:
# Load the 10-fold LightGBM model and predict
model_path = './models/lgb_10fold_model.pkl'  # where the model was saved
# pre3 is None if loading failed (load_model_and_predict prints the error instead of raising).
pre3 = load_model_and_predict(model_path, test_x)
    

sub['lgb_10fold_pred'] = pre3

模型加载成功: ./models/lgb_10fold_model.pkl
预测成功！


# 4.基于xgb的十折交叉验证

In [13]:
# 加载模型并对测试集进行预测
def load_model_and_predict(model_path, test_data):
    """Load a pickled XGBoost model and predict on the given test data.

    ``test_data`` is wrapped in an ``xgb.DMatrix`` and scored with
    ``ntree_limit=model.best_ntree_limit``.

    Args:
        model_path: Path of the pickled model.
        test_data: Test feature data to score.

    Returns:
        The model's predictions, or ``None`` if the model could not be
        loaded (the failure is printed, not raised).
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    try:
        with open(model_path, 'rb') as handle:
            booster = pickle.load(handle)
        print(f"模型加载成功: {model_path}")
    except Exception as exc:
        print(f"模型加载失败: {str(exc)}")
        return None

    test_matrix = xgb.DMatrix(test_data)

    # NOTE(review): `ntree_limit` / `best_ntree_limit` were removed in XGBoost 2.0;
    # this call needs migrating to `iteration_range` before upgrading.
    scores = booster.predict(test_matrix, ntree_limit=booster.best_ntree_limit)
    print('预测成功！')
    return scores

In [14]:
# Load the 10-fold XGBoost model and predict
model_path = './models/xgb_10fold_model.pkl'  # where the model was saved
# pre4 is None if loading failed (load_model_and_predict prints the error instead of raising).
pre4 = load_model_and_predict(model_path, test_x)
    
sub['xgb_10fold_pred'] = pre4
    

模型加载成功: ./models/xgb_10fold_model.pkl
预测成功！


In [15]:
sub.head()

Unnamed: 0,ID,label,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred
0,1,0.1,0.662789,0.299682,0.137528,0.139409
1,4,0.1,0.092998,0.1122,0.223504,0.138446
2,6,0.1,0.257431,0.218318,0.212645,0.230363
3,8,0.1,0.287891,0.309741,0.344971,0.336017
4,10,0.1,0.193517,0.187629,0.238918,0.217965


# 5.基于lgb和xgb的stacking （元模型逻辑回归）

In [16]:
# 预测函数
def predict_with_model(model_path, X):
    """Predict with a pickled stacking ensemble whose meta-model is a classifier with ``predict_proba``.

    The pickle must hold a dict with ``'base_models'`` (dicts with keys
    ``'model'`` and ``'type'``, type being ``'lgb'`` or ``'xgb'``),
    ``'meta_model'`` and ``'scaler'``. Base predictions are scaled with
    ``scaler.transform`` before being passed to ``meta_model.predict_proba``.

    Args:
        model_path: Path to the ``.pkl`` file.
        X: Feature data (DataFrame or array) exposing ``.shape``.

    Returns:
        Column 1 of ``predict_proba`` (the positive-class probability) for each row of ``X``.

    Raises:
        ValueError: If a base model has a ``'type'`` other than 'lgb' or 'xgb'
            (previously its column was silently left as zeros).
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    with open(model_path, 'rb') as f:
        stacking_model = pickle.load(f)

    base_models = stacking_model['base_models']
    meta_model = stacking_model['meta_model']
    scaler = stacking_model['scaler']  # scaler saved alongside the models

    meta_features = np.zeros((X.shape[0], len(base_models)))
    dmatrix = None  # created on first use and shared by all XGBoost base models
    for i, model_info in enumerate(base_models):
        model = model_info['model']
        model_type = model_info['type']

        if model_type == 'lgb':
            meta_features[:, i] = model.predict(X, num_iteration=model.best_iteration)
        elif model_type == 'xgb':
            if dmatrix is None:
                dmatrix = xgb.DMatrix(X)
            # NOTE(review): `ntree_limit` was deprecated in XGBoost 1.4 and removed in 2.0;
            # switch to `iteration_range` when upgrading.
            meta_features[:, i] = model.predict(dmatrix, ntree_limit=model.best_iteration)
        else:
            raise ValueError(f"Unsupported base model type: {model_type!r}")

    # Meta-model prediction on the scaled meta-features.
    meta_features_scaled = scaler.transform(meta_features)
    predictions = meta_model.predict_proba(meta_features_scaled)[:, 1]  # positive-class probability

    return predictions

In [17]:
# Run the prediction with the logistic-regression stacking model
pred5 = predict_with_model('./models/stacking_model_logistic.pkl', test_x)
sub['stacking_by_lgb_and_xgb_logistic'] =pred5

# 6.基于lgb的voting

In [18]:
# 投票预测函数
def voting_predict(models, X, voting_type='soft'):
    """Combine the predictions of several base models by voting.

    Args:
        models: Iterable of dicts with keys ``'model'`` and ``'type'``.
            Type ``'lgb'`` is predicted directly on ``X``; any other type is
            treated as XGBoost and predicted on a DMatrix built from ``X``.
        X: Feature data (DataFrame, array, or object exposing ``.values``).
        voting_type: ``'hard'`` for a majority vote over per-model labels
            (score >= 0.5); any other value gives soft voting, i.e. the
            unweighted mean of the per-model scores.

    Returns:
        For hard voting an integer label per row; otherwise the mean score per row.
    """
    models = list(models)

    # Build the DMatrix only when an XGBoost model is actually present, and accept
    # plain arrays as well as DataFrames (the old `X.values` fallback crashed for ndarrays).
    xgb_X = None
    if any(model_info['type'] != 'lgb' for model_info in models):
        xgb_X = xgb.DMatrix(X if isinstance(X, pd.DataFrame) else getattr(X, 'values', X))

    # Per-model scores; shared by both voting modes.
    all_preds = []
    for model_info in models:
        model = model_info['model']
        if model_info['type'] == 'lgb':
            pred = model.predict(X, num_iteration=model.best_iteration)
        else:  # xgb
            # NOTE(review): `ntree_limit` was removed in XGBoost 2.0; migrate to `iteration_range` when upgrading.
            pred = model.predict(xgb_X, ntree_limit=model.best_iteration)
        all_preds.append(pred)

    if voting_type == 'hard':
        # Hard voting: threshold each model at 0.5, then take the most frequent label per row
        # (ties resolve to the smaller label because argmax returns the first maximum).
        votes = np.array([(pred >= 0.5).astype(int) for pred in all_preds]).T
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=1, arr=votes)

    # Soft voting: plain average of the scores.
    return np.mean(all_preds, axis=0)

In [19]:
def predict_with_voting_model(model_path, X, voting_type='soft'):
    """Load a pickled voting ensemble and predict on ``X``.

    The pickle must hold a dict whose ``'base_models'`` entry is passed to
    ``voting_predict`` together with ``X`` and ``voting_type``.

    Returns:
        Whatever ``voting_predict`` returns for the loaded base models.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    with open(model_path, 'rb') as handle:
        ensemble = pickle.load(handle)

    return voting_predict(ensemble['base_models'], X, voting_type)

# Predict on the test set with the voting ensemble (soft voting by default)
pred6 = predict_with_voting_model('./models/voting_ensemble_model.pkl', test_x)
print("预测成功")

预测成功


In [20]:
# Store the voting predictions as a submission column
sub['voting_by_lgb'] = pred6

# 7. 基于xgb和lgb的voting

In [21]:
# 投票预测函数
def voting_predict(models, X, voting_type='soft'):
    """Combine the predictions of several base models by voting.

    Args:
        models: Iterable of dicts with keys ``'model'`` and ``'type'``.
            Type ``'lgb'`` is predicted directly on ``X``; any other type is
            treated as XGBoost and predicted on a DMatrix built from ``X``.
        X: Feature data (DataFrame, array, or object exposing ``.values``).
        voting_type: ``'hard'`` for a majority vote over per-model labels
            (score >= 0.5); any other value gives soft voting, i.e. the
            unweighted mean of the per-model scores.

    Returns:
        For hard voting an integer label per row; otherwise the mean score per row.
    """
    models = list(models)

    # Build the DMatrix only when an XGBoost model is actually present, and accept
    # plain arrays as well as DataFrames (the old `X.values` fallback crashed for ndarrays).
    xgb_X = None
    if any(model_info['type'] != 'lgb' for model_info in models):
        xgb_X = xgb.DMatrix(X if isinstance(X, pd.DataFrame) else getattr(X, 'values', X))

    # Per-model scores; shared by both voting modes.
    all_preds = []
    for model_info in models:
        model = model_info['model']
        if model_info['type'] == 'lgb':
            pred = model.predict(X, num_iteration=model.best_iteration)
        else:  # xgb
            # NOTE(review): `ntree_limit` was removed in XGBoost 2.0; migrate to `iteration_range` when upgrading.
            pred = model.predict(xgb_X, ntree_limit=model.best_iteration)
        all_preds.append(pred)

    if voting_type == 'hard':
        # Hard voting: threshold each model at 0.5, then take the most frequent label per row
        # (ties resolve to the smaller label because argmax returns the first maximum).
        votes = np.array([(pred >= 0.5).astype(int) for pred in all_preds]).T
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=1, arr=votes)

    # Soft voting: plain average of the scores.
    return np.mean(all_preds, axis=0)

In [22]:
# 加载模型并预测
def predict_with_voting_model(model_path, X, voting_type='soft'):
    """Load a pickled voting ensemble and predict on ``X``.

    Reads the ``'base_models'`` entry of the pickled dict and delegates to
    ``voting_predict`` with the given ``voting_type``.

    Returns:
        Whatever ``voting_predict`` returns for the loaded base models.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    with open(model_path, 'rb') as model_file:
        payload = pickle.load(model_file)

    members = payload['base_models']
    return voting_predict(members, X, voting_type)

# Predict on the test set with the voting ensemble
# NOTE(review): this loads the same pickle as the "voting_by_lgb" cell above, and the recorded
# output shows `voting_by_lgb_and_xgb` identical to `voting_by_lgb` — the path presumably
# should point at a separate LightGBM+XGBoost ensemble file; confirm.
pred7 = predict_with_voting_model('./models/voting_ensemble_model.pkl', test_x)
print("预测成功！")

# Store the predictions as a submission column
sub['voting_by_lgb_and_xgb'] = pred7

预测成功！


In [23]:
sub.head()

Unnamed: 0,ID,label,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred,stacking_by_lgb_and_xgb_logistic,voting_by_lgb,voting_by_lgb_and_xgb
0,1,0.1,0.662789,0.299682,0.137528,0.139409,0.14026,0.232942,0.232942
1,4,0.1,0.092998,0.1122,0.223504,0.138446,0.314672,0.151084,0.151084
2,6,0.1,0.257431,0.218318,0.212645,0.230363,0.564087,0.233898,0.233898
3,8,0.1,0.287891,0.309741,0.344971,0.336017,0.635625,0.347767,0.347767
4,10,0.1,0.193517,0.187629,0.238918,0.217965,0.477566,0.204013,0.204013


In [24]:
# Remove the placeholder label column from the submission template; errors='ignore' keeps the
# cell safe to re-run once the column is already gone.
sub = sub.drop(columns = ['label'], errors='ignore')

In [25]:
sub.shape

(40000, 8)

In [26]:
# Every column after the first (ID) is a model prediction column; relies on column position.
list_co = list(sub.columns[1:])

In [27]:
list_co

['stacking_by_lgb',
 'stacking_by_lgb_and_xgb',
 'lgb_10fold_pred',
 'xgb_10fold_pred',
 'stacking_by_lgb_and_xgb_logistic',
 'voting_by_lgb',
 'voting_by_lgb_and_xgb']

### 各模型的阈值(依模型调整) //方案一

In [28]:
sub1 = sub.copy()

In [29]:
#测试阈值
threshold = {'1':0.5,
             '2':0.5,
             '3':0.5,
             '4':0.5,
             '5':0.5,
             '6':0.5,
             '7':0.5
            }

In [30]:
for idx,co in enumerate(list_co,1):
    sub1[co] = sub[co].apply(lambda x:1 if x>threshold[str(idx)] else 0)

In [31]:
sub1.head()

Unnamed: 0,ID,stacking_by_lgb,stacking_by_lgb_and_xgb,lgb_10fold_pred,xgb_10fold_pred,stacking_by_lgb_and_xgb_logistic,voting_by_lgb,voting_by_lgb_and_xgb
0,1,1,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0
2,6,0,0,0,0,1,0,0
3,8,0,0,0,0,1,0,0
4,10,0,0,0,0,0,0,0


### 投票

In [32]:
sub1['label'] = sub1.iloc[:,1:].mode(axis = 1)

In [33]:
sub1 = sub1[['ID','label']]

In [34]:
sub1.to_csv('./result/sub_by_voting.csv',index = False)

In [35]:
sub1.head()

Unnamed: 0,ID,label
0,1,0
1,4,0
2,6,0
3,8,0
4,10,0


### 各模型的auc(依模型调整) //方案二

In [36]:
#测试阈值
auc_score = {'1':0.1,
             '2':0.6,
             '3':0.5,
             '4':0.4,
             '5':0.2,
             '6':0.9,
             '7':0.3
            }

In [37]:
sum_auc_score = 0
for i in range(1,8):
    sum_auc_score +=auc_score[str(i)]

In [38]:
auc_weight = [auc_score[str(i)]/sum_auc_score for i in range(1,8)]

In [39]:
sub2 = sub.copy()

In [40]:
sub2['label'] = 0

In [41]:
for idx,co in enumerate(list_co,1):
    sub2['label'] += auc_weight[idx-1]*sub[co]

In [42]:
sub2 = sub2[['ID','label']]

In [43]:
sub2['label'] = sub2['label'].apply(lambda x:'%.4f'%x)

In [44]:
sub2 = sub2.sort_values(by = 'label',ascending = False)

In [45]:
sub2.to_csv('./result/sub_by_auc.csv', index = False)

### 各模型的F1(依模型调整) //方案三

In [70]:
#测试阈值
F1_score =  {'1':0.8,
             '2':0.5,
             '3':0.5,
             '4':0.2,
             '5':0.5,
             '6':0.5,
             '7':0.1
            }

In [71]:
sum_F1_score = 0
for i in range(1,8):
    sum_F1_score +=F1_score[str(i)]

In [72]:
F1_weight = [F1_score[str(i)]/sum_F1_score for i in range(1,8)]

In [73]:
sub3 = sub.copy()

In [74]:
sub3['label'] = 0

In [75]:
for idx,co in enumerate(list_co,1):
    sub3['label'] += F1_weight[idx-1]*sub[co]

In [76]:
sub3 = sub3[['ID','label']]

In [77]:
sub3['label'] = sub3['label'].apply(lambda x:'%.4f'%x)

In [78]:
sub3 = sub3.sort_values(by = 'label',ascending = False)

In [79]:
sub3.to_csv('./result/sub_by_f1.csv', index = False)

### 各模型的Ks(依模型调整) //方案四

In [94]:
#测试阈值
Ks_score =  {'1':0.8,
             '2':0.3,
             '3':0.5,
             '4':0.2,
             '5':0.5,
             '6':0.9,
             '7':0.8
            }

In [95]:
sum_Ks_score = 0
for i in range(1,8):
    sum_Ks_score +=Ks_score[str(i)]

In [96]:
Ks_weight = [Ks_score[str(i)]/sum_Ks_score for i in range(1,8)]

In [97]:
sub4 = sub.copy()

In [98]:
sub4['label'] = 0

In [99]:
for idx,co in enumerate(list_co,1):
    sub4['label'] += Ks_weight[idx-1]*sub[co]

In [100]:
sub4 = sub4[['ID','label']]

In [101]:
sub4['label'] = sub4['label'].apply(lambda x:'%.4f'%x)

In [102]:
sub4 = sub4.sort_values(by = 'label',ascending = False)

In [103]:
sub4.to_csv('./result/sub_by_ks.csv', index = False)

In [104]:
sub1.head()

Unnamed: 0,ID,label
0,1,0
1,4,0
2,6,0
3,8,0
4,10,0


In [105]:
sub2.head()

Unnamed: 0,ID,label
21997,54863,0.8605
24327,60770,0.8435
34848,87143,0.8423
37932,94798,0.8326
34130,85377,0.828


In [106]:
sub3.head()

Unnamed: 0,ID,label
21997,54863,0.8665
34848,87143,0.8566
24327,60770,0.855
37932,94798,0.8499
34130,85377,0.8417


In [107]:
sub4.head()

Unnamed: 0,ID,label
21997,54863,0.8636
34848,87143,0.8499
24327,60770,0.8456
37932,94798,0.8382
34130,85377,0.8374
