In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import cross_val_score,StratifiedKFold,KFold
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
import joblib

In [55]:
df = pd.read_csv('../data/train.csv')
df

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,1488013,0,3,1,10,0,0,0,0,0,...,4,1,9,6,0,1,1,0,1,1
595208,1488016,0,5,1,3,0,0,0,0,0,...,4,1,3,8,1,0,1,0,1,1
595209,1488017,0,1,1,10,0,0,1,0,0,...,3,2,2,6,0,0,1,0,0,0
595210,1488021,0,5,2,3,1,0,0,0,1,...,4,1,4,2,0,1,1,1,0,0


In [56]:
# 数据集划分
X_columns = df.columns.drop(['id', 'target'])
y_column = ['target']

In [57]:
# 损失函数
#def micro_f1_scorer(y, y_pred):
#    return f1_score(y, y_pred, average='micro')

def gini(actual, pred):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float64)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

gini_normalized_score = make_scorer(gini_normalized)

In [58]:
# 模型训练
models = LGBMRegressor()

scores = cross_val_score(model, df[X_columns], df[y_column], cv=5, scoring=gini_normalized_score)

In [59]:
scores

array([0.27417485, 0.27214519, 0.27408007, 0.28001267, 0.26911096])

In [60]:
X = df[X_columns]
y = df[y_column]

In [61]:
kf = StratifiedKFold(n_splits=5)
models = []

for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    # 训练模型
    model.fit(X_train, y_train)
    
    # 保存模型
    model_filename = f'../models/lgb_model_fold_{fold+1}.pkl'
    joblib.dump(model, model_filename)
    
    # 将模型保存到列表中（可选）
    models.append(model)
    
    # 输出每折的准确率（可选）
    y_pred = model.predict(X_test)
    score = gini_normalized(y_test, y_pred)
    print(f'Fold {fold+1} Score: {score:.4f}')

Fold 1 Score: 0.2749
Fold 2 Score: 0.2718
Fold 3 Score: 0.2756
Fold 4 Score: 0.2818
Fold 5 Score: 0.2685


In [62]:
pred_data = pd.read_csv('../data/test.csv')

In [63]:
submit_data = pd.DataFrame()
submit_data['id'] = pred_data['id']
submit_data['target'] = pd.DataFrame([model.predict(pred_data[X_columns]) for model in models]).mean()

In [64]:
submit_data

Unnamed: 0,id,target
0,0,0.028495
1,1,0.028627
2,2,0.025337
3,3,0.017160
4,4,0.035393
...,...,...
892811,1488022,0.143192
892812,1488023,0.030322
892813,1488024,0.041718
892814,1488025,0.029469


In [65]:
submit_data.to_csv('lgb_stratified_5fold_submission.csv', index=False, sep=',')

In [66]:
kf = KFold(n_splits=5)
models = []

for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    # 训练模型
    model.fit(X_train, y_train)
    
    # 保存模型
    model_filename = f'../models/lgb_model_random_fold_{fold+1}.pkl'
    joblib.dump(model, model_filename)
    
    # 将模型保存到列表中（可选）
    models.append(model)
    
    # 输出每折的准确率（可选）
    y_pred = model.predict(X_test)
    score = gini_normalized(y_test, y_pred)
    print(f'Fold {fold+1} Score: {score:.4f}')

Fold 1 Score: 0.2742
Fold 2 Score: 0.2721
Fold 3 Score: 0.2741
Fold 4 Score: 0.2800
Fold 5 Score: 0.2691


In [67]:
submit_data = pd.DataFrame()
submit_data['id'] = pred_data['id']
submit_data['target'] = pd.DataFrame([model.predict(pred_data[X_columns]) for model in models]).mean()

submit_data.to_csv('lgb_random_5fold_submission.csv', index=False, sep=',')