# In [3]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, make_scorer, accuracy_score

def _take_rows(data, index):
    """Select rows by position from a pandas object (``.iloc``) or an array."""
    return data.iloc[index] if hasattr(data, 'iloc') else data[index]


def _fit_predict_lgbm(clf, x_fit, y_fit, x_val, y_val, X_test):
    """Train one LightGBM fold; return (validation, test) probabilities."""
    # Build the Dataset objects LightGBM's native API trains on.
    train_data = clf.Dataset(data=x_fit, label=y_fit, weight=None)
    val_data = clf.Dataset(data=x_val, label=y_val, weight=None)

    lgbm_params = {
        'boosting_type': 'gbdt',  # boosting method
        'objective': 'binary',  # learning task / loss function
        'min_child_weight': 6,  # minimal sum of hessians in one leaf
        'num_leaves': 2 ** 6,  # maximum number of leaves per tree
        'lambda_l2': 10,  # L2 regularisation strength
        'feature_fraction': 0.8,  # share of features sampled per iteration
        'bagging_fraction': 0.8,  # share of rows sampled when bagging
        'bagging_freq': 4,  # perform bagging every 4 iterations
        'learning_rate': 0.25,
        'seed': 42,
        'nthread': -1,
        'verbose': -1,
    }
    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from `lightgbm.train` in LightGBM 4.0.
    model = clf.train(params=lgbm_params,
                      train_set=train_data,
                      num_boost_round=1000,
                      valid_sets=[train_data, val_data],
                      callbacks=[clf.early_stopping(stopping_rounds=100,
                                                    verbose=False)])
    val_pred = model.predict(x_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    return val_pred, test_pred


def _fit_predict_xgb(clf, x_fit, y_fit, x_val, y_val, X_test):
    """Train one XGBoost fold; return (validation, test) probabilities."""
    # The native XGBoost API trains and predicts on DMatrix objects only.
    train_data = clf.DMatrix(x_fit, label=y_fit)
    val_data = clf.DMatrix(x_val, label=y_val)
    test_data = clf.DMatrix(X_test)

    # 'num_class' is deliberately not set: it only applies to the
    # multi-class objectives, not to 'binary:logistic'.
    xgb_params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'max_depth': 6,
        'lambda': 10,
        'subsample': 0.7,  # share of rows sampled per tree
        'colsample_bytree': 0.7,  # share of features sampled per tree
        'colsample_bylevel': 0.7,  # share of features sampled per tree level
        'eta': 0.2,  # learning rate
        'tree_method': 'hist',  # tree construction algorithm
        'seed': 42,
    }
    watchlist = [(train_data, 'train'), (val_data, 'eval')]
    model = clf.train(xgb_params,
                      train_data,
                      num_boost_round=2000,
                      evals=watchlist,
                      verbose_eval=1000,
                      early_stopping_rounds=100)
    return model.predict(val_data), model.predict(test_data)


def _fit_predict_catboost(clf, x_fit, y_fit, x_val, y_val, X_test):
    """Train one CatBoost fold; return (validation, test) probabilities."""
    params = {
        'learning_rate': 0.2,
        'depth': 6,
        'bootstrap_type': 'Bernoulli',
        'random_seed': 11,
        'od_type': 'Iter',
        'od_wait': 100,
        'allow_writing_files': False,
    }
    model = clf(iterations=2000, **params)
    model.fit(x_fit,
              y_fit,
              eval_set=(x_val, y_val),
              metric_period=1000,
              use_best_model=True,
              cat_features=[],
              verbose=1)
    # predict_proba yields one column per class; column 1 holds the
    # positive-class probability, matching what the other backends return.
    val_pred = model.predict_proba(x_val)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]
    return val_pred, test_pred


# Maps each supported `clf_name` to the routine that trains a single fold.
_FOLD_TRAINERS = {
    'lgbm': _fit_predict_lgbm,
    'xgb': _fit_predict_xgb,
    'catboost': _fit_predict_catboost,
}


def cv_model(clf, X_train, y_train, X_test, clf_name, folds=5, seed=42):
    """Run K-fold cross-validation for a binary gradient-boosting model.

    For every fold a fresh model is trained on the training part, scored
    with F1 (threshold 0.5) on the validation part, and used to predict
    the test set. The per-fold F1 scores are printed as they accumulate.

    params:
        clf: model provider. For 'lgbm' and 'xgb' this is the library
            module itself (it must expose ``Dataset``/``DMatrix`` and
            ``train``); for 'catboost' it is the classifier class.
        X_train: training features (DataFrame, or array indexable by rows)
        y_train: training labels, aligned with the rows of X_train
        X_test: test features
        clf_name: one of 'lgbm', 'xgb', 'catboost'
        folds: number of folds
        seed: random seed used to shuffle the folds

    returns:
        (oof, test_predict): out-of-fold positive-class probabilities for
        every training row, and test-set probabilities averaged over folds.

    raises:
        ValueError: if clf_name is not a supported model name.
    """
    # Fail fast instead of hitting an undefined `val_pred` inside the loop.
    if clf_name not in _FOLD_TRAINERS:
        raise ValueError(
            "Unsupported clf_name %r; expected one of %s"
            % (clf_name, sorted(_FOLD_TRAINERS))
        )
    fit_predict = _FOLD_TRAINERS[clf_name]

    # Work on a plain array so fold indices are always positional, even
    # when y_train is a Series with a non-default index.
    labels = np.asarray(y_train).ravel()

    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    cv_scores = []
    oof = np.zeros(X_train.shape[0])
    test_predict = np.zeros(X_test.shape[0])

    for train_index, val_index in kfold.split(X_train, labels):
        print("********************** || **********************")
        # Split into this fold's training and validation parts.
        x_train_fold = _take_rows(X_train, train_index)
        x_val_fold = _take_rows(X_train, val_index)
        y_train_fold, y_val_fold = labels[train_index], labels[val_index]

        val_pred, test_pred = fit_predict(
            clf, x_train_fold, y_train_fold, x_val_fold, y_val_fold, X_test
        )

        oof[val_index] = val_pred
        test_predict += test_pred / folds

        cv_scores.append(f1_score(y_val_fold, np.where(val_pred > 0.5, 1, 0)))
        print(cv_scores)

    return oof, test_predict
            





            

