In [30]:
import ast
import re
import warnings
import logging
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')
# Configure the root logger: "<timestamp> : <level> : <message>" at INFO level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from sklearn.decomposition import PCA
import random
# Single seed value: used to seed Python's `random` module here and read as a
# module-level constant by the modelling code further down.
random_state = 2023
random.seed(random_state)
# Outlier detection
from sklearn.ensemble import IsolationForest
import os
import time

import seaborn as sns
from scipy import stats 
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt


from sklearn.metrics import roc_auc_score,f1_score

from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import lightgbm as lgb

# from hyperopt import fmin, tpe, hp, Trials, space_eval, partial

import datetime

from sklearn.preprocessing import StandardScaler
import time

In [27]:
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
def lgbModel(y, train_scaler_X, test_scaler_X, feature_name, params=None, n_splits = 5, n_jobs=8):
    """Train a LightGBM binary classifier with stratified K-fold cross-validation.

    Parameters
    ----------
    y : array-like
        Binary target, one entry per row of ``train_scaler_X``.
    train_scaler_X : pandas.DataFrame
        Training frame; only the columns listed in ``feature_name`` are used.
    test_scaler_X : pandas.DataFrame
        Test frame; must contain every column listed in ``feature_name``.
    feature_name : sequence of str
        Names of the feature columns.
    params : dict, optional
        LightGBM training parameters. When ``None`` a default binary/AUC
        configuration is built (``n_jobs`` is only used in that case).
    n_splits : int
        Number of stratified folds.
    n_jobs : int
        Thread count placed in the default parameter set.

    Returns
    -------
    ovr_oof : numpy.ndarray
        Out-of-fold predicted probabilities for the training rows.
    ovr_preds : numpy.ndarray
        Test predictions averaged over the ``n_splits`` fold models.
    importance : numpy.ndarray
        Gain feature importance summed (not averaged) over the fold models,
        in ``feature_name`` order.

    The out-of-fold AUC is printed before returning.
    """
    feas = list(feature_name)

    ovr_oof = np.zeros((len(train_scaler_X), ))
    ovr_preds = np.zeros((len(test_scaler_X),))
    importance = np.zeros((len(feas), ))

    # Default LightGBM parameters.
    if params is None:
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'max_depth': 5,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': -1
        }

    kf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Feature-only views, selected once instead of on every fold.
    train_features = train_scaler_X[feas]
    test_features = test_scaler_X[feas]
    # Positional access to the target works for Series (any index), arrays and lists.
    y_values = np.asarray(y)

    # Categorical features: every column with exactly two distinct values, plus
    # any object-dtype feature column not already selected.
    cat_features_indices = [col for col in feas if train_features[col].nunique() == 2]
    object_cols = train_features.select_dtypes(include=['object']).columns.to_list()
    # extend (not append): the list must stay a flat list of column names.
    cat_features_indices.extend(col for col in object_cols if col not in cat_features_indices)
    # NOTE(review): LightGBM may still reject raw object-dtype columns; presumably
    # they need converting to `category` upstream - confirm if such columns appear.
    print("二分类特征有：", len(cat_features_indices))

    # Model training.
    for fold_, (train_index, valid_index) in enumerate(kf.split(train_features, y_values)):
        print("--------------------------i {} --- fold n°{}--------------------------".format(1, fold_ + 1))
        # StratifiedKFold yields positional indices, so select rows by position
        # (label-based .loc is only equivalent for a default RangeIndex).
        X_train, X_valid = train_features.iloc[train_index], train_features.iloc[valid_index]
        y_train, y_valid = y_values[train_index], y_values[valid_index]

        # Build the LightGBM datasets for this fold.
        train_set = lgb.Dataset(X_train, y_train, feature_name=feas,
                                categorical_feature=cat_features_indices
                                )
        val_set = lgb.Dataset(X_valid, y_valid, feature_name=feas,
                              categorical_feature=cat_features_indices,
                              reference=train_set
                             )

        # Fresh callbacks per fold so no early-stopping state is shared between folds.
        callbacks = [log_evaluation(period=1000), early_stopping(stopping_rounds=300)]

        lgb_model = lgb.train(params,
                              train_set,
                              valid_sets=[train_set, val_set],
                              num_boost_round=10000,
                              callbacks=callbacks
                              )
        ovr_oof[valid_index] = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)

        # Each fold contributes 1/n_splits of the final test prediction.
        ovr_preds += lgb_model.predict(test_features, num_iteration=lgb_model.best_iteration) / n_splits

        importance += lgb_model.feature_importance(importance_type='gain')

    auc = roc_auc_score(y_values, ovr_oof)

    # Report the out-of-fold AUC.
    print("AUC：", auc)

    return ovr_oof, ovr_preds, importance

In [20]:
# Read the test-set CSV into a DataFrame.
test_data_path = '../data/A榜_20230720/testa.csv'
test_data = pd.read_csv(test_data_path)

In [21]:
# Read the training-set CSV into a DataFrame.
train_data_path = '../data/A榜_20230720/train_20230720.csv'
train_data = pd.read_csv(train_data_path)

In [22]:
# Value substituted for the '\N' missing-value placeholder.
fillna = 0.001


def _to_float_or_fill(raw):
    """Return `fillna` for the '\\N' placeholder, otherwise `raw` converted to float."""
    return fillna if raw in ['\\N'] else float(raw)


# arpu: single-month revenue
# avg_3mon_dou: three-month average data usage
for _frame in (train_data, test_data):
    for _col in ('arpu', 'avg_3mon_dou'):
        _frame[_col] = _frame[_col].apply(_to_float_or_fill)

In [23]:
# Columns removed from both the training and the test frame.
del_fea = ['unlimit_flag', 'newuser_flag']
for _frame in (train_data, test_data):
    _frame.drop(columns=del_fea, inplace=True)

In [24]:
# Keep the test ids and the training target before removing the non-feature columns.
test_id = test_data['id_no']
label = train_data['flag']

# Drop whichever of these columns each frame actually contains.
non_feature_cols = ('id_no', 'flag', 'is_test')
for _frame in (train_data, test_data):
    _present = [col for col in non_feature_cols if col in _frame.columns]
    _frame.drop(columns=_present, inplace=True)

In [25]:
# Every remaining training column except 'id_no' / 'label' is a model feature.
_excluded_cols = {'id_no', 'label'}
feature_name = [col for col in train_data.columns if col not in _excluded_cols]
print(len(feature_name))

58


In [28]:
# Number of stratified CV folds for this run.
n_splits = 8

# Train the cross-validated model and report the elapsed wall-clock time in minutes.
start_time = time.time()
ovr_oof, ovr_preds, importance = lgbModel(
    y=label,
    train_scaler_X=train_data,
    test_scaler_X=test_data,
    feature_name=feature_name,
    n_splits=n_splits,
)
end_time = time.time()
print(f"运行时间：{(end_time - start_time)/60}分")

二分类特征有： 7
--------------------------i 1 --- fold n°1--------------------------
Training until validation scores don't improve for 300 rounds
[1000]	training's auc: 0.965515	valid_1's auc: 0.911694
Early stopping, best iteration is:
[827]	training's auc: 0.959367	valid_1's auc: 0.911829
--------------------------i 1 --- fold n°2--------------------------
Training until validation scores don't improve for 300 rounds
[1000]	training's auc: 0.965868	valid_1's auc: 0.91141
Early stopping, best iteration is:
[986]	training's auc: 0.965378	valid_1's auc: 0.911457
--------------------------i 1 --- fold n°3--------------------------
Training until validation scores don't improve for 300 rounds
[1000]	training's auc: 0.966004	valid_1's auc: 0.909759
Early stopping, best iteration is:
[1111]	training's auc: 0.969421	valid_1's auc: 0.909943
--------------------------i 1 --- fold n°4--------------------------
Training until validation scores don't improve for 300 rounds
[1000]	training's auc: 0.965

In [31]:
# Choose the decision threshold that maximises 0.5*AUC + 0.5*F1 on the
# out-of-fold predictions. AUC does not depend on the threshold, so this
# selects the F1-maximising cut-off among 0.00, 0.01, ..., 1.00.
auc = roc_auc_score(label, ovr_oof)

candidate_thresholds = np.arange(0, 1.01, 0.01)
f1_per_threshold = [
    f1_score(label, (ovr_oof > cut).astype(int)) for cut in candidate_thresholds
]
blended_scores = [0.5*auc + 0.5*f1_value for f1_value in f1_per_threshold]

# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_position = int(np.argmax(blended_scores))
best_threshold = candidate_thresholds[best_position]
scc = blended_scores[best_position]

yhat = (ovr_oof > best_threshold).astype(int)
f1 = f1_score(label, yhat)
print(f"AUC: {auc}, f1: {f1}")
print("0.5*auc + 0.5*f1 确定阈值: {:.5f}, score: {:.5f}\n".format(best_threshold, scc))

AUC: 0.9115963375738335, f1: 0.697084318360914
0.5*auc + 0.5*f1 确定阈值: 0.34000, score: 0.80434



In [32]:
# Use the threshold selected by the target metric on the out-of-fold predictions
# rather than a constant copied by hand from an earlier run's printed output,
# so the submission stays consistent when the data or model changes.
threshold = best_threshold
# Submission frame: id, predicted probability and thresholded class.
subimt = pd.DataFrame()
subimt['id_no'] = test_id
subimt['predprob'] = ovr_preds
subimt['predtype'] = (ovr_preds > threshold).astype(int)
# Show the predicted class balance as the cell output.
subimt['predtype'].value_counts()

0    10795
1     3184
Name: predtype, dtype: int64

In [None]:
# Write the submission frame to CSV without the index column.
submission_path = '../data/A榜_20230720/subimt.csv'
subimt.to_csv(submission_path, index=False)