In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import time
from itertools import combinations

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [2]:
seed = 2021

In [3]:
df_train = pd.read_csv('/home/mw/input/pre8881/train.csv')
df_test = pd.read_csv('/home/mw/input/pretest_a3048/test_a.csv')

In [4]:
df_feature = df_train.append(df_test, sort=False)

In [5]:
df_feature['tp_ratio'] = df_feature['nprem_tp'] / df_feature['si_tp']

In [6]:
# 计数
for f in [['dpt'], ['client_no'], ['trademark_cn'], ['brand_cn'], ['make_cn'], ['series']]:
    df_temp = df_feature.groupby(f).size().reset_index()
    df_temp.columns = f + ['{}_count'.format('_'.join(f))]
    df_feature = df_feature.merge(df_temp, how='left')

In [7]:
df_feature['birth_month'] = df_feature['birth_month'].apply(
    lambda x: int(x[:-1]) if type(x) != float else 0)

In [20]:
# 简单统计
def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()

    return df_merge


def statis_feat(df_know, df_unknow):
    for f in tqdm(['p1_census_register', 'dpt','p2_marital_status',\
    'trademark_cn','capab','xz','xb','series','seats','birth_month',\
    'p1_gender','p1_age','f1_child_flag','p2_client_grade','w1_pc_wx_use_flag']):
        df_unknow = stat(df_know, df_unknow, [f], {
                         'y1_is_purchase': ['mean']})

    return df_unknow

In [21]:
# 5折交叉
df_train = df_feature[~df_feature['y1_is_purchase'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['y1_is_purchase'].isnull()]

df_stas_feat = None
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for train_index, val_index in kfold.split(df_train, df_train['y1_is_purchase']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del(df_fold_train)
    del(df_fold_val)
    gc.collect()

df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del(df_stas_feat)
del(df_train)
del(df_test)
gc.collect()

100%|██████████| 15/15 [00:02<00:00,  5.96it/s]
100%|██████████| 15/15 [00:02<00:00,  5.84it/s]
100%|██████████| 15/15 [00:02<00:00,  5.92it/s]
100%|██████████| 15/15 [00:02<00:00,  5.46it/s]
100%|██████████| 15/15 [00:02<00:00,  5.49it/s]
100%|██████████| 15/15 [00:02<00:00,  6.15it/s]


29

In [22]:
df_feature.head()

Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,tp_ratio,dpt_count,client_no_count,trademark_cn_count,brand_cn_count,make_cn_count,series_count,p1_census_register_y1_is_purchase_mean_x,dpt_y1_is_purchase_mean_x,p2_marital_status_y1_is_purchase_mean_x,trademark_cn_y1_is_purchase_mean_x,capab_y1_is_purchase_mean_x,client_no_y1_is_purchase_mean,p1_census_register_y1_is_purchase_mean_y,dpt_y1_is_purchase_mean_y,p2_marital_status_y1_is_purchase_mean_y,trademark_cn_y1_is_purchase_mean_y,capab_y1_is_purchase_mean_y,xz_y1_is_purchase_mean_x,xb_y1_is_purchase_mean_x,series_y1_is_purchase_mean_x,seats_y1_is_purchase_mean_x,p1_census_register_y1_is_purchase_mean,dpt_y1_is_purchase_mean,p2_marital_status_y1_is_purchase_mean,trademark_cn_y1_is_purchase_mean,capab_y1_is_purchase_mean,xz_y1_is_purchase_mean_y,xb_y1_is_purchase_mean_y,series_y1_is_purchase_mean_y,seats_y1_is_purchase_mean_y,birth_month_y1_is_purchase_mean,p1_gender_y1_is_purchase_mean,p1_age_y1_is_purchase_mean,f1_child_flag_y1_is_purchase_mean,p2_client_grade_y1_is_purchase_mean,w1_pc_wx_use_flag_y1_is_purchase_mean
0,148403,211,2,0,nLHuTvmARSCZ/ndFwcoJ7NXw5jDXXQpUxm3zjL72ByA=,2888.0,0.6,161800,1,0,2010-02-26 00:00:00,104,48,902,671,1.997,5,1,1,1020.72,46274.8,747.23,1000000,0.0,0.0,0.0,0,0.0,0,30.0,0.0,180.0,0.0,0.0,0.0,0.0,240000.0,0.0,0.0,0.0,,0,51.0,416,2,2,2,0.0,1,2,2,0,0,0.0,0.0,17.0,64.0,1,1,12,0.0,0.0,,0.0,0.0,0.000747,6452,1.0,43168,43980,1037,5430.0,0.646077,0.654333,0.624961,0.650097,0.658737,,0.653683,0.66035,0.624612,0.650113,0.658353,0.685505,0.718677,0.642069,0.639666,0.643585,0.65722,0.624965,0.650944,0.660688,0.685427,0.718745,0.631318,0.639551,0.628127,0.625141,0.651658,0.64193,0.348526,0.644165
1,735470,213,2,1,WbmN8Dw9JB8eX+8CTiBd1X4rjGpwvTSeejtlkrBFu1Q=,1371.0,0.6,132800,6,0,2007-01-18 00:00:00,29,38,2855,599,1.598,5,1,1,0.0,0.0,614.01,1000000,0.0,0.0,0.0,0,0.0,0,30.0,564.9,574.9,0.0,153.0,0.0,3940000.0,3940000.0,0.0,240000.0,0.0,,0,61.0,360,0,1,1,0.0,2,2,1,0,0,0.0,1.0,18.0,134.0,1,2,10,0.0,0.0,17947.0,50638.0,1.0,0.000614,10062,1.0,45005,30407,461,14588.0,0.680288,0.646865,0.586444,0.642397,0.654883,,0.681004,0.657618,0.586633,0.642295,0.655494,0.685505,0.581749,0.61008,0.639666,0.680622,0.655518,0.587568,0.643592,0.65494,0.685427,0.581407,0.616049,0.639551,0.631658,0.625141,0.618156,0.629261,0.671066,0.644165
2,769384,220,2,0,xszecXinKVm7eoA+SDKOZcbUl69XJNXnIRS7ejxXqJM=,4234.0,0.6,298900,5,0,2017-02-07 00:00:00,29,25,2739,806,1.998,5,1,1,1958.65,234337.6,974.61,1000000,0.0,0.0,0.0,0,0.0,0,30.0,1064.9,0.0,0.0,470.0,280.0,4378000.0,0.0,0.0,1278000.0,228000.0,,0,61.0,307,2,1,1,0.0,2,1,1,0,0,0.0,9.0,32.0,49.0,1,2,11,0.0,0.0,,293.0,1.0,0.000975,24358,1.0,45005,2702,1248,3450.0,0.578882,0.56974,0.624961,0.642397,0.639125,,0.577127,0.570482,0.624612,0.642295,0.643437,0.685505,0.718677,0.726934,0.639666,0.584521,0.572555,0.624965,0.643592,0.641576,0.685427,0.718745,0.726562,0.639551,0.633124,0.625141,0.618156,0.629261,0.671066,0.605999
3,193652,212,2,0,p6G9XnUFGaRNfDMUrgOl8f7wfp1yqczfBg6kJ2HpZ68=,11091.0,0.85,810000,6,0,2009-02-18 00:00:00,80,259,5946,119,2.996,5,1,1,6163.35,173340.0,1151.0,1000000,0.0,0.0,22.22,10000,56.36,10000,45.0,190.0,0.0,0.0,153.0,0.0,1050000.0,0.0,0.0,240000.0,0.0,,0,61.0,418,2,1,2,0.0,6,2,1,2,2,0.0,0.0,6.0,7.0,1,2,4,0.0,0.0,,5.0,0.0,0.001151,41037,1.0,11222,4948,328,575.0,0.671236,0.607833,0.624961,0.583928,0.622736,,0.667291,0.606052,0.624612,0.578295,0.622462,0.685505,0.718677,0.609418,0.639666,0.659658,0.609265,0.624965,0.582872,0.624682,0.685427,0.718745,0.615804,0.639551,0.635276,0.625141,0.618156,0.629261,0.780561,0.644165
4,134705,206,2,1,X4qqUlsri+lZWZ62pS5H5lYl9Y1DMv7STKd/HddbouE=,1756.2,0.6,169800,3,0,2017-01-25 00:00:00,87,238,2443,567,1.991,7,1,1,0.0,0.0,857.57,1000000,0.0,0.0,0.0,0,0.0,0,17.0,9.9,0.0,0.0,0.0,0.0,1000000.0,0.0,0.0,0.0,0.0,,0,44.0,347,2,1,2,37.75,2,2,1,0,0,0.0,0.0,5.0,10.0,1,1,3,0.0,0.0,,2.0,1.0,0.000858,34301,1.0,7013,6996,534,643.0,0.675887,0.623705,0.624961,0.70844,0.604453,,0.684524,0.621797,0.624612,0.706574,0.603489,0.685505,0.581749,0.68408,0.62029,0.67364,0.624821,0.624965,0.705444,0.608603,0.685427,0.581407,0.674699,0.62152,0.63217,0.625141,0.648514,0.629261,0.671066,0.644165


In [23]:
df_feature.shape

(871208, 102)

### 模型训练

In [24]:
for f in list(df_feature.select_dtypes('object')):
    if f in ['carid', 'regdate']:
        continue
    le = LabelEncoder()
    df_feature[f] = le.fit_transform(
        df_feature[f].astype('str')).astype('int')

In [25]:
df_train = df_feature[df_feature['y1_is_purchase'].notnull()]
df_test = df_feature[df_feature['y1_is_purchase'].isnull()]

In [26]:
ycol = 'y1_is_purchase'
drop_col=['use_type','num_notcar_claim',\
'p2_is_child_under_15_family','suiche_nonauto_amount_18','p2_is_adult_over_55_family',\
'p2_is_smeowner','suiche_nonauto_nprem_18','p2_is_enterprise_owner']
drop_col=['use_type','num_notcar_claim','p2_is_child_under_15_family','suiche_nonauto_nprem_18']
feature_names = list(
    filter(lambda x: x not in [ycol, 'regdate', 'carid',]+drop_col, df_train.columns))

model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=8,
                           learning_rate=0.05,
                           n_estimators=2000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           n_jobs=2,
                           metric=None)

oof = []
prediction = df_test[['carid']]
prediction['label'] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(
        df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid'],
                          eval_set=[(X_val, Y_val)],
                          verbose=400,
                          eval_metric='auc',
                          early_stopping_rounds=50,)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_train.iloc[val_idx][[
        'carid', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['label'] += pred_test / 5

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds.
[400]	valid's auc: 0.896256
[800]	valid's auc: 0.897295
Early stopping, best iteration is:
[827]	valid's auc: 0.897312


Training until validation scores don't improve for 50 rounds.
[400]	valid's auc: 0.897447
[800]	valid's auc: 0.898322
Early stopping, best iteration is:
[978]	valid's auc: 0.898456


Training until validation scores don't improve for 50 rounds.
[400]	valid's auc: 0.895494
[800]	valid's auc: 0.896487
Early stopping, best iteration is:
[809]	valid's auc: 0.896508


Training until validation scores don't improve for 50 rounds.
[400]	valid's auc: 0.897117
[800]	valid's auc: 0.898155
Early stopping, best iteration is:
[1009]	valid's auc: 0.898482


Training until validation scores don't improve for 50 rounds.
[400]	valid's auc: 0.895363
[800]	valid's auc: 0.896192
Early stopping, best iteration is:
[821]	valid's auc: 0.896224


In [27]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,p1_prior_days_to_insure,2290.8
1,suiche_nonauto_nprem_20,1792.8
2,suiche_nonauto_amount_20,1786.4
3,tp_ratio,1733.8
4,dpt,1486.4
5,p2_client_grade_y1_is_purchase_mean,1420.0
6,dpt_count,1373.6
7,suiche_nonauto_nprem_19,1271.8
8,active_365,1246.4
9,nprem_tp,1200.4


In [28]:
df_oof = pd.concat(oof)
score = roc_auc_score(df_oof['y1_is_purchase'], df_oof['pred'])

In [29]:
score

0.8973972287142493

In [24]:
df_oof.head(20)

Unnamed: 0,carid,y1_is_purchase,pred
6,DC0L4K+KmhXXNhWyHomtC7XKbQeCcYeKRKxQFMjH2HE=,1.0,0.867365
8,2/ZMZrX50dKrGo5BPd+oOniQC4pnA0vyass/OYK7gxQ=,1.0,0.835398
13,oigWb+fdFD9HX9JV+SwfuCD3zDI52l/1wGTSVGw28Sc=,0.0,0.000483
18,aucvSqOHSB7lKzqXw+BMb/kFWm/m6Uo2wxxGAEgg/lE=,1.0,0.866114
19,i2Op1vhPz30Fm+SyaoSkx7QCZ1KzuU3eYLoQAjAIlxw=,0.0,0.505387
24,8B4skhHUQDdqwGNCNUOHPbBw+KplHUJjFbio4g3hPMk=,1.0,0.918201
30,NZZcjXsF3i+DZ3OKhvYhQkv+r2lxJLZHv5IUmgHLxeA=,0.0,0.130239
38,nLHuTvmARSCZ/ndFwcoJ7NXw5jDXXQpUxm3zjL72ByA=,0.0,0.612278
45,qL8ccRXL0IcqNwnFiCoWxYFX2vdQAU24lKaOt+mPzeg=,1.0,0.566524
46,WbmN8Dw9JB8eX+8CTiBd1X4rjGpwvTSeejtlkrBFu1Q=,1.0,0.907598


In [26]:
prediction.head()

Unnamed: 0,carid,label
0,ZhCSxFMK2Pv4mkVwSNQeF0HwKQpm78mD4OA4t/gX79k=,0.567877
1,MN3k6VEj1c1yflYHmZbnChWcB4YFunAJxn2a/oNibMo=,0.230213
2,q0hlS2GQL/TvAvJ4QaNYOmnostsrxOd47UguA9jfsL0=,0.74563
3,vq73YiH8neXWAG6GLRqKNmtbcMt6VexhGjdGHpqdVTg=,0.849913
4,IAJ37++ziuqd5wAgRKP5maZjY0hmgqo5D5Wi1CENlsM=,0.722855


In [30]:
os.makedirs('sub', exist_ok=True)
prediction.to_csv(f'sub/{score}.csv', index=False)
prediction.to_csv(f'sub/sub.csv', index=False)

In [34]:
# !./heywhale_submit -token 0fd18b304bae4542 -file sub/sub.csv

Heywhale Submit Tool 4.0.0

> 已验证Token
> 提交文件 sub/sub.csv (11782.30 KiB), Target Qiniu
> 已上传 100 %
> 文件已上传        
> 服务器响应: 200 提交成功，请等待评审完成
> 提交完成
