In [30]:
import warnings

warnings.simplefilter('ignore')

import os
import re
import gc
import json

import numpy as np
import pandas as pd

pd.set_option('max_columns', None)
pd.set_option('max_rows', 200)
pd.set_option('float_format', lambda x: '%.3f' % x)
# from tqdm.autonotebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

In [5]:
train = pd.read_csv('dataset/train_dataset.csv', sep='\t')
train.shape

(15016, 19)

In [7]:
test = pd.read_csv('dataset/test_dataset.csv', sep='\t')
test.shape

(10000, 18)

In [8]:
train.head()

Unnamed: 0,session_id,op_date,user_name,action,auth_type,ip,ip_location_type_keyword,ip_risk_level,location,client_type,browser_source,device_model,os_type,os_version,browser_type,browser_version,bus_system_code,op_target,risk_label
0,access:test_d:20180101111639:bBp1,2018/1/1 11:16,test_d,login,otp,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,chrome,chrome 90,coremail,management,0
1,access:test_d:20180101121524:OBSg,2018/1/1 12:15,test_d,login,qr,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,order-mgnt,sales,0
2,access:test_d:20180101151333:BpQN,2018/1/1 15:13,test_d,login,qr,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,chrome,chrome 90,order-mgnt,sales,0
3,access:test_d:20180101124502:hYQm,2018/1/1 12:45,test_d,sso,,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,oa,management,0
4,access:test_d:20180101202749:FkDK,2018/1/1 20:27,test_d,sso,,192.168.100.101,内网,1级,"{""first_lvl"":""成都分公司"",""sec_lvl"":""9楼"",""third_lvl...",web,desktop,think_pad_e460,windows,windows 10,edge,edge 93,order-mgnt,sales,0


In [13]:
types_dict = {
    'risk_label' : train['risk_label'].unique(),
    'ip_risk_level': train['ip_risk_level'].unique(),
    'ip_location_type_keyword': train['ip_location_type_keyword'].unique(),
    'action': train['action'].unique(),
    'browser_source': train['browser_source'].unique(),
    'client_type': train['client_type'].unique(),
}
types_dict

{'risk_label': array([0, 1]),
 'ip_risk_level': array(['1级', '2级', '3级'], dtype=object),
 'ip_location_type_keyword': array(['内网', '家庭宽带', '代理IP'], dtype=object),
 'action': array(['login', 'sso'], dtype=object),
 'browser_source': array(['desktop'], dtype=object),
 'client_type': array(['web'], dtype=object)}

In [14]:
data = pd.concat([train, test])
data.shape

(25016, 19)

In [20]:
data['location_first_lvl'] = data['location'].astype(str).map(lambda x: json.loads(x)['first_lvl'])
data['location_sec_lvl'] = data['location'].astype(str).map(lambda x: json.loads(x)['sec_lvl'])
data['location_third_lvl'] = data['location'].astype(str).map(lambda x: json.loads(x)['third_lvl'])

In [24]:
for col in data.columns:
    print(col + ':' +str(data[col].isna().mean()))

session_id:0.0
op_date:0.0
user_name:0.0
action:0.0
auth_type:0.5036776463063639
ip:0.0
ip_location_type_keyword:0.0
ip_risk_level:0.0
location:0.0
client_type:0.0
browser_source:0.0
device_model:0.0
os_type:0.0
os_version:0.0
browser_type:0.0
browser_version:0.0
bus_system_code:0.0
op_target:0.0
risk_label:0.3997441637352095
location_first_lvl:0.0
location_sec_lvl:0.0
location_third_lvl:0.0


In [35]:
data['auth_type'].fillna('__NaN__', inplace=True)
data.drop(['client_type', 'browser_source'], axis=1, inplace=True)

In [31]:
for col in ['user_name', 'action', 'auth_type', 'ip',
                 'ip_location_type_keyword', 'ip_risk_level', 'location', 'device_model',
                 'os_type', 'os_version', 'browser_type', 'browser_version',
                 'bus_system_code', 'op_target', 'location_first_lvl', 'location_sec_lvl',
                 'location_third_lvl']:
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col])

In [36]:
data['op_date'] = pd.to_datetime(data['op_date'])
data['op_ts'] = data["op_date"].values.astype(np.int64) // 10 ** 9 # 操作时间戳信息
data = data.sort_values(by=['user_name', 'op_ts']).reset_index(drop=True)
data['last_ts'] = data.groupby(['user_name'])['op_ts'].shift(1) # 最近一次操作时间戳信息
data['ts_diff1'] = data['op_ts'] - data['last_ts'] # 操作时间戳间隔

In [40]:
# 登录ip、登录地、登录设备、登录OS版本、登录浏览器版本类型不一致的数量
for f in ['ip', 'location', 'device_model', 'os_version', 'browser_version']:
    data[f'user_{f}_nunique'] = data.groupby(['user_name'])[f].transform('nunique')

In [41]:
data.head()

Unnamed: 0,session_id,op_date,user_name,action,auth_type,ip,ip_location_type_keyword,ip_risk_level,location,device_model,os_type,os_version,browser_type,browser_version,bus_system_code,op_target,risk_label,location_first_lvl,location_sec_lvl,location_third_lvl,op_ts,last_ts,ts_diff1,user_ip_nunique,user_location_nunique,user_device_model_nunique,user_os_version_nunique,user_browser_version_nunique,ts_diff1_mean_user_name,ts_diff1_mean_ip,ts_diff1_mean_location,ts_diff1_mean_device_model,ts_diff1_mean_os_version,ts_diff1_mean_browser_version,ts_diff1_max_user_name,ts_diff1_max_ip,ts_diff1_max_location,ts_diff1_max_device_model,ts_diff1_max_os_version,ts_diff1_max_browser_version,ts_diff1_min_user_name,ts_diff1_min_ip,ts_diff1_min_location,ts_diff1_min_device_model,ts_diff1_min_os_version,ts_diff1_min_browser_version,ts_diff1_std_user_name,ts_diff1_std_ip,ts_diff1_std_location,ts_diff1_std_device_model,ts_diff1_std_os_version,ts_diff1_std_browser_version,ts_diff1_sum_user_name,ts_diff1_sum_ip,ts_diff1_sum_location,ts_diff1_sum_device_model,ts_diff1_sum_os_version,ts_diff1_sum_browser_version,ts_diff1_median_user_name,ts_diff1_median_ip,ts_diff1_median_location,ts_diff1_median_device_model,ts_diff1_median_os_version,ts_diff1_median_browser_version,ts_diff1_prod_user_name,ts_diff1_prod_ip,ts_diff1_prod_location,ts_diff1_prod_device_model,ts_diff1_prod_os_version,ts_diff1_prod_browser_version
0,access:test_a:20180107090803:WZc0,2018-01-07 09:08:00,0,0,1,2,1,0,3,1,1,1,1,3,5,0,0.0,1,1,3,1515316080,,,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,23323.71,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,14983200.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,314087.243,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,230321640.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
1,access:test_a:20180107092734:9863,2018-01-07 09:27:00,0,0,2,2,1,0,3,1,1,1,0,1,5,0,1.0,1,1,3,1515317220,1515316080.0,1140.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,29514.917,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,21636900.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,412996.74,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,291961560.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
2,access:test_a:20180107093226:6qbJ,2018-01-07 09:32:00,0,0,3,2,1,0,3,1,1,1,1,3,4,3,0.0,1,1,3,1515317520,1515317220.0,300.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,23323.71,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,14983200.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,314087.243,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,230321640.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
3,access:test_a:20180107093834:QJrw,2018-01-07 09:38:00,0,1,0,2,1,0,3,1,1,1,0,1,5,0,0.0,1,1,3,1515317880,1515317520.0,360.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,29514.917,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,21636900.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,412996.74,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,291961560.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
4,access:test_a:20180107094058:vfu7,2018-01-07 09:40:00,0,0,1,2,1,0,3,1,1,1,1,3,3,2,0.0,1,1,3,1515318000,1515317880.0,120.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,23323.71,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,14983200.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,314087.243,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,230321640.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf


In [42]:
# 对用户名、ip、登录地、登录设备、登录系统、登录浏览器版本分组后求数学特征
for method in ['mean', 'max', 'min', 'std', 'sum', 'median','prod']:
    for col in ['user_name', 'ip', 'location', 'device_model', 'os_version', 'browser_version']:
        data[f'ts_diff1_{method}_' + str(col)] = data.groupby(col)['ts_diff1'].transform(method)

In [43]:
data.head()

Unnamed: 0,session_id,op_date,user_name,action,auth_type,ip,ip_location_type_keyword,ip_risk_level,location,device_model,os_type,os_version,browser_type,browser_version,bus_system_code,op_target,risk_label,location_first_lvl,location_sec_lvl,location_third_lvl,op_ts,last_ts,ts_diff1,user_ip_nunique,user_location_nunique,user_device_model_nunique,user_os_version_nunique,user_browser_version_nunique,ts_diff1_mean_user_name,ts_diff1_mean_ip,ts_diff1_mean_location,ts_diff1_mean_device_model,ts_diff1_mean_os_version,ts_diff1_mean_browser_version,ts_diff1_max_user_name,ts_diff1_max_ip,ts_diff1_max_location,ts_diff1_max_device_model,ts_diff1_max_os_version,ts_diff1_max_browser_version,ts_diff1_min_user_name,ts_diff1_min_ip,ts_diff1_min_location,ts_diff1_min_device_model,ts_diff1_min_os_version,ts_diff1_min_browser_version,ts_diff1_std_user_name,ts_diff1_std_ip,ts_diff1_std_location,ts_diff1_std_device_model,ts_diff1_std_os_version,ts_diff1_std_browser_version,ts_diff1_sum_user_name,ts_diff1_sum_ip,ts_diff1_sum_location,ts_diff1_sum_device_model,ts_diff1_sum_os_version,ts_diff1_sum_browser_version,ts_diff1_median_user_name,ts_diff1_median_ip,ts_diff1_median_location,ts_diff1_median_device_model,ts_diff1_median_os_version,ts_diff1_median_browser_version,ts_diff1_prod_user_name,ts_diff1_prod_ip,ts_diff1_prod_location,ts_diff1_prod_device_model,ts_diff1_prod_os_version,ts_diff1_prod_browser_version
0,access:test_a:20180107090803:WZc0,2018-01-07 09:08:00,0,0,1,2,1,0,3,1,1,1,1,3,5,0,0.0,1,1,3,1515316080,,,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,23323.71,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,14983200.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,314087.243,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,230321640.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
1,access:test_a:20180107092734:9863,2018-01-07 09:27:00,0,0,2,2,1,0,3,1,1,1,0,1,5,0,1.0,1,1,3,1515317220,1515316080.0,1140.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,29514.917,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,21636900.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,412996.74,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,291961560.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
2,access:test_a:20180107093226:6qbJ,2018-01-07 09:32:00,0,0,3,2,1,0,3,1,1,1,1,3,4,3,0.0,1,1,3,1515317520,1515317220.0,300.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,23323.71,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,14983200.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,314087.243,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,230321640.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
3,access:test_a:20180107093834:QJrw,2018-01-07 09:38:00,0,1,0,2,1,0,3,1,1,1,0,1,5,0,0.0,1,1,3,1515317880,1515317520.0,360.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,29514.917,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,21636900.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,412996.74,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,291961560.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf
4,access:test_a:20180107094058:vfu7,2018-01-07 09:40:00,0,0,1,2,1,0,3,1,1,1,1,3,3,2,0.0,1,1,3,1515318000,1515317880.0,120.0,5,5,4,4,8,10595.392,26796.424,26796.424,26421.976,26421.976,23323.71,3232920.0,21636900.0,21636900.0,21636900.0,21636900.0,14983200.0,120.0,120.0,120.0,120.0,120.0,120.0,104000.289,379514.305,379514.305,366936.304,366936.304,314087.243,94140060.0,474243120.0,474243120.0,522283200.0,522283200.0,230321640.0,840.0,840.0,840.0,840.0,840.0,840.0,inf,inf,inf,inf,inf,inf


In [44]:
train = data[data['risk_label'].notna()]
test = data[data['risk_label'].isna()]

print(train.shape, test.shape)

(15016, 70) (10000, 70)


In [45]:
ycol = 'risk_label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'session_id', 'op_date', 'last_ts'], train.columns))
feature_names

['user_name',
 'action',
 'auth_type',
 'ip',
 'ip_location_type_keyword',
 'ip_risk_level',
 'location',
 'device_model',
 'os_type',
 'os_version',
 'browser_type',
 'browser_version',
 'bus_system_code',
 'op_target',
 'location_first_lvl',
 'location_sec_lvl',
 'location_third_lvl',
 'op_ts',
 'ts_diff1',
 'user_ip_nunique',
 'user_location_nunique',
 'user_device_model_nunique',
 'user_os_version_nunique',
 'user_browser_version_nunique',
 'ts_diff1_mean_user_name',
 'ts_diff1_mean_ip',
 'ts_diff1_mean_location',
 'ts_diff1_mean_device_model',
 'ts_diff1_mean_os_version',
 'ts_diff1_mean_browser_version',
 'ts_diff1_max_user_name',
 'ts_diff1_max_ip',
 'ts_diff1_max_location',
 'ts_diff1_max_device_model',
 'ts_diff1_max_os_version',
 'ts_diff1_max_browser_version',
 'ts_diff1_min_user_name',
 'ts_diff1_min_ip',
 'ts_diff1_min_location',
 'ts_diff1_min_device_model',
 'ts_diff1_min_os_version',
 'ts_diff1_min_browser_version',
 'ts_diff1_std_user_name',
 'ts_diff1_std_ip',
 'ts_di

In [46]:
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=2 ** 8,
                           max_depth=16,
                           learning_rate=0.2,
                           n_estimators=10000,
                           subsample=0.75,
                           feature_fraction=0.55,
                           reg_alpha=0.2,
                           reg_lambda=0.2,
                           random_state=1983,
                           is_unbalance=True,
                           # scale_pos_weight=130,
                           metric='auc')

In [47]:
oof = []
prediction = test[['session_id']]
prediction[ycol] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=1983)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id + 1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['session_id', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test[:, 1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()

df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	train's auc: 0.77898	valid's auc: 0.527369


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	train's auc: 0.810892	valid's auc: 0.523122


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	train's auc: 0.837103	valid's auc: 0.533681


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[79]	train's auc: 0.955192	valid's auc: 0.490235


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	train's auc: 0.875188	valid's auc: 0.50834


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[53]	train's auc: 0.943001	valid's auc: 0.517531


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6]	train's auc: 0.794904

Unnamed: 0,column,importance
0,op_ts,1449.05
1,ts_diff1,1098.25
2,bus_system_code,507.1
3,auth_type,308.15
4,user_name,185.4
5,ts_diff1_mean_user_name,175.6
6,ts_diff1_sum_user_name,165.6
7,op_target,150.55
8,browser_version,136.75
9,browser_type,117.35


In [48]:
df_oof = pd.concat(oof)
print('roc_auc_score', roc_auc_score(df_oof[ycol], df_oof['pred']))

roc_auc_score 0.5244012833685976


In [49]:
prediction['id'] = range(len(prediction))
prediction['id'] = prediction['id'] + 1
prediction = prediction[['id', 'risk_label']].copy()
prediction.columns = ['id', 'ret']
prediction.head()

Unnamed: 0,id,ret
6147,1,0.302
6148,2,0.428
6149,3,0.42
6150,4,0.437
6151,5,0.409


In [51]:
from time import time
timestamp = str(time())
prediction.to_csv(os.path.join('./result/',timestamp + '.csv'), index=False)