# 电信诈骗baseline

1. 测试样本84166条，训练样本202000条，合计286166条

286372,交易流水中出现的账户数量
286069,交易渠道中出现的账户数量

交易渠道一共800多万条数据

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import seaborn as sns
import logging
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit,train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
import scipy.signal as sg
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Columns kept when loading the transaction-flow table (used as `usecols`).
data_transa_keep_cols = [
    'cust_no',
    'tx_dt',
    'biz_type',
    'biz_line',
    'term_curr_ind',
    'cust_acct_num',
    'cust_age',
    'tx_amt_rmb',
    'credit_debit_ind',
    'tx_method',
    'tx_channel_type',
    'tx_area',
    'tx_nation_cd',
    'cnter_cust_ind',
    'cnter_cust_acct_num',
    'is_cross_tx_ind',
    'is_cross_bank_ind',
    'is_remote_ind',
    'account_bal',
]

In [None]:
# Columns kept when loading the static (channel) table (used as `usecols`).
data_static_keep_cols = [
    'data_dt',
    'cst_id',
    'mo_incmam',
    'comn_login_cnt1',
    'comn_login_cnt2',
    'comn_login_cnt3',
    'ibs_tran_time1',
    'ibs_tran_time2',
    'ibs_tran_time3',
    'mbs_tran_time1',
    'mbs_tran_time2',
    'mbs_tran_time3',
    'atm_tran_time1',
    'atm_tran_time2',
    'atm_tran_time3',
]

In [None]:
# Columns kept when loading the label table: account number and fraud flag.
data_label_keep_cols = [
    'cardno',
    'bad_flag',
]

In [None]:
# Load the input tables. Every path below is an empty placeholder and must be
# replaced with the correct file location before this cell is run.

# Transaction-flow table.
data_transa = pd.read_csv('', usecols=data_transa_keep_cols)
# Transaction-channel table.
data_static = pd.read_csv('', usecols=data_static_keep_cols)
# Training labels: 202000 account numbers, fewer than 202000 distinct customer ids.
data_label = pd.read_csv('', usecols=data_label_keep_cols)
# Test set: 84166 samples in total.
test_data = pd.read_csv('')

In [None]:
# Use one naming scheme across all tables: the customer id column is called
# 'cst_id' and the account number column is called 'cardno'.
data_transa.rename(
    columns={
        'cust_no': 'cst_id',
        'cust_acct_num': 'cardno',
    },
    inplace=True,
)

# Replace missing values with 0, in place, in both feature source tables.
for _frame in (data_transa, data_static):
    _frame.fillna(0, inplace=True)

# The test-set columns are renamed by position.
test_data.columns = ['cardno', 'cst_id', 'bad_flag', 'data_type']

In [None]:
# Extract the per-account features from data_transa.

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def get_feature_from_transa_one_account(data_sample: DataFrame):
    """Aggregate all transaction rows of one account into a one-row feature frame.

    Args:
        data_sample: The rows of the transaction table belonging to a single
            account. It must contain at least one row and the columns read
            below ('cardno', 'cst_id', 'tx_dt', 'tx_amt_rmb', the indicator
            columns, ...).

    Returns:
        A single-row DataFrame whose columns are 'cardno', 'cst_id' and the
        computed features, in the order they are inserted below.

    Notes:
        * Indicator columns are compared with the strings '1' and '0'; any
          other value (including an integer 1 or the 0 used to fill missing
          values) matches neither side.
          NOTE(review): confirm the CSV columns are actually read as strings.
        * Ratios whose denominator is zero are reported as 0.0 instead of
          NaN/inf (or a ZeroDivisionError for the integer counts).
    """
    feature_dict = {
        'cardno': data_sample['cardno'].iloc[0],
        'cst_id': data_sample['cst_id'].iloc[0],
    }

    # Values reused by several features; computed once instead of per feature.
    n_rows = data_sample.shape[0]
    amounts = data_sample['tx_amt_rmb']
    total_amt = amounts.sum()
    counterparties = data_sample['cnter_cust_acct_num']

    is_in = data_sample['credit_debit_ind'] == '1'
    is_out = data_sample['credit_debit_ind'] == '0'
    in_amounts = amounts[is_in]
    out_amounts = amounts[is_out]
    in_cnt = int(is_in.sum())
    out_cnt = int(is_out.sum())

    # Number of distinct transaction days.
    feature_dict['jy_day_cnt'] = data_sample['tx_dt'].nunique()
    # Number of distinct business types.
    feature_dict['biz_type_cnt'] = data_sample['biz_type'].nunique()
    # Number of distinct business lines.
    feature_dict['biz_line_cnt'] = data_sample['biz_line'].nunique()
    # Age: the maximum 'cust_age' seen for the account.
    feature_dict['nl'] = data_sample['cust_age'].max()

    # Total incoming / outgoing amount and the incoming share of the total amount.
    in_amt = in_amounts.sum()
    feature_dict['in_amt'] = in_amt
    feature_dict['out_amt'] = out_amounts.sum()
    feature_dict['in_amt_ratio'] = _safe_ratio(in_amt, total_amt)

    # Number of incoming / outgoing transactions and the incoming share of all rows.
    feature_dict['in_opt_cnt'] = in_cnt
    feature_dict['out_opt_cnt'] = out_cnt
    feature_dict['in_opt_ratio'] = _safe_ratio(in_cnt, n_rows)

    opt_names = ['max', 'min', 'mean']

    # Max / min / mean of the incoming amount, outgoing amount and balance.
    for op in opt_names:
        suffix = op.capitalize()
        feature_dict['in_je_{}'.format(suffix)] = in_amounts.agg(op)
        feature_dict['out_je_{}'.format(suffix)] = out_amounts.agg(op)
        feature_dict['ye_{}'.format(suffix)] = data_sample['account_bal'].agg(op)

    # Term ('1') vs. current ('0') deposit transaction counts and the term share.
    term_cnt = int((data_sample['term_curr_ind'] == '1').sum())
    feature_dict['term_opt_cnt'] = term_cnt
    feature_dict['curr_opt_cnt'] = int((data_sample['term_curr_ind'] == '0').sum())
    feature_dict['term_opt_ratio'] = _safe_ratio(term_cnt, n_rows)

    # Number of distinct transaction methods, channels, areas and nations.
    feature_dict['jy_method_cnt'] = data_sample['tx_method'].nunique()
    feature_dict['jy_qd_cnt'] = data_sample['tx_channel_type'].nunique()
    feature_dict['jy_area_cnt'] = data_sample['tx_area'].nunique()
    feature_dict['jy_nation_cnt'] = data_sample['tx_nation_cd'].nunique()

    # Distinct counterparty accounts on the incoming / outgoing side, and the
    # incoming share of all distinct counterparties.
    in_counterparty_cnt = counterparties[is_in].nunique()
    feature_dict['in_dfzh_tx_cnt'] = in_counterparty_cnt
    feature_dict['out_dfzh_tx_cnt'] = counterparties[is_out].nunique()
    feature_dict['in_dfzh_ration'] = _safe_ratio(in_counterparty_cnt, counterparties.nunique())

    # Transaction counts by counterparty indicator ('1' vs. '0') and the '1' share.
    cnter_cnt = int((data_sample['cnter_cust_ind'] == '1').sum())
    feature_dict['cnter_opt_cnt'] = cnter_cnt
    feature_dict['cust_opt_cnt'] = int((data_sample['cnter_cust_ind'] == '0').sum())
    feature_dict['cnter_opt_ratio'] = _safe_ratio(cnter_cnt, n_rows)

    # Cross-border, cross-bank and remote transactions: count, amount and shares.
    is_col = ['cross_tx', 'cross_bank', 'remote']

    for col in is_col:
        flagged = data_sample['is_{}_ind'.format(col)] == '1'
        flagged_cnt = int(flagged.sum())
        flagged_amt = amounts[flagged].sum()
        feature_dict['{}_opt_cnt'.format(col)] = flagged_cnt
        feature_dict['{}_opt_ratio'.format(col)] = _safe_ratio(flagged_cnt, n_rows)
        feature_dict['{}_amt'.format(col)] = flagged_amt
        feature_dict['{}_amt_ratio'.format(col)] = _safe_ratio(flagged_amt, total_amt)

    return pd.DataFrame([feature_dict])

In [None]:
# Extract the per-customer features from data_static.

def get_feature_from_static_one_account(data_sample: DataFrame):
    """Return the newest record of one customer as a one-row feature frame.

    The rows are ordered by 'data_dt' in descending order, the first row is
    kept, and its 'data_dt' entry is dropped; every other column becomes a
    feature. The input frame itself is not modified.
    """
    newest_first = data_sample.sort_values(by='data_dt', ascending=False)
    latest_record = newest_first.iloc[0].to_dict()

    # The date only selects the row; it is not used as a feature.
    del latest_record['data_dt']

    return pd.DataFrame([latest_record])

In [None]:
# Build one feature row per account number from the transaction table.
card_transa_features = (
    data_transa
    .groupby('cardno')
    .apply(get_feature_from_transa_one_account)
    .reset_index(drop=True)
)

In [None]:
# Build one feature row per customer id from the static table.
cst_static_features = (
    data_static
    .groupby('cst_id')
    .apply(get_feature_from_static_one_account)
    .reset_index(drop=True)
)

In [None]:
# Combine both feature sets: every account row receives the static features of
# its customer id; accounts without a static record get 0 in those columns.
card_features = card_transa_features.merge(
    cst_static_features,
    how='left',
    on='cst_id',
)
card_features = card_features.fillna(0)

In [None]:
# Attach the fraud flag; accounts that have no label keep a missing 'bad_flag'.
card_features = card_features.merge(
    data_label[['cardno', 'bad_flag']],
    how='left',
    on='cardno',
)

In [None]:
# Sanity check: overall shape, then the number of rows that received a label.
print(card_features.shape)
print(card_features['bad_flag'].notna().sum())

## 搭建模型

In [None]:
def train_lgb(X_train, y_train, X_val, y_val, X_test, cate_feats=None):
    """Train a LightGBM binary classifier with early stopping and score X_test.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels; used for early stopping
            and for the AUC that is printed.
        X_test: Features to predict with the trained model.
        cate_feats: Categorical feature specification forwarded to both
            LightGBM datasets. ``None`` (the default) means that no feature is
            declared categorical.

    Returns:
        A ``(gbm, test_pred)`` tuple: the trained booster and its predictions
        for ``X_test`` at the best iteration.
    """
    logging.info('#########training lgb....############')

    # Resolve the categorical features once so both datasets agree. They are
    # declared on the Dataset objects only: passing `categorical_feature=[]` to
    # lgb.train() as well would override this setting and silently ignore
    # `cate_feats`.
    categorical = [] if cate_feats is None else cate_feats

    dtrain = lgb.Dataset(X_train, y_train, categorical_feature=categorical)
    dvalid = lgb.Dataset(X_val, y_val, categorical_feature=categorical, reference=dtrain)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.01,
        'seed': 2020,
        'n_jobs': 8,
    }
    # Log the metrics every 200 iterations; stop when the validation metric has
    # not improved for 200 consecutive iterations.
    callbacks = [log_evaluation(period=200), early_stopping(stopping_rounds=200)]
    gbm = lgb.train(params, dtrain, 10000, valid_sets=[dtrain, dvalid],
                    callbacks=callbacks)

    val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    roc_auc_scores = roc_auc_score(y_val, val_pred)
    print('auc_score:', roc_auc_scores)

    test_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    return gbm, test_pred

In [None]:
# Account numbers of the test set, in their original order.
test_card_names = test_data['cardno'].tolist()

In [None]:
# Feature rows of the test accounts, re-ordered to follow test_card_names.
# Accounts without a feature row become all-missing rows after the reindex.
df_test = (
    card_features[card_features['cardno'].isin(test_card_names)]
    .reset_index(drop=True)
    .set_index('cardno')
    .reindex(test_card_names)
    .reset_index()
)

In [None]:
# Labelled rows only. The positional slice drops the first two columns
# (cardno and cst_id) and keeps 'bad_flag', which is expected to be the last one.
df_train = (
    card_features[card_features['bad_flag'].notnull()]
    .reset_index(drop=True)
    .iloc[:, 2:]
)

# Stratified 80/20 split on the label.
X_train_set, X_val_set = train_test_split(
    df_train,
    test_size=0.2,
    random_state=1024,
    stratify=df_train['bad_flag'],
)
X_train_set = X_train_set.reset_index(drop=True)
X_val_set = X_val_set.reset_index(drop=True)

# The last column is the label, everything before it is a feature.
X_train = X_train_set.iloc[:, :-1]
y_train = X_train_set.iloc[:, -1]
X_val = X_val_set.iloc[:, :-1]
y_val = X_val_set.iloc[:, -1]

# Same feature columns for the test rows: skip the two id columns and the label.
X_test = df_test.iloc[:, 2:-1]

g, test_pred = train_lgb(X_train, y_train, X_val, y_val, X_test)

test_result = pd.DataFrame({
    'cardno': test_data['cardno'].values,
    'pred': test_pred,
})

In [None]:
# Write the predictions; replace the empty path with the desired output location.
test_result.to_csv(
    '',
    index=False,
    encoding='utf_8_sig',
)

In [None]:
# Save the extracted features; replace the empty path with the desired output location.
card_features.to_csv(
    '',
    index=False,
    encoding='utf_8_sig',
)