# In [None]:
# %%-------------------------------
import warnings
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from datetime import timedelta, datetime
from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')
# %%-------------------------------
print('read data')
# Train and test rows are stacked into one frame so that the count features
# and category codes computed below are derived from both files together.
df_test = pd.read_csv('data/round1_iflyad_anticheat_testdata_feature.txt', sep='\t')
df_train = pd.read_csv('data/round1_iflyad_anticheat_traindata.txt', sep='\t')
df_uni = pd.concat([df_train, df_test], ignore_index=True)
# Rows without a label (presumably the test rows) are marked with -1.
df_uni['label'] = df_uni['label'].fillna(-1).astype(int)

# %%-------------------------------

# Columns treated as categorical features.
cat_cols = ['pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype', 'ip',
            'reqrealip', 'city', 'province', 'adidmd5', 'imeimd5', 'idfamd5',
            'openudidmd5', 'macmd5', 'dvctype', 'model', 'make', 'ntt',
            'carrier', 'os', 'osv', 'orientation', 'lan', 'h', 'w', 'ppi']
# Columns removed from the frame before the feature matrix is built.
drop_cols = ['sid', 'label', 'nginxtime']

# %%-------------------------------
print('fill null')
for cat_col in cat_cols:
    if df_uni[cat_col].isnull().any():
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: the chained in-place form is deprecated and
        # does not modify df_uni when pandas Copy-on-Write is active.
        # The object cast makes the mixed number/string result explicit.
        df_uni[cat_col] = df_uni[cat_col].astype(object).fillna('null_value')


# %%-------------------------------
def gen_value_counts(data, col):
    """Return, for every row of ``data``, how often its ``col`` value occurs.

    The frequencies are joined back onto ``data`` with a left merge, so the
    returned Series is named ``'tmp'`` and carries the merge result's index.
    Rows that find no match in the frequency table get 0.
    """
    print('value counts', col)
    frequencies = data[col].value_counts()
    lookup = frequencies.rename_axis(col).reset_index(name='tmp')
    joined = data.merge(lookup, on=col, how='left')
    return joined['tmp'].fillna(0)

# Columns for which a frequency feature named 'vc_<column>' is added.
value_counts_col = [
    'pkgname', 'adunitshowid', 'ip', 'reqrealip',
    'adidmd5', 'imeimd5', 'idfamd5', 'macmd5',
]

for col in value_counts_col:
    feature_name = 'vc_' + col
    df_uni[feature_name] = gen_value_counts(df_uni, col)

# %%-------------------------------
print('cut')
def cut_col(data, col_name, cut_list):
    """Collapse infrequent values of ``data[col_name]`` into count buckets.

    Each value is looked up by how many rows share it. Walking ``cut_list``
    in order, the first threshold ``box`` with ``count <= box`` replaces the
    value by the string ``'count_<box>'``. Values whose count exceeds every
    threshold, and missing values, are kept unchanged.

    Parameters:
        data: DataFrame containing ``col_name``.
        col_name: name of the column to bucket.
        cut_list: iterable of count thresholds, tested in the given order.

    Returns:
        A new Series with a fresh 0..n-1 index, one entry per row of
        ``data`` in row order. ``data`` itself is not modified.
    """
    print('cutting', col_name)

    values = data[col_name].reset_index(drop=True)
    # Map each value to its frequency directly. This avoids merging against a
    # frame in which col_name is both the index name and a column, which is
    # the shape value_counts() produces on pandas >= 2.0.
    counts = values.map(values.value_counts())

    result = values
    unassigned = np.ones(len(values), dtype=bool)
    for box in cut_list:
        # NaN counts compare False, so missing values are never bucketed.
        hit = unassigned & (counts <= box).to_numpy(dtype=bool)
        if hit.any():
            if result.dtype != object:
                # Bucket labels are strings; widen before writing them.
                result = result.astype(object)
            result[hit] = 'count_' + str(box)
            unassigned &= ~hit
    return result


# Each key is a group of columns; its value is the list of count thresholds
# passed to cut_col for every column in that group.
cut_col_dict = {
    ('pkgname', 'ver', 'reqrealip', 'adidmd5',
     'imeimd5', 'openudidmd5', 'macmd5', 'model', 'make'): [3],
    ('ip',): [3, 5, 10],
}

# Flatten the grouped configuration into (column, thresholds) pairs, keeping
# the dictionary's order, then bucket each column in place.
cut_plan = [
    (name, boxes)
    for names, boxes in cut_col_dict.items()
    for name in names
]
for col, cut_list in cut_plan:
    df_uni[col] = cut_col(df_uni, col, cut_list)

# %%-------------------------------
print('feature time')
# nginxtime is divided by 1000 and read as seconds, then shifted by 8 hours.
df_uni['datetime'] = pd.to_datetime(df_uni['nginxtime'] / 1000, unit='s') + timedelta(hours=8)
df_uni['hour'] = df_uni['datetime'].dt.hour
# Number of whole calendar days since the earliest date in the data.
# Subtracting dates rather than day-of-month values keeps the offset correct
# when the data crosses a month boundary.
first_date = df_uni['datetime'].dt.normalize().min()
df_uni['day'] = (df_uni['datetime'].dt.normalize() - first_date).dt.days

# 'hour' is used as a categorical feature; 'datetime' and 'day' are helper
# columns that are dropped before the feature matrix is built.
cat_cols += ['hour']
drop_cols += ['datetime', 'day']

# %%-------------------------------
print('post process')
# Replace every categorical value by an integer code in order of first
# appearance. factorize is used instead of zipping unique() with
# range(nunique()): unique() includes NaN while nunique() does not, so with a
# missing value present the zipped map could leave a real category uncoded.
for col in cat_cols:
    df_uni[col] = pd.factorize(df_uni[col])[0]

# Row masks over df_uni, taken from the 'day' column before it is dropped.
all_train_index = (df_uni['day'] <= 6).values
train_index     = (df_uni['day'] <= 5).values
valid_index     = (df_uni['day'] == 6).values
test_index      = (df_uni['day'] == 7).values
train_label     = (df_uni['label']).values

present_drop_cols = [col for col in drop_cols if col in df_uni.columns]
df_uni.drop(columns=present_drop_cols, inplace=True)

ohe = OneHotEncoder()
mtx_cat = ohe.fit_transform(df_uni[cat_cols])
# Keep the frame's own column order. Building this list from a set made the
# numeric feature order depend on string hashing, so it could vary by run.
num_cols = [col for col in df_uni.columns if col not in cat_cols]
mtx_num = sparse.csr_matrix(df_uni[num_cols].astype(float).values)
# Final feature matrix: numeric columns first, then the one-hot block.
mtx_uni = sparse.hstack([mtx_num, mtx_cat])
mtx_uni = mtx_uni.tocsr()

def col_filter(mtx_train, y_train, mtx_test, func=chi2, percentile=90):
    """Select features with ``SelectPercentile`` and apply it to both matrices.

    The selector is fitted on ``mtx_train`` / ``y_train`` only, using score
    function ``func`` and the given ``percentile``; the same fitted selector
    then transforms the train and test matrices.

    Returns:
        Tuple ``(reduced_train, reduced_test)``.
    """
    selector = SelectPercentile(func, percentile=percentile)
    selector.fit(mtx_train, y_train)
    reduced_train = selector.transform(mtx_train)
    reduced_test = selector.transform(mtx_test)
    return reduced_train, reduced_test

# Feature selection is fitted on all labelled-period rows and applied to the
# test rows as well.
all_train_x, test_x = col_filter(
    mtx_uni[all_train_index, :],
    train_label[all_train_index],
    mtx_uni[test_index, :]
)

# The rows of all_train_x are the rows of mtx_uni where all_train_index is
# True. Restrict the full-length masks to exactly those positions; slicing
# their first N entries is only right if every selected row comes first.
train_in_all = train_index[all_train_index]
valid_in_all = valid_index[all_train_index]

train_x = all_train_x[train_in_all, :]
train_y = train_label[train_index]

val_x = all_train_x[valid_in_all, :]
val_y = train_label[valid_index]

# %%-------------------------------
print('train')
def lgb_f1(labels, preds):
    """Custom evaluation metric: F1 of the predictions rounded to 0/1.

    Returns the tuple ``('f1', score, True)``; the final ``True`` marks the
    metric as higher-is-better.
    """
    hard_preds = np.round(preds)
    return 'f1', f1_score(labels, hard_preds), True

# Hyper-parameters of the gradient-boosting classifier.
lgb_params = dict(
    random_seed=2019,
    n_jobs=-1,
    objective='binary',
    learning_rate=0.1,
    n_estimators=4000,
    num_leaves=64,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=9,
    subsample_freq=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=5,
)
lgb = LGBMClassifier(**lgb_params)

# First fit: train on the training split and monitor both splits with the
# custom F1 metric, using early stopping.
lgb.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    eval_names=['train', 'val'],
    eval_metric=lgb_f1,
    early_stopping_rounds=100,
    verbose=10,
)
print('best score', lgb.best_score_)

# %%-------------------------------
print('predict')
all_train_y = train_label[all_train_index]
# Second fit: reuse the best iteration count from the first fit as the number
# of trees and train on all training rows.
lgb.n_estimators = lgb.best_iteration_
lgb.fit(all_train_x, all_train_y)
test_y = lgb.predict(test_x)

# Submission file: one row per test sid with its predicted label.
df_sub = pd.concat([df_test['sid'], pd.Series(test_y, name='label')], axis=1)
submit_stamp = datetime.now().strftime('%m%d_%H%M%S')
df_sub.to_csv('submit-{}.csv'.format(submit_stamp), sep=',', index=False)

# In [None]:
# -*- coding: utf-8 -*-
"""
@author: shaowu
注：此次会详细注释代码，往后都省略。
"""
import pandas as pd
import numpy as np
import time
import tqdm
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from sklearn import preprocessing
from collections import Counter
def one_hot_col(col):
    '''Label encoding: fit a LabelEncoder on ``col`` and return the fitted encoder.

    Despite the function name, no one-hot expansion is performed here.
    '''
    encoder = preprocessing.LabelEncoder()
    return encoder.fit(col)
def calculate_null(data,key,col):
    '''
    Count the missing values of ``col`` within each group of ``key``.

    params:
    data -- input DataFrame (not modified)
    key -- the column name (or list of names) used for grouping
    col -- the column whose missing values are counted
    return -- a DataFrame holding the key column(s) followed by a column
              named ``col + '_is_null'`` with the number of nulls per group
    '''
    out_col = col + '_is_null'
    # Flag nulls explicitly and sum the flags. The earlier form used 'count',
    # which counts NON-null values, together with a dict renamer in agg that
    # pandas >= 1.0 rejects.
    flagged = data.assign(**{out_col: data[col].isnull().astype(int)})
    return flagged.groupby(key,as_index=False)[out_col].sum()
def xgb_model(new_train,y,new_test,lr,n_splits=5):
    '''Train XGBoost with stratified K-fold CV and return OOF/test predictions.

    params:
    new_train -- training feature matrix; rows are selected with integer
                 index arrays (assumes a NumPy-style array -- TODO confirm)
    y -- training labels, indexable the same way
    new_test -- test feature matrix
    lr -- learning rate, passed to XGBoost as ``eta``
    n_splits -- number of folds (default 5)
    return -- (oof_xgb, prediction_xgb): the out-of-fold predictions for the
              training rows and the test predictions averaged over the folds
    '''
    xgb_params = {'booster': 'gbtree',
          'eta':lr, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8,
          'objective':'binary:logistic',
          'eval_metric': 'auc',
          'silent': True,
          }
    skf=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=42)
    oof_xgb=np.zeros(new_train.shape[0])
    prediction_xgb=np.zeros(new_test.shape[0])
    # The test matrix is the same for every fold, so build it once.
    dtest = xgb.DMatrix(new_test)
    for i,(tr,va) in enumerate(skf.split(new_train,y)):
        print('fold:',i+1,'training')
        dtrain = xgb.DMatrix(new_train[tr],y[tr])
        dvalid = xgb.DMatrix(new_train[va],y[va])
        watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]
        bst = xgb.train(dtrain=dtrain, num_boost_round=30000, evals=watchlist, early_stopping_rounds=200, \
        verbose_eval=50, params=xgb_params)
        # Each training row is in exactly one validation fold.
        oof_xgb[va] += bst.predict(xgb.DMatrix(new_train[va]), ntree_limit=bst.best_ntree_limit)
        prediction_xgb += bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
    print('the roc_auc_score for train:',roc_auc_score(y,oof_xgb))
    # Average by the actual fold count instead of a hard-coded 5.
    prediction_xgb/=n_splits
    return oof_xgb,prediction_xgb
def lgb_model(new_train,y,new_test,n_splits=5):
    '''Train LightGBM with stratified K-fold CV and return OOF/test predictions.

    params:
    new_train -- training feature matrix; rows are selected with integer
                 index arrays (assumes a NumPy-style array -- TODO confirm)
    y -- training labels, indexable the same way
    new_test -- test feature matrix
    n_splits -- number of folds (default 5)
    return -- (oof_lgb, prediction_lgb, feature_importance_df): out-of-fold
              predictions for the training rows, test predictions averaged
              over the folds, and an empty DataFrame (feature importances
              are not collected here)
    '''
    params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'num_leaves': 1000,
    'verbose': -1,
    'max_depth': -1,
  #  'reg_alpha':2.2,
  #  'reg_lambda':1.4,
    'seed':42,
    }
    skf=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=42)
    oof_lgb=np.zeros(new_train.shape[0]) ## training-set predictions, filled from each fold's validation part
    prediction_lgb=np.zeros(new_test.shape[0])  ## test-set predictions, summed over folds and averaged at the end
    feature_importance_df = pd.DataFrame() ## placeholder for feature importances; left empty
    for i,(tr,va) in enumerate(skf.split(new_train,y)):
        print('fold:',i+1,'training')
        dtrain = lgb.Dataset(new_train[tr],y[tr])
        dvalid = lgb.Dataset(new_train[va],y[va],reference=dtrain)
        ## train:
        bst = lgb.train(params, dtrain, num_boost_round=30000, valid_sets=dvalid, verbose_eval=400,early_stopping_rounds=200)
        ## predict this fold's validation rows:
        oof_lgb[va] += bst.predict(new_train[va], num_iteration=bst.best_iteration)
        ## predict the test set:
        prediction_lgb += bst.predict(new_test, num_iteration=bst.best_iteration)

    print('the roc_auc_score for train:',roc_auc_score(y,oof_lgb)) ## offline AUC on the out-of-fold predictions
    ## average by the actual fold count instead of a hard-coded 5
    prediction_lgb/=n_splits
    return oof_lgb,prediction_lgb,feature_importance_df

def _sid_begin_time(sid):
    """Return the integer that follows the last '-' in a session id."""
    return int(sid.rsplit('-', 1)[-1])

## Load the test data:
testdata = pd.read_csv("round1_iflyad_anticheat_testdata_feature.txt", sep='\t')
testdata['label'] = -1  ## the test set has no labels, so mark them as -1

## Load the training data:
traindata = pd.read_csv("round1_iflyad_anticheat_traindata.txt", sep='\t')

## Add the same two time features to both frames:
##   begin_time           -- request session time, parsed from sid
##   nginxtime-begin_time -- gap between the session time and the time the
##                           request reached the server
for frame in (testdata, traindata):
    frame['begin_time'] = frame['sid'].map(_sid_begin_time)
    frame['nginxtime-begin_time'] = frame['nginxtime'] - frame['begin_time']

## Stack train on top of test (axis=0 concatenates rows) so that features are
## extracted over both at once.
data = pd.concat([traindata, testdata], axis=0).reset_index(drop=True)

print('the shape of data:',data.shape)  ## fixed typo: the attribute is `shape`

print(data.nunique()) ## number of distinct values in each column
print(data[:5]) ## first 5 rows
z=calculate_null(testdata,'sid','ver') ## per-sid statistic of 'ver'; not used further in this section

print('label distribution:\n',traindata['label'].value_counts()) ## label distribution of the training set

## Columns with object dtype. The builtin `object` is used because the
## `np.object` alias was removed from NumPy.
object_cols=list(data.dtypes[data.dtypes==object].index)
print(data.dtypes[data.dtypes==object].index) ## show the object-dtype columns

## Timestamps are in milliseconds, so divide by 1000 to get seconds before
## formatting the first row's time as a date string.
print(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(data['nginxtime'][0]/1000)))

## Label-encode every object-dtype column except the session id:
for col in object_cols:
    if col == 'sid':
        continue
    as_text = data[col].astype(str)
    data[col] = one_hot_col(as_text).transform(as_text)

## Split back into train and test: the first len(traindata) rows are train.
n_train = traindata.shape[0]
train = data.iloc[:n_train]
label = train['label'].values
test = data.iloc[n_train:].reset_index(drop=True)

## Train the model and predict:
## columns that are not fed to the model
unused_cols=['sid','label','nginxtime','ip','reqrealip','begin_time']
oof_lgb,prediction_lgb,feature_importance_df=\
      lgb_model(np.array(train.drop(unused_cols,axis=1)),\
                label,\
                np.array(test.drop(unused_cols,axis=1)))

## Save the result:
## take an explicit copy so that adding a column does not write into a slice of `test`
sub=test[['sid']].copy()
sub['label']=(prediction_lgb>0.5).astype(int) ## probability above 0.5 becomes 1, otherwise 0
print('test pre_label distribution:\n',sub['label'].value_counts()) ## predicted label distribution on the test set
sub.to_csv('submit0704.csv',index=False) ## written to submit0704.csv without the index