In [2]:
# 第二版：通过catboost做个简单的二分类来判断是否作弊，提交后得分为：94.33074

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

class Processing():
    """Data-preparation helpers: blacklist extraction and feature engineering."""

    # Blacklist extraction
    def _get_blacklist(self,train):
        """Collect the device identifiers seen on rows labelled as cheating.

        Parameters
        ----------
        train : pandas.DataFrame
            Must contain a ``label`` column and the identifier columns
            ``adidmd5`` and ``imeimd5``.

        Returns
        -------
        dict
            Maps each identifier column name to the set of non-missing values
            that occur on rows with ``label == 1``.
        """
        cheat = train[train['label']==1]
        blacklist_dic = {}
        for f in ['adidmd5','imeimd5']:
            # Missing identifiers are dropped: a None/NaN entry in the
            # blacklist could flag every row whose identifier is missing.
            blacklist_dic[f] = set(cheat[f].dropna())
        return blacklist_dic

    # Feature engineering
    def _feature_eng(self,train,test):
        """Build the joint train/test feature table.

        Train and test are concatenated, unlabeled rows get ``label == -1``,
        an ``hour`` string column is derived from ``nginxtime`` (parsed as
        milliseconds), and every remaining missing value is replaced by the
        string ``'null_value'``.

        Parameters
        ----------
        train, test : pandas.DataFrame
            Raw tables; ``train`` carries the ``label`` column.

        Returns
        -------
        tuple
            ``(new_train, new_test, features)`` where ``features`` is the list
            of column names to feed to the model (including ``'hour'``).
        """
        features = ['pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype', 'adidmd5', 'imeimd5', 'ip','macmd5', 'openudidmd5',
            'reqrealip', 'city', 'province', 'idfamd5', 'dvctype', 'model', 'make', 'ntt',
            'carrier', 'os', 'osv', 'orientation', 'lan', 'h', 'w', 'ppi']
        train_test = pd.concat([train, test], ignore_index=True,sort=True)
        # Rows coming from the test table have no label; mark them with -1.
        train_test['label'] = train_test['label'].fillna(-1).astype(int)
        train_test['time'] = pd.to_datetime(train_test['nginxtime'] , unit='ms')
        train_test['hour'] = train_test['time'].dt.hour.astype('str')
        train_test.fillna('null_value',inplace = True)
        features.append('hour')
        new_test = train_test[train_test['label'] == -1]
        new_train = train_test[train_test['label'] != -1]
        return new_train,new_test,features
    

    
    

class TrainModels():
    """Prediction strategies: blacklist lookup and a CatBoost binary classifier."""

    # Blacklist-based cheat detection
    def _judge_black(self,blacklist_dic,test):
        """Flag test rows whose identifier appears in the blacklist.

        Parameters
        ----------
        blacklist_dic : dict
            Maps a column name of ``test`` to a set of blacklisted values.
        test : pandas.DataFrame
            Must contain ``sid`` and every key of ``blacklist_dic`` as columns.

        Returns
        -------
        pandas.DataFrame
            Columns ``sid`` and ``label``; ``label`` is 1 when at least one of
            the blacklist columns holds a blacklisted value, otherwise 0.
        """
        judge_features = list(blacklist_dic.keys())
        # Work on a copy so the new column is never written into a slice of `test`.
        judge_df = test[judge_features+['sid']].copy()
        hit = pd.Series(False, index=judge_df.index)
        for f in judge_features:
            # Vectorised membership test instead of a row-wise apply.
            hit = hit | judge_df[f].isin(blacklist_dic[f])
        judge_df['label'] = hit.astype(int)
        return judge_df[['sid','label']]

    # Binary classification with CatBoost
    def _judge_catboost(self,train,test,features):
        """Fit a CatBoost classifier on ``train`` and predict labels for ``test``.

        All columns in ``features`` are passed to CatBoost as categorical.

        Returns
        -------
        pandas.DataFrame
            Columns ``sid`` and ``label`` (0/1) for the rows of ``test``.
        """
        model = CatBoostClassifier(iterations=946, depth=8,cat_features=features,learning_rate=0.05, custom_metric='F1',eval_metric='F1',random_seed=2019,
                            l2_leaf_reg=5.0,logging_level='Silent')
        model.fit(train[features],train['label'])
        y_pred = model.predict(test[features]).tolist()

        judge_df = pd.DataFrame()
        judge_df['sid'] = test['sid'].tolist()
        judge_df['label'] = y_pred
        # Map the prediction onto 0/1 with a 0.5 cut-off.
        judge_df['label'] = judge_df['label'].apply(lambda x: 1 if x>=0.5 else 0)
        return judge_df[['sid','label']]
    
    
    
if __name__ == "__main__":
    # Load the labelled training table and the unlabeled test feature table.
    train = pd.read_table('round1_iflyad_anticheat_traindata.txt')
    test = pd.read_table('round1_iflyad_anticheat_testdata_feature.txt')    
    proce_module = Processing()
    model_module = TrainModels()
    # Blacklist approach: build the blacklist from train, apply it to test, save the result.
    blacklist_dic = proce_module._get_blacklist(train)
    judge_by_blackList = model_module._judge_black(blacklist_dic,test)
    judge_by_blackList.to_csv('judge_by_blackList.csv',index=False,encoding='utf-8')
    # Binary classification -- using catboost
    new_train,new_test,features= proce_module._feature_eng(train,test)
    print('ok')
    judge_by_catboost = model_module._judge_catboost(new_train,new_test,features)   
    judge_by_catboost.to_csv('judge_by_catboost.csv',index=False,encoding='utf-8')

In [None]:
对大量类别特征进行了Count/Rank编码处理
对长尾分布的特征有较好的表达
清洗了Model及Make,处理了长尾分布
全部大写
替换了url标识符
使用了Catboost建模
适用于Category类型较多的场景

In [None]:
# MindRank.ai

import numpy as np
import pandas as pd
from pandas import DataFrame as DF
import scipy.spatial.distance as dist
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt
import json
from sklearn.metrics import f1_score
import time
import gc
import math
from tqdm import tqdm
from scipy import stats
from sklearn.cluster import KMeans
from six.moves import reduce
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime,timedelta

import warnings
warnings.filterwarnings('ignore')

get_ipython().run_line_magic('matplotlib', 'inline')

# Load the raw train/test tables and stack them so features are built jointly.
train = pd.read_table("../input/round1_iflyad_anticheat_traindata.txt")
test = pd.read_table("../input/round1_iflyad_anticheat_testdata_feature.txt")
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
all_data = pd.concat([train, test]).reset_index(drop=True)

# Time handling
# nginxtime is scaled by 1e6 before parsing (presumably milliseconds ->
# nanoseconds; TODO confirm) and then shifted by +8 hours.
all_data['time'] = pd.to_datetime(all_data['nginxtime']*1e+6) + timedelta(hours=8)
all_data['day'] = all_data['time'].dt.dayofyear
all_data['hour'] = all_data['time'].dt.hour

# Data Clean
# Map OPPO internal model codes to readable model names.
# The result is assigned back to the column: calling
# Series.replace(..., inplace=True) on a column selection does not update the
# parent frame when pandas Copy-on-Write is active.
oppo_model_names = {
    'PACM00': "OPPO R15",
    'PBAM00': "OPPO A5",
    'PBEM00': "OPPO R17",
    'PADM00': "OPPO A3",
    'PBBM00': "OPPO A7",
    'PAAM00': "OPPO R15_1",
    'PACT00': "OPPO R15_2",
    'PABT00': "OPPO A5_1",
    'PBCM10': "OPPO R15x",
}
all_data['model'] = all_data['model'].replace(oppo_model_names)

from urllib.parse import unquote


def url_clean(x):
    """Normalise one text value and prefix a brand name when a pattern matches.

    The value is URL-decoded, a fixed sequence of substring replacements is
    applied (percent-escapes and the separators ``. _ + - ,`` become spaces
    or their plain character), and a brand prefix is added based on the
    first/last character, the length, or the substring ``AL00``.

    Parameters
    ----------
    x : str
        Raw (already upper-cased) text value.

    Returns
    -------
    str
        The cleaned value; an empty result is returned unchanged.
    """
    x = (unquote(x,'utf-8').replace('%2B',' ').replace('%20',' ').replace('%2F','/').replace('%3F','?').replace('%25','%').replace('%23','#').replace(".",' ').replace('??',' ')
                           .replace('%26',' ').replace("%3D",'=').replace('%22','').replace('_',' ').replace('+',' ').replace('-',' ').replace('__',' ').replace('  ',' ').replace(',',' '))

    # Guard: indexing x[0] / x[-1] below would raise IndexError on an empty string.
    if not x:
        return x

    if x[0]=='V' and x[-1]=='A':
        return "VIVO {}".format(x)
    elif x[0]=='P' and x[-1]=='0':
        return "OPPO {}".format(x)
    elif len(x)==5 and x[0]=='O':
        return "Smartisan {}".format(x)
    elif 'AL00' in x:
        return "HW {}".format(x)
    else:
        return x


# Upper-case the text columns so that e.g. "oppo" and "OPPO" become one
# category, then clean each value. The helper is defined once, outside the loop.
for fea in ['model','make','lan']:
    all_data[fea] = all_data[fea].astype('str')
    all_data[fea] = all_data[fea].map(lambda x:x.upper())
    all_data[fea] = all_data[fea].map(url_clean)

# First whitespace-separated token of the model, and whether it equals `make`.
all_data['big_model'] = all_data['model'].map(lambda x:x.split(' ')[0])
all_data['model_equal_make'] = (all_data['big_model']==all_data['make']).astype(int)

# H,W,PPI
# Derived screen-geometry features built from the h, w and ppi columns.

# Diagonal of the (h, w) rectangle, divided by 2.54 and then by 1000
# (presumably a unit conversion -- TODO confirm the intended units).
all_data['size'] = (np.sqrt(all_data['h']**2 + all_data['w'] ** 2) / 2.54) / 1000
# Aspect ratio. NOTE(review): rows with w == 0 would give inf/NaN here.
all_data['ratio'] = all_data['h'] / all_data['w']
all_data['px'] = all_data['ppi'] * all_data['size']
# Product h * w ("mj" presumably abbreviates the pinyin for "area" -- confirm).
all_data['mj'] = all_data['h'] * all_data['w']

# Numeric geometry columns.
num_col = ['h','w','size','mj','ratio','px']
# Every object-dtype column except the id and the target is treated as categorical.
cat_col = [i for i in all_data.select_dtypes(object).columns if i not in ['sid','label']]
# Collects the names of the generated *_count / *_rank columns.
both_col = []

# Count/rank encoding: for each categorical column add the frequency of the
# row's value and the rank of that frequency, then label-encode the column itself.
for i in tqdm(cat_col):
    lbl = LabelEncoder()
    all_data[i+"_count"] = all_data.groupby([i])[i].transform('count')
    all_data[i+"_rank"] = all_data[i+"_count"].rank(method='min')
    all_data[i] = lbl.fit_transform(all_data[i].astype(str))
    both_col.extend([i+"_count",i+"_rank"])

# Same count/rank encoding for selected numeric columns; the group size is
# taken as the number of non-null 'sid' values per group.
for i in tqdm(['h','w','ppi','ratio']):
    all_data['{}_count'.format(i)] = all_data.groupby(['{}'.format(i)])['sid'].transform('count')
    all_data['{}_rank'.format(i)] = all_data['{}_count'.format(i)].rank(method='min')

# Model inputs: every column except the id, the target and the parsed timestamp.
feature_name = [i for i in all_data.columns if i not in ['sid','label','time']]
# Original train columns except id, target and raw timestamp; this list is
# later passed to CatBoost as `cat_features`.
cat_list = [i for i in train.columns if i not in ['sid','label','nginxtime']]

from sklearn.metrics import roc_auc_score

# Rows with a label are training rows; rows without one come from the test table.
tr_index = ~all_data['label'].isnull()
# NOTE(review): list(set(...)) gives an arbitrary column order; later code
# re-selects columns with `feature_name`, which restores a fixed order.
X_train = all_data[tr_index][list(set(feature_name))].reset_index(drop=True)
y = all_data[tr_index]['label'].reset_index(drop=True).astype(int)
X_test = all_data[~tr_index][list(set(feature_name))].reset_index(drop=True)
print(X_train.shape,X_test.shape)
# 5-fold stratified cross-validation: one CatBoost model is trained per fold.
random_seed = 2019
final_pred = []
cv_score = []  # validation F1 of each fold
cv_model = []  # fitted model of each fold
skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    train_x, test_x, train_y, test_y = X_train[feature_name].iloc[train_index], X_train[feature_name].iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    # NOTE(review): task_type='GPU' is hard-coded -- confirm a GPU-enabled CatBoost is available.
    cbt_model = cbt.CatBoostClassifier(iterations=3000,learning_rate=0.05,max_depth=11,l2_leaf_reg=1,verbose=10,early_stopping_rounds=400,task_type='GPU',eval_metric='F1',cat_features=cat_list)
    # The held-out fold serves as the eval set for early stopping.
    cbt_model.fit(train_x[feature_name], train_y,eval_set=(test_x[feature_name],test_y))
    cv_model.append(cbt_model)
    # NOTE(review): y_test is not read anywhere in the visible code.
    y_test = cbt_model.predict(X_test[feature_name])
    y_val = cbt_model.predict_proba(test_x[feature_name])
    print(Counter(np.argmax(y_val,axis=1)))
    # Fold score: F1 on the rounded positive-class probability.
    cv_score.append(f1_score(test_y,np.round(y_val[:,1])))

# CatBoost is well suited to data with many categorical features
    
# GPU results, 5-fold
# Fold 1
# bestTest = 0.94051
# bestIteration = 1512

# Collect the feature importances of every fold model, one frame per model.
fi = []
for i in cv_model:
    tmp = {
        'name' : feature_name,
        'score' : i.feature_importances_
    }
    fi.append(pd.DataFrame(tmp))
    
# Average the importance per feature across folds and plot the 40 largest.
fi = pd.concat(fi)
fig = plt.figure(figsize=(8,8))
fi.groupby(['name'])['score'].agg('mean').sort_values(ascending=False).head(40).plot.barh()

# Out-of-fold probabilities for the training rows and fold-averaged
# probabilities for the test rows. The same seeded splitter is iterated again
# so that fold `index` lines up with cv_model[index].
cv_pred = np.zeros((X_train.shape[0],))
test_pred = np.zeros((X_test.shape[0],))
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    train_x, test_x, train_y, test_y = X_train.iloc[train_index], X_train.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    y_val = cv_model[index].predict_proba(test_x[feature_name])[:,1]
    print(y_val.shape)
    cv_pred[test_index] = y_val
    # Average over the number of trained fold models instead of a hard-coded 5,
    # so the mean stays correct if the number of splits is changed.
    test_pred += cv_model[index].predict_proba(X_test[feature_name])[:,1] / len(cv_model)

print("CV score: ",np.mean(cv_score))

# Build the submission: threshold the averaged probability at 0.5.
# .copy() makes `submit` an independent frame, so adding the label column
# never writes into a slice of `test`.
submit = test[['sid']].copy()
submit['label'] = (test_pred>=0.5).astype(int)
print(submit['label'].value_counts())
submit.to_csv("submission.csv",index=False)

In [None]:
# LGB+Catboost+卡方校验 94.5

In [None]:

# -*- coding: utf-8 -*-
# @Time    : 2019/7/25 20:47
# @Author  : YYLin
# @Email   : 854280599@qq.com
# @File    : My_Method_For_LGB.py
# 本版本的变化有三个   删除了一些缺失值较多的属性 以及认为不重要的属性
# 增加一些混合属性    关于IP地址 app类型和广告位之间的数据统计
# 最后显示了数据集中哪些特征是重要特征
import warnings
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import f1_score
from datetime import timedelta, datetime
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
 
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
 
# Columns read from the train and test files
train_cols = ['ip', 'apptype', 'model', 'os', 'adunitshowid', 'mediashowid', 'nginxtime', 'label', 'sid']
test_cols = ['ip', 'apptype', 'model', 'os', 'adunitshowid', 'mediashowid', 'nginxtime', 'sid']
 
# Read the train and test sets and concatenate them
df_train = pd.read_csv('data/traindata.txt', sep='\t', usecols=train_cols)
df_test = pd.read_csv('data/testdata.txt', sep='\t', usecols=test_cols)
 
All_data_for_train = pd.concat([df_train, df_test], ignore_index=True).drop(columns='sid')
# Test rows have no label; mark them with -1.
All_data_for_train['label'] = All_data_for_train['label'].fillna(-1).astype(int)
# print('Check that the data was read correctly:\n', All_data_for_train.head(5))
 
# Add day / hour / minute columns
# nginxtime is divided by 1000 and parsed as seconds, then shifted by +8 hours.
All_data_for_train['datetime'] = pd.to_datetime(All_data_for_train['nginxtime'] / 1000, unit='s') + timedelta(hours=8)
All_data_for_train['hour'] = All_data_for_train['datetime'].dt.hour
# NOTE(review): day-of-month minus its minimum is only a valid day offset if
# the data does not cross a month boundary -- confirm.
All_data_for_train['day'] = All_data_for_train['datetime'].dt.day - All_data_for_train['datetime'].dt.day.min()
All_data_for_train['minute'] = All_data_for_train['datetime'].dt.minute.astype('uint8')
All_data_for_train.drop(['nginxtime'], axis=1, inplace=True)
# print('Check that the time information is correct:\n', All_data_for_train.head(5))
 
# Clean the device model column.
# Map OPPO internal model codes to readable model names. The result is
# assigned back to the column: calling Series.replace(..., inplace=True) on a
# column selection does not update the parent frame when pandas
# Copy-on-Write is active.
oppo_model_names = {
    'PACM00': "OPPO R15",
    'PBAM00': "OPPO A5",
    'PBEM00': "OPPO R17",
    'PADM00': "OPPO A3",
    'PBBM00': "OPPO A7",
    'PAAM00': "OPPO R15_1",
    'PACT00': "OPPO R15_2",
    'PABT00': "OPPO A5_1",
    'PBCM10': "OPPO R15x",
}
All_data_for_train['model'] = All_data_for_train['model'].replace(oppo_model_names)

# Remove upper/lower-case variants of the same value.
for text_col in ['model', 'os']:
    All_data_for_train[text_col] = All_data_for_train[text_col].astype('str').str.upper()
 
# Count how often each single attribute value occurs and rank those counts;
# the resulting count columns hold fairly large values.
# Two variants were tested: with and without these single-attribute features.
print('loading single attributes ...........\n')
cols_for_single_atr = ['ip', 'apptype', 'model', 'os', 'adunitshowid', 'mediashowid']
for i in tqdm(cols_for_single_atr):
    lbl = LabelEncoder()
    All_data_for_train[i + "_count"] = All_data_for_train.groupby([i])[i].transform('count')
    All_data_for_train[i + "_rank"] = All_data_for_train[i + "_count"].rank(method='min')
    # Replace the raw value by its integer label code.
    All_data_for_train[i] = lbl.fit_transform(All_data_for_train[i].astype(str))
# print('Data after the groupby step:\n', All_data_for_train.head(5))
 
# Count features over combinations of attributes
# First group: through which app the ad was accessed
# Second group: which IP address accessed the ad
print('\nloading Fusion_attributes........\n')
Fusion_attributes = ['apptype_adunitshowid', 'apptype_adunitshowid_mediashowid', 'apptype_mediashowid', 'apptype_adunitshowid_model_day_hour',
                     'apptype_model_day_hour', 'apptype_os_adunitshowid_day_hour',
 
                     'ip_day', 'ip_apptype_model_adunitshowid_day', 'ip_apptype_model_day', 'ip_apptype_model_day_os_hour',
                     'ip_apptype_os_adunitshowid', 'ip_os', 'ip_apptype_os_adunitshowid_day']
 
for attribute in tqdm(Fusion_attributes):
    name = "count_" + attribute
    # 'label' is only used as a column to count; it has no missing values
    # because unlabeled rows were filled with -1 earlier.
    dummy = 'label'
    # The attribute name encodes the group-by columns, separated by "_".
    cols = attribute.split("_")
    cols_with_dummy = cols.copy()
    cols_with_dummy.append(dummy)
    # Rows per combination of `cols`, merged back onto every row.
    gp = All_data_for_train[cols_with_dummy].groupby(by=cols)[[dummy]].count().reset_index().rename(index=str, columns={dummy: name})
    All_data_for_train = All_data_for_train.merge(gp, on=cols, how='left')
# print('Data after adding the combined-attribute counts:\n', All_data_for_train.head(5))
 
# Ratio features
print('\n loading Ratio for model: .........\n')
 
# Combine the label-encoded model and os codes into one "machine" id.
All_data_for_train["machine"] = 1000*All_data_for_train["model"] + All_data_for_train["os"]
Ratio_Attributes = ['ip_machine', 'machine_ip', 'apptype_adunitshowid', 'adunitshowid_apptype', 'apptype_mediashowid',
                    'mediashowid_apptype']
for attribute in Ratio_Attributes:
    name = "countRatio_" + attribute
    dummy = 'label'
    cols = attribute.split("_")
    cols_with_dummy = cols.copy()
    cols_with_dummy.append(dummy)
 
    # Ratio of two counts:
    # cnt1 = rows per full combination of `cols`;
    # cnt2 = rows per combination of all columns but the last one.
    gp1 = All_data_for_train[cols_with_dummy].groupby(by=cols)[[dummy]].count().reset_index().rename(index=str, columns={dummy: 'cnt1'})
    _df = All_data_for_train.merge(gp1, on=cols, how='left')
    gp2 = All_data_for_train[cols].groupby(by=cols[0:len(cols) - 1])[[cols[len(cols) - 1]]].count().reset_index().rename(index=str, columns={cols[len(cols) - 1]: 'cnt2'})
    _df['cnt2'] = All_data_for_train.merge(gp2, on=cols[0:len(cols) - 1], how='left')['cnt2']
 
    All_data_for_train[name] = _df['cnt1'] / _df['cnt2']
# print('Data after adding the ratio features:', All_data_for_train.head(5))
 
# Separate labelled rows (label != -1) from test rows (label == -1).
All_data_for_train = All_data_for_train.drop(columns='datetime')
Y_data = All_data_for_train.loc[All_data_for_train['label'] != -1]['label']
X_data = All_data_for_train.loc[All_data_for_train['label'] != -1].drop(columns='label')
# print('Training features:\n', X_data.head(5))
# print('Training labels:\n', Y_data.head(5))

X_test = All_data_for_train.loc[All_data_for_train['label'] == -1].drop(columns='label')
# print('Test features:\n', X_test.head(5))

# Hold out 20% for validation. The split is seeded (same seed as the model)
# so that the validation score and the early-stopping point are reproducible.
train_x, val_x, train_y, val_y = train_test_split(X_data, Y_data, test_size=0.2, random_state=2019)
 
 
def lgb_f1(labels, preds):
    """Custom evaluation metric: F1 score on rounded predictions.

    Parameters
    ----------
    labels : array-like
        True labels.
    preds : array-like
        Predicted scores; rounded with ``np.round`` before scoring.

    Returns
    -------
    tuple
        ``('f1', score, True)`` -- metric name, value, and a flag stating
        that a higher value is better.
    """
    hard_preds = np.round(preds)
    f1_value = f1_score(labels, hard_preds)
    return 'f1', f1_value, True
 
 
# Raised the learning rate and increased the number of boosting rounds
lgb = LGBMClassifier(random_seed=2019, n_jobs=-1, objective='binary',
                     learning_rate=0.1, n_estimators=6500, num_leaves=31, max_depth=-1,
                     min_child_samples=50, min_child_weight=9, subsample_freq=1,
                     subsample=0.7, colsample_bytree=0.7, reg_alpha=1, reg_lambda=5)
 
# Fit on the training part, monitoring F1 on both the training and the
# validation part.
# NOTE(review): whether fit() accepts early_stopping_rounds / verbose depends
# on the installed LightGBM version -- confirm.
lgb.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    eval_names=['train', 'val'],
    eval_metric=lgb_f1,
    early_stopping_rounds=400,
    verbose=10,
)
print('best score', lgb.best_score_)
 
# %%-------------------------------
print('predict')
# Refit on all labelled rows, using the best iteration found above as the
# number of trees.
lgb.n_estimators = lgb.best_iteration_
lgb.fit(X_data, Y_data)
test_y = lgb.predict(X_test)
# Pair each test sid with its predicted label and write a time-stamped CSV.
df_sub = pd.concat([df_test['sid'], pd.Series(test_y)], axis=1)
df_sub.columns = ['sid', 'label']
df_sub.to_csv('lgb_submit-{}.csv'.format(datetime.now().strftime('%m%d_%H%M%S')), sep=',', index=False)
 
# 2019-07-25: plot the model's feature importances
import matplotlib.pyplot as plt
import seaborn as sns
 
color = sns.color_palette()
sns.set_style('darkgrid')
 
features_list = X_data.columns.values
feature_importance = lgb.feature_importances_
# Ascending order, so the most important feature ends up at the top of the bar chart.
sorted_idx = np.argsort(feature_importance)
 
plt.figure(figsize=(5, 7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features_list[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.draw()
plt.show()

In [None]:
# -*- coding: utf-8 -*-
"""
@author: shaowu
注：此次会详细注释代码，往后都省略。
"""
import pandas as pd
import numpy as np
import time
import tqdm
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from sklearn import preprocessing
from collections import Counter
def one_hot_col(col):
    """Fit a label encoder on ``col`` and return the fitted encoder.

    Despite the function name, this is label (integer) encoding, not
    one-hot encoding.
    """
    encoder = preprocessing.LabelEncoder()
    return encoder.fit(col)
def calculate_null(data,key,col):
    '''
    Count the missing values of one column per group.

    params:
    data -- input DataFrame
    key -- the column name (or list of column names) to group by
    col -- the column whose missing values are counted
    return -- a DataFrame with the key column(s) followed by the column
              col+'_is_null', which holds the number of null values of
              `col` in each group
    '''
    null_col = col+'_is_null'
    # Flag nulls first and sum the flags: groupby 'count' would count the
    # non-null values instead, and the dict-renaming form of agg is rejected
    # by current pandas.
    flagged = data.assign(**{null_col: data[col].isnull().astype(int)})
    return flagged.groupby(key,as_index=False)[null_col].sum()
def xgb_model(new_train,y,new_test,lr):
    """Train XGBoost with 5-fold stratified CV and return OOF/test predictions.

    Parameters
    ----------
    new_train, new_test : array-like
        Feature matrices; indexed with integer index arrays, so NumPy arrays
        are presumably expected -- confirm against the caller.
    y : array-like
        Binary labels aligned with ``new_train``.
    lr : float
        Learning rate (``eta``).

    Returns
    -------
    tuple
        ``(oof_xgb, prediction_xgb)``: out-of-fold predictions for the
        training rows and the fold-averaged predictions for the test rows.
    """
    n_folds = 5
    booster_params = dict(
        booster='gbtree',
        eta=lr,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='auc',
        silent=True,
    )
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof_xgb = np.zeros(new_train.shape[0])
    prediction_xgb = np.zeros(new_test.shape[0])
    for fold_no, (tr, va) in enumerate(folds.split(new_train, y), start=1):
        print('fold:', fold_no, 'training')
        train_matrix = xgb.DMatrix(new_train[tr], y[tr])
        valid_matrix = xgb.DMatrix(new_train[va], y[va])
        booster = xgb.train(
            params=booster_params,
            dtrain=train_matrix,
            num_boost_round=30000,
            evals=[(train_matrix, 'train'), (valid_matrix, 'valid_data')],
            early_stopping_rounds=200,
            verbose_eval=50,
        )
        # Predict with the tree count selected by early stopping.
        best_limit = booster.best_ntree_limit
        oof_xgb[va] += booster.predict(xgb.DMatrix(new_train[va]), ntree_limit=best_limit)
        prediction_xgb += booster.predict(xgb.DMatrix(new_test), ntree_limit=best_limit)
    print('the roc_auc_score for train:', roc_auc_score(y, oof_xgb))
    # Test predictions were summed over the folds; turn the sum into a mean.
    prediction_xgb /= n_folds
    return oof_xgb, prediction_xgb
def lgb_model(new_train,y,new_test):
    """Train LightGBM with 5-fold stratified CV and return OOF/test predictions.

    Parameters
    ----------
    new_train, new_test : array-like
        Feature matrices; indexed with integer index arrays, so NumPy arrays
        are presumably expected -- confirm against the caller.
    y : array-like
        Binary labels aligned with ``new_train``.

    Returns
    -------
    tuple
        ``(oof_lgb, prediction_lgb, feature_importance_df)``: out-of-fold
        predictions, fold-averaged test predictions, and a DataFrame that
        stays empty because the importance collection is disabled.
    """
    n_folds = 5
    params = dict(
        learning_rate=0.01,
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        num_leaves=1000,
        verbose=-1,
        max_depth=-1,
        #  reg_alpha=2.2,
        #  reg_lambda=1.4,
        seed=42,
    )
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # Out-of-fold predictions for the training rows, filled fold by fold.
    oof_lgb = np.zeros(new_train.shape[0])
    # Sum of the per-fold test predictions; averaged after the loop.
    prediction_lgb = np.zeros(new_test.shape[0])
    # Placeholder for feature importances; not filled here.
    feature_importance_df = pd.DataFrame()
    for fold_no, (tr, va) in enumerate(folds.split(new_train, y), start=1):
        print('fold:', fold_no, 'training')
        train_set = lgb.Dataset(new_train[tr], y[tr])
        valid_set = lgb.Dataset(new_train[va], y[va], reference=train_set)
        # Train:
        booster = lgb.train(params, train_set, num_boost_round=30000, valid_sets=valid_set,
                            verbose_eval=400, early_stopping_rounds=200)
        best_iter = booster.best_iteration
        # Predict the validation fold:
        oof_lgb[va] += booster.predict(new_train[va], num_iteration=best_iter)
        # Predict the test set:
        prediction_lgb += booster.predict(new_test, num_iteration=best_iter)
        '''
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(new_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        '''

    # Offline AUC on the out-of-fold predictions.
    print('the roc_auc_score for train:', roc_auc_score(y, oof_lgb))
    prediction_lgb /= n_folds
    return oof_lgb, prediction_lgb, feature_importance_df

## Load the test data:
testdata= pd.read_csv("round1_iflyad_anticheat_testdata_feature.txt",sep='\t')
testdata['label']=-1 ## the test set has no labels; mark it with -1

testdata['begin_time']=testdata['sid'].apply(lambda x:int(x.split('-')[-1])) ## request session time: integer after the last '-' in sid
testdata['nginxtime-begin_time']=testdata['nginxtime']-testdata['begin_time'] ## difference between the request's arrival time at the server and the session time

## Load the training data:
traindata= pd.read_csv("round1_iflyad_anticheat_traindata.txt",sep='\t')

traindata['begin_time']=traindata['sid'].apply(lambda x:int(x.split('-')[-1]))
traindata['nginxtime-begin_time']=traindata['nginxtime']-traindata['begin_time']

## Combine the data to simplify feature extraction: axis=0 stacks rows; axis=1 would join columns
data=pd.concat([traindata,testdata],axis=0).reset_index(drop=True)

# Fixed the attribute typo `data.shpe`, which raised AttributeError.
print('the shape of data:',data.shape)

print(data.nunique()) ## number of distinct values in each column
print(data[:5]) ## first 5 rows
z=calculate_null(testdata,'sid','ver') ## missing-value statistics; not used further below yet

print('label distribution:\n',traindata['label'].value_counts()) ## label distribution of the training set

# The builtin `object` replaces the `np.object` alias, which NumPy 1.24 removed.
object_cols=list(data.dtypes[data.dtypes==object].index) ## names of the object-dtype columns
print(data.dtypes[data.dtypes==object].index) ## show the object-dtype columns

## The timestamps are in milliseconds, so divide by 1000 to get seconds
## before formatting the first one as a date.
print(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(data['nginxtime'][0]/1000)))

## Label-encode the object-dtype columns (the id column 'sid' is left as is):
for col in object_cols:
    if col!='sid':
        # Fit the encoder on the string form of the column and transform that same column.
        data[col]=one_hot_col(data[col].astype(str)).transform(data[col].astype(str))

## Split the data back: the first traindata.shape[0] rows are the training rows.
train=data[:traindata.shape[0]]
label=train['label'].values
test=data[traindata.shape[0]:].reset_index(drop=True)

## Train the model and predict; id, label, raw time and IP columns are
## dropped from the features.
oof_lgb,prediction_lgb,feature_importance_df=\
      lgb_model(np.array(train.drop(['sid','label','nginxtime','ip','reqrealip','begin_time'],axis=1)),\
                label,\
                np.array(test.drop(['sid','label','nginxtime','ip','reqrealip','begin_time'],axis=1)))

## Build the result frame:
# .copy() makes `sub` an independent frame, so adding the label column never
# writes into a slice of `test`.
sub=test[['sid']].copy()
## probabilities above 0.5 become 1, everything else 0
sub['label']=(prediction_lgb>0.5).astype(int)
print('test pre_label distribution:\n',sub['label'].value_counts()) ## label distribution predicted for the test set
