In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
import time
import matplotlib.pyplot as plt
import math

In [2]:
train = pd.read_csv('/Users/liuchang/Data/ent_data/train.csv')
entbase = pd.read_csv('/Users/liuchang/Data/ent_data/1entbase.csv')
alter = pd.read_csv('/Users/liuchang/Data/ent_data/2alter.csv')
branch = pd.read_csv('/Users/liuchang/Data/ent_data/3branch.csv')
invest = pd.read_csv('/Users/liuchang/Data/ent_data/4invest.csv')
right = pd.read_csv('/Users/liuchang/Data/ent_data/5right.csv')
project = pd.read_csv('/Users/liuchang/Data/ent_data/6project.csv')
lawsuit = pd.read_csv('/Users/liuchang/Data/ent_data/7lawsuit.csv')
breakfaith = pd.read_csv('/Users/liuchang/Data/ent_data/8breakfaith.csv')
recruit = pd.read_csv('/Users/liuchang/Data/ent_data/9recruit.csv')
qualification = pd.read_csv('/Users/liuchang/Data/ent_data/10qualification.csv', encoding='gbk')
test = pd.read_csv('/Users/liuchang/Data/ent_data/evaluation_public.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def translate_date(date):
    year = int(date[:4])
    month = int(date[-2:])
    return (year - 2010) * 12 + month

In [33]:
def get_interaction_feature(df, feature_A, feature_B):
    feature_A_list = sorted(df[feature_A].unique())
    feature_B_list = sorted(df[feature_B].unique())
    count = 0
    mydict = {}
    for i in feature_A_list:
        mydict[int(i)] = {}
        for j in feature_B_list:
            mydict[int(i)][int(j)] = count
            count += 1
    return df.apply(lambda x: mydict[int(x[feature_A])][int(x[feature_B])], axis=1)

In [34]:
def get_entbase_feature(df):
    df = df.copy()
    mydf = df.fillna(value={'HY':0, 'ZCZB':0, 'MPNUM': 0, 'INUM': 0, 'ENUM': 0, 'FINZB': 0, 'FSTINUM': 0, 'TZINUM': 0})
    mydf['allnum'] = mydf['MPNUM'] + mydf['INUM'] + mydf['MPNUM'] + mydf['TZINUM'] + mydf['ENUM'] 
    mydf['zczb/rgyear'] = mydf['ZCZB'] / mydf['RGYEAR']
    mydf['rgyear_zczb'] = get_interaction_feature(mydf, "RGYEAR", "ZCZB")
    mydf['rgyear_finzb'] = get_interaction_feature(mydf, 'RGYEAR', 'FINZB')
    mydf['rgyear_inum'] = get_interaction_feature(mydf, 'RGYEAR', 'INUM')
    mydf['rgyear_enum'] = get_interaction_feature(mydf, 'RGYEAR', 'ENUM')
    mydf['rgyear_fstinum'] = get_interaction_feature(mydf, 'RGYEAR', 'FSTINUM')
    mydf['rgyear_tzinum'] = get_interaction_feature(mydf, 'RGYEAR', 'TZINUM')
    mydf['rgyear_mpnum'] = get_interaction_feature(mydf, 'RGYEAR', 'MPNUM')
    mydf['zczb_rgyear'] = get_interaction_feature(mydf, 'ZCZB', 'RGYEAR')
    mydf['zczb_finzb'] = get_interaction_feature(mydf, 'ZCZB', 'FINZB')
    mydf['zczb_inum'] = get_interaction_feature(mydf, 'ZCZB', 'INUM')
    mydf['zczb_enum'] = get_interaction_feature(mydf, 'ZCZB', 'ENUM')
    mydf['zczb_fstinum'] = get_interaction_feature(mydf, 'ZCZB', 'FSTINUM')
    mydf['zczb_tzinum'] = get_interaction_feature(mydf, 'ZCZB', 'TZINUM')
    mydf['zczb_mpnum'] = get_interaction_feature(mydf, 'ZCZB', 'MPNUM')
    
    return mydf 
    

In [35]:
def get_alter_feature(df):
    df = df.copy()
    alt_no = df.groupby(['EID', 'ALTERNO']).size().reset_index()
    alt_no = alt_no.groupby('EID')[0].agg([sum, len]).reset_index()
    alt_no.columns = ['EID', 'alt_count', 'alt_types_count']
    alt_no_oh = df.groupby(['EID', 'ALTERNO']).size().unstack().reset_index()
    alt_no_oh.columns = [i if i == 'EID' else 'alt_' + i for i in alt_no_oh.columns]
    
    df['date'] = df['ALTDATE'].apply(translate_date)
    date = df.groupby('EID')['date'].agg([min, max, np.ptp, np.std]).reset_index()
    date.columns = ['EID', 'alt_date_min', 'alt_date_max', 'alt_date_ptp', 'alt_date_std']

    df['altbe'] = df['ALTBE'].str.extract('(\d+\.?\d*)').astype(float)
    df['altaf'] = df['ALTAF'].str.extract('(\d+\.?\d*)').astype(float)
    alt_be_af = df.groupby('EID')['altbe', 'altaf'].agg([min, max, np.mean]).reset_index()
    alt_be_af.columns = ['EID', 'alt_be_min', 'alt_be_max', 'alt_be_mean', 'alt_af_min', 'alt_af_max', 'alt_af_mean']
    

    df['alt_be_af_gap'] = df['altaf'] - df['altbe']
    alt_be_af_gap = df.groupby('EID')['alt_be_af_gap'].agg([min, max, np.mean, np.ptp, np.std]).reset_index()
    alt_be_af_gap.columns = [i if i == 'EID' else 'alt_be_af_gap_' + i for i in alt_be_af_gap.columns]
    
    alt_1year = df[df['ALTDATE'] >= '2015-01'].groupby('EID').size().reset_index()
    alt_1year.columns = ['EID', 'alt_num(1year)']
    alt_2year = df[df['ALTDATE'] >= '2014-01'].groupby('EID').size().reset_index()
    alt_2year.columns = ['EID', 'alt_num(2year)']
    alt_3year = df[df['ALTDATE'] >= '2013-01'].groupby('EID').size().reset_index()
    alt_3year.columns = ['EID', 'alt_num(3year)']
    
    mydf = pd.merge(alt_no, alt_no_oh, how='left', on='EID')
    mydf = pd.merge(mydf, date, how='left', on='EID')
    mydf = pd.merge(mydf, alt_be_af, how='left', on='EID')
    mydf = pd.merge(mydf, alt_be_af_gap, how='left', on='EID')
    mydf = pd.merge(mydf, alt_1year, how='left', on='EID')
    mydf = pd.merge(mydf, alt_2year, how='left', on='EID')
    mydf = pd.merge(mydf, alt_3year, how='left', on='EID')
    
    return mydf

In [50]:
def get_right_feature(df):
    
    df = df.copy()
    rig_type = df.groupby(['EID', 'RIGHTTYPE']).size().reset_index()
    rig_type = rig_type.groupby('EID')[0].agg([sum, len]).reset_index()
    rig_type.columns = ['EID', 'rig_count', 'rig_types_count']
    
    rig_type_oh_rate = df.groupby(['EID', 'RIGHTTYPE']).size().unstack().reset_index()
    rig_type_oh_rate.iloc[:, 1:] = rig_type_oh_rate.iloc[:, 1:].div(rig_type['rig_count'], axis='index')
    rig_type_oh_rate.columns = [i if i == 'EID' else 'rig_rate_' + str(i) for i in rig_type_oh_rate.columns]
    
    df['ask_month'] = (pd.to_datetime(df['ASKDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    ask_date = df.groupby('EID')['ask_month'].agg([max, min, np.ptp, np.std]).reset_index()
    ask_date.columns = ['EID', 'rig_askdate_max', 'rig_askdate_min', 'rig_askdate_ptp', 'rig_askdate_std']
    
    df['get_month'] = (pd.to_datetime(df['FBDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    get_date = df.groupby('EID')['get_month'].agg([max, min, np.ptp, np.std]).reset_index()
    get_date.columns = ['EID', 'rig_getdate_max', 'rig_getdate_min', 'rig_getdate_ptp', 'rig_getdate_std']
    
    
    unget = df[df.FBDATE.isnull()]
    unget = unget.groupby('EID').size().reset_index()
    unget.columns = ['EID', 'rig_unget_num']
    
    right_1year = df[df['ASKDATE'] >= '2015-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_1year.columns = ['EID', 'ask_num(1year)']
    right_2year = df[df['ASKDATE'] >= '2014-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_2year.columns = ['EID', 'ask_num(2year)']
    right_5year = df[df['ASKDATE'] >= '2010-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_5year.columns = ['EID', 'ask_num(5year)']
    right_end_1year = df[df['FBDATE'] >= '2015-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_1year.columns = ['EID', 'get_num(1year)']
    right_end_2year = df[df['FBDATE'] >= '2014-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_2year.columns = ['EID', 'get_num(2year)']
    right_end_5year = df[df['FBDATE'] >= '2010-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_5year.columns = ['EID', 'get_num(5year)']
    
    df['ask_get_month_gap'] = df['get_month'] - df['ask_month']
    ask_get_month_gap = df.groupby('EID')['ask_get_month_gap'].agg([max, min, np.ptp, np.mean, np.std]).reset_index()
    ask_get_month_gap.columns = [i if i == 'EID' else 'ask_get_month_gap_' + i for i in ask_get_month_gap.columns]

    mydf = pd.merge(rig_type, rig_type_oh_rate, how='left', on='EID')
    mydf = pd.merge(mydf, ask_date, how='left', on='EID')
    mydf = pd.merge(mydf, get_date, how='left', on='EID')
    mydf = pd.merge(mydf, unget, how='left', on='EID')
    mydf = pd.merge(mydf, right_1year, how='left', on='EID')
    mydf = pd.merge(mydf, right_2year, how='left', on='EID')
    mydf = pd.merge(mydf, right_5year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_1year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_2year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_5year, how='left', on='EID')
    mydf = pd.merge(mydf, ask_get_month_gap, how='left', on='EID')
    
    mydf['ask_rate(1year)'] = mydf['ask_num(1year)'] / mydf['rig_count']
    mydf['ask_rate(2year)'] = mydf['ask_num(2year)'] / mydf['rig_count']
    mydf['ask_rate(5year)'] = mydf['ask_num(5year)'] / mydf['rig_count']
    mydf['get_rate(1year)'] = mydf['get_num(1year)'] / mydf['rig_count']
    mydf['get_rate(2year)'] = mydf['get_num(2year)'] / mydf['rig_count']
    mydf['get_rate(5year)'] = mydf['get_num(5year)'] / mydf['rig_count']
    
    return mydf
    

In [51]:
def get_recruit_feature(df):
    
    df = df.copy()
    rec_wz = df.groupby(['EID', 'WZCODE']).size().reset_index()
    rec_wz = rec_wz.groupby('EID')[0].agg([sum, len]).reset_index()
    rec_wz.columns = ['EID', 'rec_wz_count', 'rec_wz_types_count']
    
    rec_wz_oh = df.groupby(['EID', 'WZCODE']).size().unstack().reset_index()
    rec_wz_oh.columns = [i if i == 'EID' else 'rec_wz_' + i for i in rec_wz_oh.columns]
    
    rec_pos = df.groupby(['EID', 'POSCODE']).size().reset_index()
    rec_pos = rec_pos.groupby('EID')[0].agg([sum, len]).reset_index()
    rec_pos.columns = ['EID', 'rec_pos_count', 'rec_pos_types_count']
    
    df['recdate'] = (pd.to_datetime(df['RECDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    rec_date = df.groupby('EID')['recdate'].agg([max, min, np.ptp, np.std]).reset_index()
    rec_date.columns = ['EID', 'rec_date_max', 'rec_date_min', 'rec_date_ptp', 'rec_date_std']
    
    df['pnum'] = df['PNUM'].str.extract('(\d+)').fillna(1).astype(int)  # 若干=1
    rec_num = df.groupby('EID')['pnum'].agg([sum, max, min, np.ptp, np.std]).reset_index()
    rec_num.columns = ['EID' if i == 'EID' else 'rec_num_' + i for i in rec_num.columns]
    
    rec_count = df.groupby('EID').size().reset_index()
    rec_count.columns = ['EID', 'rec_count']
    
    mydf = pd.merge(rec_wz, rec_wz_oh, how='left', on='EID')
    mydf = pd.merge(mydf, rec_pos, how='left', on='EID')
    mydf = pd.merge(mydf, rec_date, how='left', on='EID')
    mydf = pd.merge(mydf, rec_num, how='left', on='EID')
    mydf = pd.merge(mydf, rec_count, how='left', on='EID')
    
    return mydf

In [52]:
def get_branch_feature(df):
    df = df.copy()
    bra_num = df.groupby('EID')['TYPECODE'].size().reset_index()
    bra_num.columns = ['EID', 'bra_count']
    
    bra_home = df.groupby(['EID', 'IFHOME']).size().unstack().reset_index()
    bra_home.columns = ['EID', 'bra_nothome', 'bra_home']
    
    bra_year = df.groupby('EID')['B_REYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    bra_year.columns = [i if i == 'EID' else 'bra_year_' + i for i in bra_year.columns]
    
    bra_endyear = df.groupby('EID')['B_ENDYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    bra_endyear.columns = [i if i == 'EID' else 'bra_endyear_' + i for i in bra_endyear.columns]
    
    bra_end_num = df[~df['B_ENDYEAR'].isnull()].groupby('EID').size().reset_index()
    bra_end_num.columns = ['EID', 'bra_end_num']
    bra_notend_num = df[df['B_ENDYEAR'].isnull()].groupby('EID').size().reset_index()
    bra_notend_num.columns = ['EID', 'bra_notend_num']
    
    df['bra_begin_end_gap'] = df['B_ENDYEAR'] - df['B_REYEAR']
    bra_begin_end_gap = df.groupby('EID')['bra_begin_end_gap'].agg([min, max, np.ptp, np.mean, np.std]).reset_index()
    bra_begin_end_gap.columns = [i if i == 'EID' else 'bra_begin_end_gap_' + i for i in bra_begin_end_gap.columns]
    
    mydf = pd.merge(bra_num, bra_home, how='left', on='EID')
    mydf = pd.merge(mydf, bra_year, how='left', on='EID')
    mydf = pd.merge(mydf, bra_endyear, how='left', on='EID')
    mydf = pd.merge(mydf, bra_notend_num, how='left', on='EID')
    mydf = pd.merge(mydf, bra_begin_end_gap, how='left', on='EID')
    
    return mydf
    

In [53]:
def get_invest_feature(df):
    df = df.copy()
    inv_num = df.groupby('EID').size().reset_index()
    inv_num.columns = ['EID', 'inv_count']
    
    inv_home = df.groupby(['EID', 'IFHOME']).size().unstack().reset_index()
    inv_home.columns = ['EID', 'inv_nothome_num', 'inv_home_num']
    
    inv_bl = df.groupby('EID')['BTBL'].agg([sum, min, max, np.ptp, np.std]).reset_index()
    inv_bl.columns = [i if i == 'EID' else 'inv_bl_' + i for i in inv_bl.columns]
    
    inv_year = df.groupby('EID')['BTYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inv_year.columns = [i if i == 'EID' else 'inv_year_' + i for i in inv_year.columns]
    
    inv_endyear = df.groupby('EID')['BTENDYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inv_endyear.columns = [i if i == 'EID' else 'inv_endyear_' + i for i in inv_endyear.columns]
    
    inved_num = df.groupby('BTEID').size().reset_index()
    inved_num.columns = ['EID', 'inved_num']
    
    inved_home = df.groupby(['BTEID', 'IFHOME']).size().unstack().reset_index()
    inved_home.columns = ['EID', 'inved_nothome_num', 'inved_home_num']
    
    inved_bl = df.groupby('BTEID')['BTBL'].agg([sum, min, max, np.ptp, np.std]).reset_index()
    inved_bl.columns = ['EID' if i == 'BTEID' else 'inved_bl_' + i for i in inved_bl.columns]
    
    inved_year = df.groupby('BTEID')['BTYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inved_year.columns = ['EID' if i == 'BTEID' else 'inved_year_' + i for i in inved_year.columns]
    
    inved_endyear = df.groupby('BTEID')['BTENDYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inved_endyear.columns = ['EID' if i == 'BTEID' else 'inved_endyear_' + i for i in inved_endyear.columns]
    
    df['inv_begin_end_gap'] = df['BTENDYEAR'] - df['BTYEAR']
    inv_begin_end_gap = df.groupby('EID')['inv_begin_end_gap'].agg([min, max, np.ptp, np.mean, np.std]).reset_index()
    
    inv_begin_end_gap.columns = [i if i == 'EID' else 'inv_begin_end_gap_' + i for i in inv_begin_end_gap.columns]
    
    mydf = pd.merge(inv_num, inv_home, how='left', on='EID')
    mydf = pd.merge(mydf, inv_bl, how='left', on='EID')
    mydf = pd.merge(mydf, inv_year, how='left', on='EID')
    mydf = pd.merge(mydf, inv_endyear, how='left', on='EID')
    mydf = pd.merge(mydf, inved_num, how='left', on='EID')
    mydf = pd.merge(mydf, inved_bl, how='left', on='EID')
    mydf = pd.merge(mydf, inved_year, how='left', on='EID')
    mydf = pd.merge(mydf, inved_endyear, how='left', on='EID')
    mydf = pd.merge(mydf, inv_begin_end_gap, how='left', on='EID')
    
    return mydf


In [40]:
def get_lawsuit_feature(df):
    df = df.copy()
    law_num = df.groupby('EID').size().reset_index()
    law_num.columns = ['EID', 'law_count']
    
    df['lawdate'] = df['LAWDATE'].apply(lambda x: x.replace('年', '-').replace('月', '')).apply(translate_date)
    law_date = df.groupby('EID')['lawdate'].agg([min, max, np.ptp, np.std]).reset_index()
    law_date.columns = [i if i == 'EID' else 'law_date_' + i for i in law_date.columns]
    
    law_amout = df.groupby('EID')['LAWAMOUNT'].agg([sum, min, max, np.mean, np.ptp, np.std]).reset_index()
    law_amout.columns = [i if i == 'EID' else 'law_amout_' + i for i in law_amout.columns]
    
    mydf = pd.merge(law_num, law_date, how='left', on='EID')
    mydf = pd.merge(mydf, law_amout, how='left', on='EID')
    
    return mydf
    

In [41]:
def get_project_feature(df):
    df = df.copy()
    pro_num = df.groupby('EID').size().reset_index()
    pro_num.columns = ['EID', 'pro_count']
    
    df['djdate'] = df['DJDATE'].apply(translate_date)
    pro_date = df.groupby('EID')['djdate'].agg([min, max, np.ptp, np.std]).reset_index()
    pro_date.columns = [i if i == 'EID' else 'pro_date_' + i for i in pro_date.columns]
    
    pro_home = df.groupby(['EID', 'IFHOME']).size().unstack().reset_index()
    pro_home.columns = ['EID', 'pro_nothome_num', 'pro_home_num']
    
    mydf = pd.merge(pro_num, pro_date, how='left', on='EID')
    mydf = pd.merge(mydf, pro_home, how='left', on='EID')
    
    return mydf

In [42]:
def get_qualification_feature(df):
    df = df.copy()
    qua_num = df.groupby('EID').size().reset_index()
    qua_num.columns = ['EID', 'qua_count']
    
    qua_type = df.groupby(['EID', 'ADDTYPE']).size().unstack().reset_index()
    qua_type.columns = [i if i == 'EID' else 'qua_type_' + str(i) for i in qua_type.columns]
    
    df['begindate'] = df['BEGINDATE'].apply(lambda x: x.replace(u'年', '-').replace(u'月', '')).apply(translate_date)
    qua_begindate = df.groupby('EID')['begindate'].agg([min, max, np.ptp, np.std]).reset_index()
    qua_begindate.columns = [i if i == 'EID' else 'qua_begindate_' + i for i in qua_begindate.columns]
    
    df['expirydate'] = df['EXPIRYDATE'].apply(lambda x: x.replace(u'年', '-').replace(u'月', '') if not pd.isnull(x) else np.nan)
    df['expirydate'] = (pd.to_datetime(df['expirydate']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    qua_expirydate = df.groupby('EID')['expirydate'].agg([min, max, np.ptp, np.std]).reset_index()
    qua_expirydate.columns = [i if i == 'EID' else 'qua_expirydate_' + i for i in qua_expirydate.columns]
    
    df['qua_begin_expiry_gap'] = df['expirydate'] - df['begindate']
    qua_begin_expiry_gap = df.groupby('EID')['qua_begin_expiry_gap'].agg([min, max, np.ptp, np.mean, np.std]).reset_index()
    qua_begin_expiry_gap.columns = [i if i == 'EID' else 'qua_begin_expiry_gap_' + i for i in qua_begin_expiry_gap.columns]
    
    mydf = pd.merge(qua_num, qua_type, how='left', on='EID')
    mydf = pd.merge(mydf, qua_begindate, how='left', on='EID')
    mydf = pd.merge(mydf, qua_expirydate, how='left', on='EID')
    mydf = pd.merge(mydf, qua_begin_expiry_gap, how='left', on='EID')
    
    return mydf

In [43]:
def get_breakfaith_feature(df):
    df = df.copy()
    bre_num = df.groupby('EID').size().reset_index()
    bre_num.columns = ['EID', 'bre_count']
    df['fbdate'] = df['FBDATE'].apply(lambda x: x.replace('年', '-').replace('月', '')).apply(translate_date)
    bre_date = df.groupby('EID')['fbdate'].agg([min, max, np.ptp, np.std]).reset_index()
    bre_date.columns = [i if i == 'EID' else 'bre_date_' + i for i in bre_date.columns]
    
    df['sxenddate'] = (pd.to_datetime(df['SXENDDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    bre_enddate = df.groupby('EID')['sxenddate'].agg([min, max, np.ptp, np.std]).reset_index()
    bre_enddate.columns = [i if i == 'EID' else 'bre_enddate_' + i for i in bre_enddate.columns]
    
    df['bre_begin_end_gap'] = df['sxenddate'] - df['fbdate']
    bre_begin_end_gap = df.groupby('EID')['bre_begin_end_gap'].agg([min, max, np.ptp, np.mean, np.std]).reset_index()
    bre_begin_end_gap.columns = [i if i == 'EID' else 'bre_begin_end_gap_' + i for i in bre_begin_end_gap.columns]
    
    mydf = pd.merge(bre_num, bre_date, how='left', on='EID')
    mydf = pd.merge(mydf, bre_enddate, how='left', on='EID')
    mydf = pd.merge(mydf, bre_begin_end_gap, how='left', on='EID')
    
    return mydf

In [44]:
entbase_feat = get_entbase_feature(entbase)

In [46]:
entbase_feat

Unnamed: 0,PROV,EID,RGYEAR,HY,ZCZB,ETYPE,MPNUM,INUM,ENUM,FINZB,...,rgyear_fstinum,rgyear_tzinum,rgyear_mpnum,zczb_rgyear,zczb_finzb,zczb_inum,zczb_enum,zczb_fstinum,zczb_tzinum,zczb_mpnum
0,11,p13836,1980,51.0,480.0,13,25.0,1.0,1.0,98041.0000,...,15921,704,7225,6560,605658,9601,3841,47729,2112,21625
1,11,p49137,1993,74.0,30.0,13,0.0,1.0,1.0,570.0000,...,22368,990,10125,3513,319893,5101,2041,25350,1122,11475
2,11,p108756,2015,69.0,1000.0,6,2.0,1.0,1.0,1000.0000,...,33299,1474,15077,10131,926922,14801,5921,73556,3256,33302
3,11,p7320,2005,74.0,240.0,5,0.0,2.0,2.0,1500.0000,...,28329,1254,12825,4953,451886,7202,2882,35784,1584,16200
4,11,p194070,2014,51.0,100000.0,3,3.0,2.0,2.0,1000.0000,...,32809,1452,14853,53378,4905102,78402,31362,389655,17248,176403
5,11,p120116,2009,51.0,800.0,6,3.0,1.0,1.0,5000.0000,...,30321,1342,13728,8765,803134,12801,5121,63620,2816,28803
6,11,p197225,2011,75.0,3000.0,6,4.0,8.0,1.0,10500.0000,...,31324,1386,14179,11487,1054232,16808,6721,83509,3696,37804
7,11,p60345,1995,54.0,14200.0,5,0.0,2.0,2.0,11280.0000,...,23364,1034,10575,19087,1754881,28002,11202,139165,6160,63000
8,11,p124661,2002,48.0,2800.0,7,0.0,22.0,4.0,100.0000,...,26838,1188,12150,11342,1038619,16622,6644,82502,3652,37350
9,11,p107376,2004,51.0,50.0,6,0.0,2.0,2.0,100.0000,...,27832,1232,12600,3660,331804,5302,2122,26341,1166,11925


In [47]:
alter_feat = get_alter_feature(alter)

  del sys.path[0]
  


In [48]:
alter_feat

Unnamed: 0,EID,alt_count,alt_types_count,alt_01,alt_02,alt_03,alt_04,alt_05,alt_10,alt_12,...,alt_af_max,alt_af_mean,alt_be_af_gap_min,alt_be_af_gap_max,alt_be_af_gap_mean,alt_be_af_gap_ptp,alt_be_af_gap_std,alt_num(1year),alt_num(2year),alt_num(3year)
0,p10000,1,1,,1.0,,,,,,...,,,,,,,,1.0,1.0,1
1,p100004,2,2,,,,,1.0,,,...,100.0,100.0,90.0,90.0,90.0,0.0,,2.0,2.0,2
2,p100007,1,1,,,,,,,1.0,...,,,,,,,,,1.0,1
3,p100010,4,4,,,,1.0,1.0,1.0,1.0,...,10.0,10.0,7.0,7.0,7.0,0.0,,,4.0,4
4,p100016,1,1,,,,,1.0,,,...,118.0,118.0,108.0,108.0,108.0,0.0,,1.0,1.0,1
5,p100018,1,1,,,,,,1.0,,...,,,,,,,,1.0,1.0,1
6,p100022,3,3,,,,,1.0,,1.0,...,100.0,100.0,97.0,97.0,97.0,0.0,,1.0,3.0,3
7,p100031,3,3,,,,,1.0,1.0,,...,228.0,228.0,178.0,178.0,178.0,0.0,,2.0,3.0,3
8,p100034,1,1,,,,,,,,...,,,,,,,,,1.0,1
9,p100043,1,1,,,,,1.0,,,...,100.0,100.0,97.0,97.0,97.0,0.0,,1.0,1.0,1


In [54]:
right_feature = get_right_feature(right)

In [55]:
right_feature

Unnamed: 0,EID,rig_count,rig_types_count,rig_rate_11,rig_rate_12,rig_rate_20,rig_rate_30,rig_rate_40,rig_rate_50,rig_rate_60,...,ask_get_month_gap_min,ask_get_month_gap_ptp,ask_get_month_gap_mean,ask_get_month_gap_std,ask_rate(1year),ask_rate(2year),ask_rate(5year),get_rate(1year),get_rate(2year),get_rate(5year)
0,p100000,9,2,,,,,0.777778,,0.222222,...,0.0,16.0,9.500000,7.422937,0.111111,0.444444,1.000000,0.333333,0.444444,0.666667
1,p100001,6,1,,,,,,,1.000000,...,0.0,0.0,0.000000,0.000000,,,1.000000,,,1.000000
2,p10001,3,1,,,,,,,1.000000,...,0.0,0.0,0.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,p100014,1,1,,,,,1.000000,,,...,21.0,0.0,21.000000,,,,1.000000,,1.000000,1.000000
4,p100022,9,2,,,,,0.444444,,0.555556,...,0.0,14.0,6.111111,7.253352,,1.000000,1.000000,0.444444,1.000000,1.000000
5,p100028,4,2,,,,,0.750000,,0.250000,...,0.0,16.0,10.000000,8.717798,,1.000000,1.000000,0.500000,0.750000,0.750000
6,p10003,1,1,,,,,,,1.000000,...,0.0,0.0,0.000000,,,,1.000000,,,1.000000
7,p100031,1,1,,,,,,,1.000000,...,0.0,0.0,0.000000,,,,1.000000,,,1.000000
8,p100036,1,1,,,,,,,1.000000,...,0.0,0.0,0.000000,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
9,p100038,2,2,,,,,0.500000,,0.500000,...,0.0,14.0,7.000000,9.899495,,,1.000000,,,1.000000


In [56]:
recruit_feat = get_recruit_feature(recruit)



In [57]:
recruit_feat

Unnamed: 0,EID,rec_wz_count,rec_wz_types_count,rec_wz_zp01,rec_wz_zp02,rec_wz_zp03,rec_pos_count,rec_pos_types_count,rec_date_max,rec_date_min,rec_date_ptp,rec_date_std,rec_num_sum,rec_num_max,rec_num_min,rec_num_ptp,rec_num_std,rec_count
0,p100004,9,2,,3.0,6.0,9,9,67,59,8,4.216370,32,6,1,5,2.403701,9
1,p100008,9,2,4.0,,5.0,9,9,67,57,10,5.019407,9,1,1,0,0.000000,9
2,p100032,4,1,,,4.0,4,4,54,54,0,0.000000,6,2,1,1,0.577350,4
3,p100037,11,3,1.0,3.0,7.0,11,11,63,52,11,4.358899,27,10,1,9,2.659460,11
4,p100038,4,2,,1.0,3.0,4,4,66,58,8,4.618802,4,1,1,0,0.000000,4
5,p100049,3,1,,3.0,,3,3,59,59,0,0.000000,3,1,1,0,0.000000,3
6,p100057,38,3,6.0,28.0,4.0,38,38,64,54,10,3.039451,78,21,1,20,3.195837,38
7,p100072,4,1,,4.0,,4,4,66,66,0,0.000000,6,2,1,1,0.577350,4
8,p100087,5,2,,2.0,3.0,5,5,67,56,11,6.024948,5,1,1,0,0.000000,5
9,p100089,1,1,,1.0,,1,1,65,65,0,,10,10,10,0,,1


In [58]:
branch_feat = get_branch_feature(branch)

In [59]:
branch_feat

Unnamed: 0,EID,bra_count,bra_nothome,bra_home,bra_year_min,bra_year_max,bra_year_ptp,bra_year_std,bra_endyear_min,bra_endyear_max,bra_endyear_ptp,bra_endyear_std,bra_notend_num,bra_begin_end_gap_min,bra_begin_end_gap_max,bra_begin_end_gap_ptp,bra_begin_end_gap_mean,bra_begin_end_gap_std
0,p100035,1,,1.0,2014,2014,0,,,,,,1.0,,,,,
1,p100037,1,,1.0,2013,2013,0,,,,,,1.0,,,,,
2,p100118,2,,2.0,2008,2008,0,0.000000,2011.0,2011.0,0.0,,1.0,3.0,3.0,0.0,3.000000,
3,p100164,19,,19.0,2010,2015,5,1.887168,2015.0,2015.0,0.0,,18.0,4.0,4.0,0.0,4.000000,
4,p100187,7,,7.0,2013,2014,1,0.534522,,,,,7.0,,,,,
5,p100205,1,,1.0,2008,2008,0,,,,,,1.0,,,,,
6,p100206,2,,2.0,2012,2015,3,2.121320,,,,,2.0,,,,,
7,p100230,1,,1.0,2006,2006,0,,2012.0,2012.0,0.0,,,6.0,6.0,0.0,6.000000,
8,p100266,1,,1.0,2008,2008,0,,2011.0,2011.0,0.0,,,3.0,3.0,0.0,3.000000,
9,p100304,1,,1.0,2009,2009,0,,2009.0,2009.0,0.0,,,0.0,0.0,0.0,0.000000,


In [60]:
invest_feat = get_invest_feature(invest)

In [61]:
invest_feat

Unnamed: 0,EID,inv_count,inv_nothome_num,inv_home_num,inv_bl_sum,inv_bl_min,inv_bl_max,inv_bl_ptp,inv_bl_std,inv_year_min,...,inved_year_std,inved_endyear_min,inved_endyear_max,inved_endyear_ptp,inved_endyear_std,inv_begin_end_gap_min,inv_begin_end_gap_max,inv_begin_end_gap_ptp,inv_begin_end_gap_mean,inv_begin_end_gap_std
0,p100164,4,,4.0,4.000,1.000,1.000,0.000,0.000000,2015,...,,,,,,,,,,
1,p100184,1,,1.0,0.033,0.033,0.033,0.000,,2013,...,,,,,,,,,,
2,p10019,5,2.0,3.0,4.100,0.500,1.000,0.500,0.248998,2014,...,,,,,,,,,,
3,p100206,2,,2.0,1.341,0.341,1.000,0.659,0.465983,2012,...,,,,,,0.0,0.0,0.0,0.0,
4,p100217,1,,1.0,0.225,0.225,0.225,0.000,,2015,...,,,,,,,,,,
5,p100220,3,,3.0,0.887,0.007,0.490,0.483,0.254944,2002,...,,,,,,4.0,4.0,0.0,4.0,
6,p100234,2,,2.0,1.000,0.500,0.500,0.000,0.000000,1992,...,,,,,,,,,,
7,p100271,1,,1.0,0.150,0.150,0.150,0.000,,2011,...,,,,,,,,,,
8,p100283,3,,3.0,1.537,0.510,0.517,0.007,0.004041,2011,...,,,,,,,,,,
9,p100286,1,,1.0,1.000,1.000,1.000,0.000,,2012,...,,,,,,,,,,


In [62]:
lawsuit_feat = get_lawsuit_feature(lawsuit)

In [63]:
lawsuit_feat

Unnamed: 0,EID,law_count,law_date_min,law_date_max,law_date_ptp,law_date_std,law_amout_sum,law_amout_min,law_amout_max,law_amout_mean,law_amout_ptp,law_amout_std
0,p100022,2,56,56,0,0.000000,122000,61000,61000,6.100000e+04,0,0.000000e+00
1,p100048,2,60,60,0,0.000000,22000,11000,11000,1.100000e+04,0,0.000000e+00
2,p100074,3,63,66,3,1.527525,8646200,17100,5509600,2.882067e+06,5492500,2.753937e+06
3,p100084,2,49,49,0,0.000000,1488400,744200,744200,7.442000e+05,0,0.000000e+00
4,p100107,2,44,44,0,0.000000,2,1,1,1.000000e+00,0,0.000000e+00
5,p100129,1,33,33,0,,118900,118900,118900,1.189000e+05,0,
6,p100165,1,63,63,0,,51100,51100,51100,5.110000e+04,0,
7,p100186,1,67,67,0,,26700,26700,26700,2.670000e+04,0,
8,p100253,11,35,67,32,9.400193,3002800,0,3000000,2.729818e+05,3000000,9.044500e+05
9,p100270,1,63,63,0,,5361700,5361700,5361700,5.361700e+06,0,


In [64]:
project_feat = get_project_feature(project)

In [65]:
project_feat

Unnamed: 0,EID,pro_count,pro_date_min,pro_date_max,pro_date_ptp,pro_date_std,pro_nothome_num,pro_home_num
0,p100098,1,67,67,0,,,1.0
1,p100187,1,68,68,0,,,1.0
2,p100314,2,63,66,3,2.121320,,2.0
3,p100435,1,59,59,0,,,1.0
4,p100450,6,50,68,18,5.958188,,6.0
5,p100538,11,46,67,21,6.772941,,11.0
6,p100548,1,63,63,0,,,1.0
7,p100588,1,55,55,0,,,1.0
8,p100847,1,61,61,0,,,1.0
9,p100916,4,45,59,14,6.683313,,4.0


In [66]:
qualification_feat = get_qualification_feature(qualification)

In [67]:
qualification_feat

Unnamed: 0,EID,qua_count,qua_type_1,qua_type_2,qua_type_3,qua_begindate_min,qua_begindate_max,qua_begindate_ptp,qua_begindate_std,qua_expirydate_min,qua_expirydate_max,qua_expirydate_ptp,qua_expirydate_std,qua_begin_expiry_gap_min,qua_begin_expiry_gap_max,qua_begin_expiry_gap_ptp,qua_begin_expiry_gap_mean,qua_begin_expiry_gap_std
0,p100533,1,1.0,,,52,52,0,,127.0,127.0,0.0,,75.0,75.0,0.0,75.000000,
1,p100847,1,1.0,,,-17,-17,0,,52.0,52.0,0.0,,69.0,69.0,0.0,69.000000,
2,p10086,1,1.0,,,31,31,0,,100.0,100.0,0.0,,69.0,69.0,0.0,69.000000,
3,p100916,4,,1.0,3.0,-7,41,48,20.188693,42.0,89.0,47.0,23.500000,26.0,49.0,23.0,39.500000,11.030261
4,p101264,1,,1.0,,58,58,0,,93.0,93.0,0.0,,35.0,35.0,0.0,35.000000,
5,p101379,2,2.0,,,-14,55,69,48.790368,49.0,124.0,75.0,53.033009,63.0,69.0,6.0,66.000000,4.242641
6,p101513,1,1.0,,,55,55,0,,124.0,124.0,0.0,,69.0,69.0,0.0,69.000000,
7,p101669,1,1.0,,,22,22,0,,85.0,85.0,0.0,,63.0,63.0,0.0,63.000000,
8,p101805,3,1.0,2.0,,11,47,36,18.036999,83.0,100.0,17.0,12.020815,36.0,69.0,33.0,52.500000,23.334524
9,p101903,2,2.0,,,-26,46,72,50.911688,37.0,109.0,72.0,50.911688,63.0,63.0,0.0,63.000000,0.000000


In [68]:
breakfaith_feat = get_breakfaith_feature(breakfaith)

In [70]:
breakfaith_feat

Unnamed: 0,EID,bre_count,bre_date_min,bre_date_max,bre_date_ptp,bre_date_std,bre_enddate_min,bre_enddate_max,bre_enddate_ptp,bre_enddate_std,bre_begin_end_gap_min,bre_begin_end_gap_max,bre_begin_end_gap_ptp,bre_begin_end_gap_mean,bre_begin_end_gap_std
0,p100048,1,65,65,0,,66.0,66.0,0.0,,1.0,1.0,0.0,1.0,
1,p100344,8,65,67,2,0.886405,,,,,,,,,
2,p100347,10,62,66,4,2.065591,,,,,,,,,
3,p100667,3,67,67,0,0.000000,,,,,,,,,
4,p101171,2,65,65,0,0.000000,,,,,,,,,
5,p101362,2,65,65,0,0.000000,,,,,,,,,
6,p101672,2,64,64,0,0.000000,,,,,,,,,
7,p101996,2,65,65,0,0.000000,,,,,,,,,
8,p102778,1,68,68,0,,,,,,,,,,
9,p102837,2,53,53,0,0.000000,,,,,,,,,


In [71]:
dataset = pd.merge(entbase_feat, alter_feat, on='EID', how='left')
dataset = pd.merge(dataset, right_feature, on='EID', how='left')
dataset = pd.merge(dataset, recruit_feat, on='EID', how='left')
dataset = pd.merge(dataset, branch_feat, on='EID', how='left')
dataset = pd.merge(dataset, invest_feat, on='EID', how='left')
dataset = pd.merge(dataset, lawsuit_feat, on='EID', how='left')
dataset = pd.merge(dataset, project_feat, on='EID', how='left')
dataset = pd.merge(dataset, qualification_feat, on='EID', how='left')
dataset = pd.merge(dataset, breakfaith_feat, on='EID', how='left')

In [72]:
dataset['alt_count/rgyear'] = dataset['alt_count'] / dataset['RGYEAR']
dataset['rig_count/rgyear'] = dataset['rig_count'] / dataset['RGYEAR']
dataset['rec_count/rgyear'] = dataset['rec_count'] / dataset['RGYEAR']
dataset['bra_count/rgyear'] = dataset['bra_count'] / dataset['RGYEAR']
dataset['inv_count/rgyear'] = dataset['inv_count'] / dataset['RGYEAR']
dataset['law_count/rgyear'] = dataset['law_count'] / dataset['RGYEAR']
dataset['pro_count/rgyear'] = dataset['pro_count'] / dataset['RGYEAR']
dataset['qua_count/rgyear'] = dataset['qua_count'] / dataset['RGYEAR']
dataset['bre_count/rgyear'] = dataset['bre_count'] / dataset['RGYEAR']
dataset['alt_num(1year)/rgyear'] = dataset['alt_num(1year)'] / dataset['RGYEAR']
dataset['alt_num(2year)/rgyear'] = dataset['alt_num(2year)'] / dataset['RGYEAR']
dataset['alt_num(3year)/rgyear'] = dataset['alt_num(3year)'] / dataset['RGYEAR']

In [75]:
dataset['MPNUM_CLASS'] = dataset['INUM'].apply(lambda x : x if x <= 4 else 5)
dataset['FSTINUM_CLASS'] = dataset['FSTINUM'].apply(lambda x : x if x <= 6 else 7)
dataset.fillna(value={'alt_count': 0, 'rig_count': 0}, inplace=True)
for column in ['MPNUM', 'INUM', 'FINZB', 'FSTINUM', 'TZINUM', 'ENUM', 'ZCZB', 'allnum', 'RGYEAR', 'alt_count', 'rig_count']:
    groupby_list = [['HY'], ['ETYPE'], ['HY', 'ETYPE'], ['HY', 'PROV'], ['ETYPE', 'PROV'], ['MPNUM_CLASS'], ['FSTINUM_CLASS']]
    for groupby in groupby_list:
        if 'MPNUM_CLASS' in groupby and column == 'MPNUM':
            continue
        if 'FSTINUM_CLASS' in groupby and column == 'FSTINUM':
            continue
        groupby_keylist = []
        for key in groupby:
            groupby_keylist.append(dataset[key])
        tmp = dataset[column].groupby(groupby_keylist).agg([sum, min, max, np.mean]).reset_index()
        tmp = pd.merge(dataset, tmp, on=groupby, how='left')
        dataset['ent_' + column.lower() + '-mean_gb_' + '_'.join(groupby).lower()] = dataset[column] - tmp['mean']
        dataset['ent_' + column.lower() + '-min_gb_' + '_'.join(groupby).lower()] = dataset[column] - tmp['min']
        dataset['ent_' + column.lower() + '-max_gb_' + '_'.join(groupby).lower()] = dataset[column] - tmp['max']
        dataset['ent_' + column.lower() + '/sum_gb_' + '_'.join(groupby).lower()] = dataset[column] / tmp['sum']
dataset.drop(['MPNUM_CLASS', 'FSTINUM_CLASS'], axis=1, inplace=True)

In [76]:
dataset = dataset.join(pd.get_dummies(dataset['PROV'], prefix='prov'))
dataset = dataset.join(pd.get_dummies(dataset['HY'], prefix='hy').mul(dataset['ZCZB'], axis=0))
dataset = dataset.join(pd.get_dummies(dataset['ETYPE'], prefix='etype').mul(dataset['RGYEAR'], axis=0))
dataset.drop(['PROV', 'HY', 'ETYPE'], axis=1, inplace=True)

In [77]:
trainset = pd.merge(train, dataset, on='EID', how='left')
testset = pd.merge(test, dataset, on='EID', how='left')

In [79]:
train_feature = trainset.drop(['TARGET', 'ENDDATE'], axis=1)
train_label = trainset.TARGET.values
test_feature = testset
test_index = testset.EID.values
print(train_feature.shape, train_label.shape, test_feature.shape)

(218264, 631) (218264,) (218247, 631)


In [80]:
train_feature

Unnamed: 0,EID,RGYEAR,ZCZB,MPNUM,INUM,ENUM,FINZB,FSTINUM,TZINUM,allnum,...,etype_2,etype_3,etype_4,etype_5,etype_6,etype_7,etype_8,etype_13,etype_16,etype_17
0,p1,2013,10000.0,6.0,3.0,3.0,15000.0000,7.0,1.0,19.0,...,0,0,0,0,2013,0,0,0,0,0
1,p100000,2012,10.0,0.0,1.0,0.0,0.0000,0.0,0.0,1.0,...,0,0,0,0,0,2012,0,0,0,0
2,p100002,2010,10.0,0.0,2.0,0.0,0.0000,0.0,0.0,2.0,...,0,0,0,0,0,2010,0,0,0,0
3,p100004,2013,100.0,1.0,2.0,0.0,0.0000,0.0,0.0,4.0,...,0,0,0,0,0,2013,0,0,0,0
4,p100005,2009,3.0,1.0,2.0,0.0,0.0000,1.0,0.0,4.0,...,0,0,0,0,0,2009,0,0,0,0
5,p100007,2011,10.0,0.0,2.0,0.0,0.0000,0.0,0.0,2.0,...,0,0,0,0,0,2011,0,0,0,0
6,p100008,2013,10.0,1.0,2.0,0.0,0.0000,1.0,0.0,4.0,...,0,0,0,0,0,2013,0,0,0,0
7,p100012,2009,100.0,0.0,1.0,0.0,0.0000,0.0,0.0,1.0,...,0,0,0,0,0,2009,0,0,0,0
8,p100015,2015,50.0,0.0,1.0,0.0,0.0000,0.0,0.0,1.0,...,0,0,0,0,0,2015,0,0,0,0
9,p100016,2009,120.0,0.0,2.0,0.0,0.0000,0.0,0.0,2.0,...,0,0,0,0,0,2009,0,0,0,0


In [94]:
train_feature

Unnamed: 0,EID,RGYEAR,ZCZB,MPNUM,INUM,ENUM,FINZB,FSTINUM,TZINUM,allnum,...,etype_2,etype_3,etype_4,etype_5,etype_6,etype_7,etype_8,etype_13,etype_16,etype_17
0,1,2013,10000.0,6.0,3.0,3.0,15000.0000,7.0,1.0,19.0,...,0,0,0,0,2013,0,0,0,0,0
1,100000,2012,10.0,0.0,1.0,0.0,0.0000,0.0,0.0,1.0,...,0,0,0,0,0,2012,0,0,0,0
2,100002,2010,10.0,0.0,2.0,0.0,0.0000,0.0,0.0,2.0,...,0,0,0,0,0,2010,0,0,0,0
3,100004,2013,100.0,1.0,2.0,0.0,0.0000,0.0,0.0,4.0,...,0,0,0,0,0,2013,0,0,0,0
4,100005,2009,3.0,1.0,2.0,0.0,0.0000,1.0,0.0,4.0,...,0,0,0,0,0,2009,0,0,0,0
5,100007,2011,10.0,0.0,2.0,0.0,0.0000,0.0,0.0,2.0,...,0,0,0,0,0,2011,0,0,0,0
6,100008,2013,10.0,1.0,2.0,0.0,0.0000,1.0,0.0,4.0,...,0,0,0,0,0,2013,0,0,0,0
7,100012,2009,100.0,0.0,1.0,0.0,0.0000,0.0,0.0,1.0,...,0,0,0,0,0,2009,0,0,0,0
8,100015,2015,50.0,0.0,1.0,0.0,0.0000,0.0,0.0,1.0,...,0,0,0,0,0,2015,0,0,0,0
9,100016,2009,120.0,0.0,2.0,0.0,0.0000,0.0,0.0,2.0,...,0,0,0,0,0,2009,0,0,0,0


In [95]:
config = {
    'rounds': 10000,
    'folds': 5
}
params = {
    'booster': 'gbtree',
    'objective':'binary:logistic',
    'stratified':True,
    'scale_pos_weights': 0,
    'max_depth': 10,
    'min_child_weight': 15,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'eta': 0.01,
    'seed':42,
    'silent':1,
    'eval_metric':'auc'
}

In [96]:
def xgb_cv(train_feature, train_label, params, folds, rounds):
    start = time.clock()
    print(train_feature.columns)
    params['scale_pos_weights'] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    num_round = rounds
    print('run cv: ' + 'round: ' + str(rounds))
    res = xgb.cv(params, dtrain, num_round, nfold=folds, verbose_eval=10, early_stopping_rounds=100)
    elapsed = (time.clock() - start)
    print('Time used: ', elapsed, 's')
    return len(res), res.loc[len(res)-1, 'test-auc-mean']
    
    

In [97]:
def xgb_predict(train_feature, train_label, test_feature, rounds, params):
    params['scale_pos_weights'] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    dtest = xgb.DMatrix(test_feature, label=np.zeros(test_feature.shape[0]))
    watchlist = [(dtrain, 'train')]
    num_round = rounds
    model = xgb.train(params, dtrain, num_round, watchlist, verbose_eval=30)
    predict = model.predict(dtest)
    return model, predict


In [98]:
def store_result(test_index, pred, threshold, name):
    result = pd.DataFrame({'EID': test_index, 'FORTARGET': 0, 'PROB': pred})
    mask = result['PROB'] >= threshold
    result.at[mask, 'FORTARGET'] = 1
    result.to_csv('../data/output/sub/' + name + '.csv', index=0)
    return result

In [None]:
iterations, best_score = xgb_cv(train_feature, train_label, params, config['folds'], config['rounds'])

In [None]:
model, pred = xgb_predict(train_feature, train_label, test_feature, iterations, params)

In [None]:
importance = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
importance.to_csv('../data/output/feat_imp/importance-1207-%f(r%d).csv' % (best_score, iterations), index=False)
res = store_result(test_index, pred, 0.18, '1207-xgb-%f(r%d)' % (best_score, iterations))