In [36]:
import time
import sys
import os
import re
import gc
import datetime
import itertools
import pickle
import random
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from tqdm import *
from scipy import stats
import warnings
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold,train_test_split,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,MinMaxScaler,StandardScaler
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, f1_score, log_loss,roc_auc_score,recall_score, precision_score
import seaborn as sns
color = sns.color_palette() 
sns.set(style="whitegrid")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
%matplotlib inline

In [37]:
# pip install lightgbm==2.1.1
# Show the LightGBM version this notebook was run with.
lgb.__version__

'2.1.1'

In [38]:
def get_path(path):
    """Return the unique sample ids found in a directory.

    Every entry name returned by ``os.listdir(path)`` is cut at its first
    underscore and only the leading part is kept, so ``abc_B.csv`` and
    ``abc_F.csv`` both contribute the single id ``abc``.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    list of str
        The unique prefixes in ascending order.  The result is sorted because
        the iteration order of a ``set`` of strings is not stable between
        interpreter runs, and the callers in this notebook turn this list
        directly into data-frame row order.
    """
    return sorted({entry.split('_')[0] for entry in os.listdir(path)})
def lgb_recall_score(y_true, pred,t):
    """Recall of the positive class after cutting scores at ``t``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    pred : numpy.ndarray or pandas.Series
        Predicted scores; entries ``>= t`` become label 1, the rest label 0.
    t : float
        Decision threshold.

    Returns
    -------
    float
        ``recall_score(y_true, labels, average='binary')``.

    Notes
    -----
    ``pred`` is not modified.  The labels are built in a single comparison, so
    a threshold above 1 no longer turns freshly assigned 1s back into 0s.
    """
    hard_labels = (pred >= t).astype(int)
    return recall_score(y_true, hard_labels, average='binary')
def lgb_precision_score(y_true, pred, t):
    """Precision of class 0 after cutting scores at ``t``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    pred : numpy.ndarray or pandas.Series
        Predicted scores; entries ``>= t`` become label 1, the rest label 0.
    t : float
        Decision threshold.

    Returns
    -------
    float
        ``precision_score(y_true, labels, pos_label=0, average='binary')``,
        i.e. the precision is measured on the samples labelled 0.
        NOTE(review): ``pos_label=0`` is kept from the original; confirm that
        scoring the negative class is what is wanted.

    Notes
    -----
    ``pred`` is not modified.
    """
    hard_labels = (pred >= t).astype(int)
    return precision_score(y_true, hard_labels, pos_label=0, average='binary')
def auc(y,pred):
    """Return the ROC AUC of the scores ``pred`` against the labels ``y``."""
    return roc_auc_score(y_true=y, y_score=pred)

def f1(y,pred):
    """Return the macro-averaged F1 score of the labels ``pred`` against ``y``."""
    return f1_score(y_true=y, y_pred=pred, average='macro')
def lgb_f1_score(y_pred, y_val):
    """Macro F1 at a fixed 0.1 cut-off, returned as a (name, value, flag) triple.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predicted scores; entries ``>= 0.1`` become label 1, the rest label 0.
    y_val : object with ``get_label()``
        Source of the ground-truth labels (presumably a ``lgb.Dataset`` when
        this is used as a LightGBM ``feval`` -- TODO confirm).

    Returns
    -------
    tuple
        ``('f1', macro_f1, True)``.  The trailing ``True`` is presumably the
        "higher is better" flag of LightGBM's custom-metric protocol.

    Notes
    -----
    ``y_pred`` is not modified; the original overwrote the array it was given.
    """
    y_true = y_val.get_label()
    hard_labels = (y_pred >= 0.1).astype(int)
    return 'f1', f1_score(y_true, hard_labels, average='macro'), True

In [39]:
# Input directories for positive / negative training samples and the test set.
train_po_path, train_ne_path, test_path = (
    '../data/Motor_tain/Positive/',
    '../data/Motor_tain/Negative/',
    '../data/Motor_testP/',
)
# Unique sample ids found in each directory, listed in the same order as above.
train_po_file, train_ne_file, test_file_name = [
    get_path(folder) for folder in (train_po_path, train_ne_path, test_path)
]

In [40]:
"""基础特征"""
def get_data(file_name,file_path):
    """Build one row of summary statistics per sample.

    For every id in ``file_name`` the files ``<id>_B.csv`` and ``<id>_F.csv``
    are read from ``file_path``.  For the columns ``ai1`` and ``ai2`` of each
    file seven statistics are computed: max, min, mean, median, mode (the
    first entry of ``value_counts()``), std and kurtosis.

    Parameters
    ----------
    file_name : iterable of str
        Sample ids (file-name prefixes).
    file_path : str
        Directory holding the CSV files; a trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        Column ``idx`` followed by ``<channel>_<stat>_<b|f>`` columns, ordered
        B-file before F-file, ``ai1`` before ``ai2``, and statistics in the
        order listed above.
    """
    # (column-name fragment, reducer) pairs, in output column order.
    reducers = (
        ('max', lambda s: s.max()),
        ('min', lambda s: s.min()),
        ('mean', lambda s: s.mean()),
        ('median', lambda s: s.median()),
        ('mode', lambda s: s.value_counts().index[0]),
        ('std', lambda s: s.std()),
        ('kurt', lambda s: s.kurt()),
    )
    fe = defaultdict(list)
    for file in tqdm(file_name):
        # Both files are read before any statistic is computed, as before.
        frames = (
            ('b', pd.read_csv(os.path.join(file_path, file + '_B.csv'))),
            ('f', pd.read_csv(os.path.join(file_path, file + '_F.csv'))),
        )
        fe['idx'].append(file)
        for suffix, frame in frames:
            for channel in ('ai1', 'ai2'):
                series = frame[channel]
                for stat_name, reducer in reducers:
                    key = '{}_{}_{}'.format(channel, stat_name, suffix)
                    fe[key].append(reducer(series))
    return pd.DataFrame(fe)
# Negative samples get label 0, positive samples label 1.
train = get_data(train_ne_file,train_ne_path)
train['result'] = 0
train_po = get_data(train_po_file,train_po_path)
train_po['result'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# stacked frame on both old and new pandas versions.
train = pd.concat([train, train_po]).reset_index(drop=True)
# NOTE(review): later cells read '../data/train_jichu.csv' (no "1"), which is
# not the file written here -- confirm which cache is intended.
train.to_csv('../data/train_jichu1.csv', index=False)
test = get_data(test_file_name,test_path)
test.to_csv('../data/test_jichu1.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:03<00:00,  7.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:03<00:00,  7.61it/s]
 33%|█████████████████████████▉                                                    | 1904/5738 [05:54<13:05,  4.88it/s]

KeyboardInterrupt: 

In [47]:
# Reload the cached per-sample statistics.
# NOTE(review): the train file read here is 'train_jichu.csv', while the
# extraction cell writes 'train_jichu1.csv' -- confirm this is intentional.
train = pd.read_csv('../data/train_jichu.csv')
test = pd.read_csv('../data/test_jichu1.csv')

# Derived features, added in place to both frames.
for df in (train, test):
    # Ratio of the ai1 and ai2 standard deviations, per file type.
    for side in ('b', 'f'):
        df['std_' + side] = df['ai1_std_' + side] / df['ai2_std_' + side]
    # max / min quotient per channel and file type (named "ptp", but it is a
    # ratio, not a difference).
    for side in ('b', 'f'):
        for channel in ('ai1', 'ai2'):
            df[channel + '_ptp_' + side] = (
                df[channel + '_max_' + side] / df[channel + '_min_' + side]
            )
    # Quotient and product of the two B-file ratios.
    df['ratio_b'] = df['ai1_ptp_b'] / df['ai2_ptp_b']
    df['area_b'] = df['ai1_ptp_b'] * df['ai2_ptp_b']
    # Disabled experiments, kept for reference:
#     df['std_ai1_bf'] = df['ai1_std_b'] / df['ai1_std_f']
#     df['std_ai2_bf'] = df['ai2_std_b'] / df['ai2_std_f']
#     df['ratio_f'] = df['ai1_ptp_f'] / df['ai2_ptp_f']
#     df['area_f'] = df['ai1_ptp_f'] * df['ai2_ptp_f']
#     """std/median"""

In [12]:
# Number of positive samples among rows whose ai1_median_b is below its 30% quantile.
below_q30 = train['ai1_median_b'] < train['ai1_median_b'].quantile(.3)
train.loc[below_q30, 'result'].sum()

0

In [34]:
# Ids of test samples whose B-file medians are below the test median on both channels.
low_ai1_median = test['ai1_median_b'] < test['ai1_median_b'].quantile(.5)
low_ai2_median = test['ai2_median_b'] < test['ai2_median_b'].quantile(.5)
cc = test.loc[low_ai1_median & low_ai2_median, 'idx'].values

In [35]:
# sub = pd.read_csv('../S1_best_score.csv')
# NOTE(review): with the loader above commented out, `sub` must already exist
# in the kernel; on a fresh top-to-bottom run it is only created further down.
# Sum of the `result` column over the submission rows whose id is in `cc`.
in_cc = sub['idx'].isin(cc)
sub.loc[in_cc, 'result'].sum()

1

In [8]:
# Test rows with a high ai1 F-file max/min ratio OR a below-median ai1 F-file mean.
# NOTE(review): the stored output under this cell is an array of zeros, which
# does not look like the result of this expression -- it may be stale.
high_ptp_f = test['ai1_ptp_f'] > test['ai1_ptp_f'].quantile(0.65)
low_mean_f = test['ai1_mean_f'] < test['ai1_mean_f'].quantile(0.5)
test[high_ptp_f | low_mean_f]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [73]:
# Display the column index of `train`: the raw statistics, the label and the derived ratio features.
train.columns

Index(['idx', 'ai1_max_b', 'ai1_min_b', 'ai1_mean_b', 'ai1_median_b',
       'ai1_mode_b', 'ai1_std_b', 'ai1_kurt_b', 'ai2_max_b', 'ai2_min_b',
       'ai2_mean_b', 'ai2_median_b', 'ai2_mode_b', 'ai2_std_b', 'ai2_kurt_b',
       'ai1_max_f', 'ai1_min_f', 'ai1_mean_f', 'ai1_median_f', 'ai1_mode_f',
       'ai1_std_f', 'ai1_kurt_f', 'ai2_max_f', 'ai2_min_f', 'ai2_mean_f',
       'ai2_median_f', 'ai2_mode_f', 'ai2_std_f', 'ai2_kurt_f', 'result',
       'std_b', 'std_f', 'ai1_ptp_b', 'ai2_ptp_b', 'ai1_ptp_f', 'ai2_ptp_f',
       'ratio_b', 'area_b'],
      dtype='object')

In [74]:
# Disabled: correlation heat map of the training features.
# plt.figure(figsize=(12,12))
# corr = train.corr() # correlation matrix (Pearson correlation coefficients)
# sns.heatmap(corr, xticklabels = corr.columns.values, 
#             yticklabels = corr.columns.values,annot=True) # draw the heat map
# plt.show() # plt.show()

In [48]:
# Earlier feature selections, kept for reference:
# col_fe = [i for i in train.columns if i not in ['idx', 'result']+['ai2_std_b']]
# col_fe = col+col2+col3+col1

# Every column except the sample id and the label is used as a feature.
non_feature_cols = ('idx', 'result')
col_fe = [name for name in train.columns if name not in non_feature_cols]
X_train, y_train = train[col_fe].copy(), train['result'].copy()
X_test = test[col_fe].copy()

In [76]:
# scale = MinMaxScaler()
# scale.fit(X_train)
# X_train = pd.DataFrame(scale.transform(X_train),columns=X_train.columns)
# X_test = pd.DataFrame(scale.transform(X_test),columns=X_test.columns)

In [49]:
# Cross-validation setup: K stratified, shuffled folds with a fixed seed.
K = 5
seed = 996# other seeds noted by the author: 2019, 666
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
# LightGBM parameters passed to lgb.train in the CV cell.
# The trailing comments list other values noted by the author (presumably
# earlier trials -- TODO confirm); commented-out keys are disabled options.
lgb_params = {
                        'boosting_type': 'gbdt',
                        'objective': 'binary',
                        'metric': 'binary_logloss',# binary_logloss
                        'is_unbalance': True,
                        'max_depth': 5,# also noted: 3
                        'bagging_freq': 2,# also noted: 5, 3, 2
#                         'bagging_seed':1,
#                         'lambda_l2': 2,
                        'lambda_l1': 0.6,# also noted: 1, 0.2
                        'subsample': 0.68,# also noted: 0.7, 0.8
                        'colsample_bytree': 0.58,# also noted: 0.5, 0.7
                        'learning_rate': 0.08,# also noted: 0.02, 0.1
                        'seed': 2017,
                        'nthread': 6,
#                         'silent': True
             }
# Run notes kept as bare strings; the last one is echoed as the cell output.
# They look like "<parameter values>:<cut-off>" records -- TODO confirm format.
"""5,2,0.6,0.7,0.58,0.1,2017,6:4200"""
"""5,2,0.6,0.68,0.58,0.08,2017,6:4200,+9"""
"""5,2,0.6,0.85,0.08,2017,6:4400，+7"""
"""5,3,1,0.6,0.9,0.08,2017,6:4400，+8"""

'5,3,1,0.6,0.9,0.08,2017,6:4400，+8'

In [50]:
%%time
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
feature_importance_df = pd.DataFrame()
score = []
prec_score = []
min_p = []
for i, (train_index, val_index) in enumerate(skf.split(X_train,y_train)):
    print("fold {}".format(i+1))
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    lgb_train = lgb.Dataset(X_tr,y_tr)
    lgb_val = lgb.Dataset(X_val,y_val)
    num_round = 56#56
    clf = lgb.train(lgb_params, lgb_train, num_round, valid_sets = [lgb_train, lgb_val],
                     verbose_eval=250, early_stopping_rounds = 10#10
                   )
    oof[val_index] = clf.predict(X_val, num_iteration=clf.best_iteration)
    print('best iteration = ',clf.best_iteration)
    
#     clf = AdaBoostClassifier(n_estimators=10, learning_rate=0.1, random_state=10)
#     clf = AdaBoostClassifier(n_estimators=20, learning_rate=0.1, random_state=10)
#     clf.fit(X_tr,y_tr)
#     oof[val_index] = clf.predict_proba(X_val)[:, 1]
    
#     r_score = lgb_recall_score(y_val.values, clf.predict_proba(X_val)[:, 1],0.0045)#0.23
#     p_score = lgb_precision_score(y_val.values, clf.predict_proba(X_val)[:, 1],0.0045)
    r_score = lgb_recall_score(y_val.values, oof[val_index],0.02312)#,0.23,0.0045,0.023,0.02312
    p_score = lgb_precision_score(y_val.values, oof[val_index],0.02312)
    score.append(r_score)
    prec_score.append(p_score)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = clf.feature_name() #col_fe
    fold_importance_df["importance"] = clf.feature_importance() #clf.feature_importances_
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     predictions += clf.predict_proba(X_test)[:, 1] / skf.n_splits
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / skf.n_splits
print('recall_score : ', score)
print('precision_score : ', prec_score)

fold 1
Training until validation scores don't improve for 10 rounds.
Did not meet early stopping. Best iteration is:
[56]	training's binary_logloss: 0.0302382	valid_1's binary_logloss: 0.0553004
best iteration =  56
fold 2
Training until validation scores don't improve for 10 rounds.
Did not meet early stopping. Best iteration is:
[56]	training's binary_logloss: 0.0353457	valid_1's binary_logloss: 0.0590651
best iteration =  56
fold 3
Training until validation scores don't improve for 10 rounds.
Did not meet early stopping. Best iteration is:
[56]	training's binary_logloss: 0.0230814	valid_1's binary_logloss: 0.0964808
best iteration =  56
fold 4
Training until validation scores don't improve for 10 rounds.
Did not meet early stopping. Best iteration is:
[56]	training's binary_logloss: 0.033219	valid_1's binary_logloss: 0.0362833
best iteration =  56
fold 5
Training until validation scores don't improve for 10 rounds.
Did not meet early stopping. Best iteration is:
[56]	training's bina

In [51]:
"""lgbb_4200,lgb_"""
# Rank the test samples by averaged score: the n_negative lowest-scored rows
# are labelled 0, all remaining rows are labelled 1.
sub = test[['idx']].copy()
sub['result'] = predictions
sub = sub.sort_values('result').reset_index(drop=True)
# Author's notes, "<cut-off>:<value>" pairs (presumably scores -- TODO confirm):
#lgb  2000:0.3626,3000:0.5439,3200:0.5799,3500:0.6343,3750:0
n_negative = 4200
# After reset_index the labels equal the positions, so one default plus one
# open-ended slice replaces the two overlapping label slices (`:4200` is
# end-inclusive) without changing the outcome.
sub['result'] = 0
sub.loc[n_negative:,'result'] = 1
sub['result'] = sub['result'].astype(int)
# sub.to_csv('../sub/jichu_lgb4200.csv', index=False)
print(sub['result'].value_counts())
# Compare with a reference submission: how many ids labelled 0 here are not
# labelled 0 in '../sub/aaa.csv'.
rr = pd.read_csv('../sub/aaa.csv')
rr3 = rr[rr['result'] == 0]['idx'].tolist()
rr3_lookup = set(rr3)  # set membership instead of scanning the list per id
print(len([i for i in sub[sub['result'] == 0]['idx'].tolist() if i not in rr3_lookup]))
len(rr3)

0    4200
1    1538
Name: result, dtype: int64
205


4100

In [52]:
# How many ids labelled 1 in the reference submission are labelled 0 in `sub`.
catjj = pd.read_csv('../sub/S1_best_score.csv')
cat3 = catjj.loc[catjj['result'] == 1, 'idx'].tolist()
cc = sub.loc[sub['idx'].isin(cat3), 'result']
cc.loc[cc == 0].count()

0

In [19]:
# Mean importance of each feature over the folds, largest first.
mean_importance = feature_importance_df.groupby('Feature')['importance'].mean()
mean_importance.sort_values(ascending=False).reset_index()

Unnamed: 0,Feature,importance
0,ai2_mean_f,18.8
1,ai2_mean_b,15.8
2,ai1_max_f,10.8
3,ai1_ptp_f,10.2
4,ai1_kurt_f,8.0
5,ai1_mean_b,5.6
6,std_b,5.6
7,ai2_median_b,5.6
8,ai2_median_f,5.4
9,ai1_min_f,4.2


In [1031]:
# train['std_b'].plot.kde()
# test['std_b'].plot.kde()

In [52]:
# Hand-picked feature subset for the AdaBoost model (order is significant:
# it defines the column order of X_train / X_test).
col_fe = [
    'ai1_kurt_f',
    'ai1_ptp_f',
    'ai1_kurt_b',
    'ai2_mean_f',
    'ai1_mean_b',
    'ai1_max_f',
    'ai2_kurt_b',
    'ai2_std_f',
    'ai2_std_b',
    'ai1_min_f',
    'ai1_std_b',
    'ai1_mean_f',
    'std_f',
    'ai2_mode_b',
]
X_train, y_train = train[col_fe].copy(), train['result'].copy()
X_test = test[col_fe].copy()

In [53]:
"""1226,18,0.1:4000"""
"""222,21,0.1"""
# Stratified K-fold training of AdaBoost on the hand-picked feature subset.
K = 5
seed = 1226
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
score = []
prec_score = []
min_p = []
# Per-fold importance frames, concatenated once after the loop.
fold_importances = []
for i, (train_index, val_index) in enumerate(skf.split(X_train,y_train)):
    # 1-based label, matching the "fold" column stored below.
    print("fold {}".format(i+1))
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # random_state pins the estimator so repeated runs give the same model.
    clf = AdaBoostClassifier(n_estimators=23, learning_rate=0.1, random_state=seed)
    clf.fit(X_tr,y_tr)
    # Predict the validation split once and reuse the scores.
    val_proba = clf.predict_proba(X_val)[:, 1]
    oof[val_index] = val_proba
    # NOTE(review): the recorded recall is 1.0 in every fold with this 0.0045
    # threshold -- check that it is meaningful for AdaBoost probabilities.
    # A copy is passed so `val_proba` survives a helper that writes in place.
    r_score = lgb_recall_score(y_val.values, val_proba.copy(),0.0045)#0.23
#     p_score = lgb_precision_score(y_val.values, clf.predict_proba(X_val)[:, 1],0.0045)
    score.append(r_score)
#     prec_score.append(p_score)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = col_fe
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = i + 1
    fold_importances.append(fold_importance_df)
    # Average over folds (the original summed the fold predictions; the
    # ranking used downstream is unaffected, but the values are now
    # probabilities again).
    predictions += clf.predict_proba(X_test)[:, 1] / skf.n_splits
feature_importance_df = pd.concat(fold_importances, axis=0)
print('recall_score : ', score)
# print('precision_score : ', prec_score)

fold 0
fold 1
fold 2
fold 3
fold 4
recall_score :  [1.0, 1.0, 1.0, 1.0, 1.0]


In [56]:
"""lgbb_4200,lgb_"""
# Rank the test samples by score: the 4200 lowest-scored rows get label 0,
# every later row gets label 1.
ranked = test[['idx']].copy()
ranked['result'] = predictions
sub = ranked.sort_values('result').reset_index(drop=True)
# Author's notes, "<cut-off>:<value>" pairs (presumably scores -- TODO confirm):
#lgb  2000:0.3626,3000:0.5439,3200:0.5799,3500:0.6343,3750:0
sub['result'] = 0
sub.loc[4200:,'result'] = 1
sub['result'] = sub['result'].astype(int)
# sub.to_csv('../sub/jichu_ada4200.csv', index=False)
print(sub['result'].value_counts())

0    4200
1    1538
Name: result, dtype: int64


In [57]:
# How many ids labelled 1 in the reference submission are labelled 0 in `sub`.
catjj = pd.read_csv('../sub/5385.csv')
cat3 = catjj.loc[catjj['result'] == 1, 'idx'].tolist()
cc = sub.loc[sub['idx'].isin(cat3), 'result']
cc.loc[cc == 0].count()

2

In [101]:
# Mean importance of each feature over the folds, largest first.
mean_importance = feature_importance_df.groupby('Feature')['importance'].mean()
mean_importance.sort_values(ascending=False).reset_index()

Unnamed: 0,Feature,importance
0,ai1_kurt_f,0.321739
1,ai1_ptp_f,0.217391
2,ai1_kurt_b,0.147826
3,ai2_mean_f,0.095652
4,ai2_std_b,0.06087
5,ai2_kurt_b,0.034783
6,ai1_std_b,0.034783
7,ai1_mean_b,0.034783
8,ai1_max_f,0.026087
9,ai2_std_f,0.008696


In [102]:
import xgboost as xgb

In [103]:
# Reload the cached per-sample statistics for the XGBoost run.
# NOTE(review): these file names ('train_jichu.csv', 'test_jichu.csv') differ
# from the '*_jichu1.csv' files written by the extraction cell -- confirm.
train = pd.read_csv('../data/train_jichu.csv')
test = pd.read_csv('../data/test_jichu.csv')

# Derived features, added in place to both frames.
for df in (train, test):
    # Ratio of the ai1 and ai2 standard deviations, per file type.
    for side in ('b', 'f'):
        df['std_' + side] = df['ai1_std_' + side] / df['ai2_std_' + side]
    # max / min quotient per channel and file type (named "ptp", but it is a
    # ratio, not a difference).
    for side in ('b', 'f'):
        for channel in ('ai1', 'ai2'):
            df[channel + '_ptp_' + side] = (
                df[channel + '_max_' + side] / df[channel + '_min_' + side]
            )
    # Quotient and product of the two B-file ratios.
    df['ratio_b'] = df['ai1_ptp_b'] / df['ai2_ptp_b']
    df['area_b'] = df['ai1_ptp_b'] * df['ai2_ptp_b']
    # Disabled experiments, kept for reference:
#     df['std_ai1_bf'] = df['ai1_std_b'] / df['ai1_std_f']
#     df['std_ai2_bf'] = df['ai2_std_b'] / df['ai2_std_f']
#     df['ratio_f'] = df['ai1_ptp_f'] / df['ai2_ptp_f']
#     df['area_f'] = df['ai1_ptp_f'] * df['ai2_ptp_f']
In [58]:
# Fold configuration for the XGBoost run.
K = 5
seed = 211# other seed noted by the author: 2021
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)

# Every column except the sample id and the label is a feature.
col = [name for name in train.columns if name not in ('idx', 'result')]
X_train, X_test = train[col].copy(), test[col].copy()
y_train = train['result'].copy().astype(int)

In [59]:
%%time
xgb_pred_te_all = 0
xgb_auc_mean = 0
xgb_auc_mean2 = 0
f1 = []
score = []
prec_score = []
oof_xgb = np.zeros(len(X_train))
for i, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):
    
    y_tr, y_val = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()
    X_tr, X_val= X_train.iloc[train_index,:].copy(), X_train.iloc[test_index,:].copy()
    print( "\nFold ", i+1)

    xgb_tr = xgb.DMatrix(X_tr, y_tr)
    xgb_val = xgb.DMatrix(X_val, y_val)
    xgb_te = xgb.DMatrix(X_test)
    xgb_params = {
                      "objective": 'reg:logistic',
                      "booster" : "gbtree",
#                       "eta": 0.1,
                      "max_depth": 3,#6
                      "subsample": 0.8,#0.85
                      'eval_metric':'auc',#logloss
#                       "colsample_bytree": 0.7,#0.7
                      "colsample_bylevel":0.6,#0.8
                      'tree_method':'hist',#exact
                      'alpha':0.02,                         
                      "thread":6,
                      'silent':True,
                      "seed": 111
                  }
    watchlist = [(xgb_val, 'eval')]
    xgb_model =xgb.train(xgb_params,
                 xgb_tr,
                 num_boost_round = 60,#100
                 evals =watchlist,
                 verbose_eval=50,
                 early_stopping_rounds=60#30
                        )

    pred = xgb_model.predict(xgb_val, ntree_limit=xgb_model.best_ntree_limit)
    oof_xgb[test_index] = xgb_model.predict(xgb_val, ntree_limit=xgb_model.best_ntree_limit)
#     print( " auc_model = ", xgb_model.best_score )
    r_score = lgb_recall_score(y_val.values, xgb_model.predict(xgb_val, ntree_limit=xgb_model.best_ntree_limit),0.08)#0.23
#     p_score = lgb_precision_score(y_val.values, xgb_model.predict(xgb_val, ntree_limit=xgb_model.best_ntree_limit),0.08)
    score.append(r_score)
#     prec_score.append(p_score)
    pred_te = xgb_model.predict(xgb_te, ntree_limit=xgb_model.best_ntree_limit)
    xgb_pred_te_all = xgb_pred_te_all + pred_te / K
print('recall_score : ', score)
# print('precision_score : ', prec_score)


Fold  1
[01:06:42] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	eval-auc:0.821667
Will train until eval-auc hasn't improved in 60 rounds.
[50]	eval-auc:0.995
[59]	eval-auc:0.995

Fold  2
[01:06:42] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	eval-auc:0.891667
Will train until eval-auc hasn't improved in 60 rounds.
[50]	eval-auc:1
[59]	eval-auc:1

Fold  3
[01:06:43] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	eval-auc:0.878333
Will train until eval-auc hasn't improved in 60 rounds.
[50]	eval-auc:1
[59]	eval-auc:1

Fold  4
[01:06:43] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	eval-auc:0.911667
Will train until eval-auc hasn't improved in 60 rounds.
[50]	eval-auc:0.993333
[59]	eval-auc:0.993333

Fold  5
[01:06:43] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]

In [63]:
"""3640—3600"""
# Rank the test samples by averaged XGBoost score: the 3640 lowest-scored
# rows get label 0, every later row gets label 1.
ranked = test[['idx']].copy()
ranked['result'] = xgb_pred_te_all
# Most frequent raw scores (many test rows share exactly the same value).
print(ranked['result'].value_counts().head())
sub = ranked.sort_values('result').reset_index(drop=True)
sub['result'] = 0
sub.loc[3640:,'result'] = 1
sub['result'] = sub['result'].astype(int)
# sub.to_csv('../sub/jichu_xgb3600.csv', index=False)
# How many ids labelled 1 in the reference submission are labelled 0 here.
catjj = pd.read_csv('../sub/S1_best_score.csv')#cat,0.66
cat3 = catjj.loc[catjj['result'] == 1, 'idx'].tolist()
cc = sub.loc[sub['idx'].isin(cat3), 'result']
cc.loc[cc == 0].count()

0.042726    418
0.042696    281
0.043920    180
0.042749    159
0.043954    152
Name: result, dtype: int64


1

In [52]:
# xgb.__version__(0.72)
# fe_map  = xgb_model.get_fscore()
# feature = pd.DataFrame(list(xgb_model.get_fscore()))[0]
# values = pd.DataFrame(list(xgb_model.get_fscore()))[0].map(fe_map)
# fe_values = pd.DataFrame(feature)
# fe_values['value'] = values
# fe_values = fe_values.rename(columns={0:'feature'}).sort_values(by = 'value',ascending=False).reset_index(drop = True)
# fe_values