In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score,f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import gc

import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the competition files; the train and test CSVs are GBK-encoded.
train = pd.read_csv('./data/比赛训练集.csv', encoding='gbk')
test = pd.read_csv('./data/比赛测试集.csv', encoding='gbk')
sub = pd.read_csv('./data/提交示例.csv')
# Stack train and test so that both receive exactly the same preprocessing.
data = pd.concat([train, test]).reset_index(drop=True)


# ------- One-hot encode the categorical family-history column.
# Two spellings of the same category are merged before encoding.
data['糖尿病家族史'] = data['糖尿病家族史'].replace(
    '叔叔或者姑姑有一方患有糖尿病', '叔叔或姑姑有一方患有糖尿病')
df = pd.get_dummies(data['糖尿病家族史']).astype('int')
data = pd.concat([data, df], axis=1)

# ------- In these four features, turn values equal to 0 into np.nan.
for col in ('口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度', '体重指数'):
    data[col] = data[col].mask(data[col] == 0)

In [3]:
# Split the preprocessed frame back apart: rows that carry a target value are
# the training set, rows whose target is missing are the test set.
train = data.loc[data['患有糖尿病标识'].notnull()].reset_index(drop=True)
test = data.loc[data['患有糖尿病标识'].isnull()].reset_index(drop=True)
# Model inputs: every column except '编号', the un-encoded '糖尿病家族史'
# column and the target itself.
feas = [col for col in train.columns if col not in ('编号', '糖尿病家族史', '患有糖尿病标识')]

x_train, y_train = train[feas], train['患有糖尿病标识']
x_test = test[feas]

In [4]:
# Sanity check: (number of rows, number of feature columns) of the training matrix.
x_train.shape

(5070, 10)

In [5]:
THR = 0.4  # probability cut-off used when computing F1
folds = 10
seed = 2021


def lgb_model(train_x, train_y, test_x):
    """Train one LightGBM binary classifier per stratified fold.

    Reads the module-level settings ``folds`` (number of splits), ``seed``
    (shuffle seed of the splitter) and ``THR`` (cut-off for the F1 score).

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    train_y : array-like
        Binary labels, row-aligned with ``train_x``.
    test_x : pandas.DataFrame
        Features scored by every fold model.

    Returns
    -------
    oof_pred : numpy.ndarray
        Out-of-fold predicted probability for each row of ``train_x``.
    test_pred : numpy.ndarray
        Mean over the fold models of the predictions for ``test_x``.
    Feass : pandas.DataFrame
        Per-fold ``model.feature_importance()`` values, with columns
        ``'feas'`` (feature name) and ``'sorce'`` (importance).
    """
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # The splitter yields *positional* indices, so labels are indexed through a
    # plain array. Indexing a Series with ``train_y[idx]`` is label-based and
    # only works when its index happens to be 0..n-1.
    labels = np.asarray(train_y)
    feature_names = train_x.columns.tolist()

    # Local names deliberately differ from the notebook-level `train` / `test`
    # DataFrames to avoid shadowing them.
    oof_pred = np.zeros(train_x.shape[0])
    cv_scores = []
    f1_scores = []
    test_pre = []
    fold_importances = []

    # Identical for every fold, so it is built once.
    # NOTE(review): 'nthread' and 'n_jobs' both look like thread-count settings
    # and carry different values -- confirm which one LightGBM honours.
    # NOTE(review): 'device': 'gpu' presumably needs a GPU-enabled LightGBM build.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'tree_learner': 'serial',
        'metric': 'auc',
        'min_child_weight': 6,
        'num_leaves': 2 ** 6,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.01,
        'seed': 2021,
        'nthread': 28,
        'n_jobs': 4,
        'silent': True,
        'verbose': -1,
        'device': 'gpu'
    }

    for fold_no, (train_index, valid_index) in enumerate(kf.split(train_x, train_y), start=1):
        print('----------------------------------- {} -----------------------------------'.format(fold_no))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = lgb.Dataset(trn_x, label=trn_y)
        valid_matrix = lgb.Dataset(val_x, label=val_y)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are the legacy
        # keyword form; newer LightGBM releases expect callbacks instead --
        # confirm against the installed version before upgrading.
        model = lgb.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],
                          categorical_feature=[], verbose_eval=200, early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pre.append(model.predict(test_x, num_iteration=model.best_iteration))

        importance = model.feature_importance()
        fold_importances.append(pd.DataFrame({'feas': feature_names, 'sorce': importance}))
        print(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True)[:30])

        oof_pred[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        f1_scores.append(f1_score(val_y, (val_pred > THR).astype(int)))

        print(cv_scores)
        print(f1_scores)

    # Average over the models actually trained rather than the global `folds`.
    test_pred = sum(test_pre) / len(test_pre)
    # Concatenate once instead of growing a DataFrame inside the loop.
    Feass = pd.concat(fold_importances, axis=0)

    print(f"s_scotrainre_list:  {cv_scores}")
    print(f"s_auc_score_mean:  {np.mean(cv_scores)}")
    print(f"s_f1_score_mean:  {np.mean(f1_scores)}")
    print(f"s_score_std:  {np.std(cv_scores)}")

    return oof_pred, test_pred, Feass


lgb_train, lgb_test, Feass = lgb_model(x_train, y_train, x_test)

----------------------------------- 1 -----------------------------------
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.996668	valid_1's auc: 0.991931
[400]	training's auc: 0.99716	valid_1's auc: 0.992112
Early stopping, best iteration is:
[320]	training's auc: 0.997005	valid_1's auc: 0.992591
[('口服耐糖量测试', 4159), ('体重指数', 3771), ('舒张压', 2689), ('出生年份', 2344), ('胰岛素释放实验', 1383), ('肱三头肌皮褶厚度', 1369), ('性别', 590), ('无记录', 433), ('父母有一方患有糖尿病', 208), ('叔叔或姑姑有一方患有糖尿病', 124)]
[0.9925910035972411]
[0.9295039164490861]
----------------------------------- 2 -----------------------------------
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.996898	valid_1's auc: 0.984555
[400]	training's auc: 0.997337	valid_1's auc: 0.988317
[600]	training's auc: 0.997934	valid_1's auc: 0.987888
Early stopping, best iteration is:
[404]	training's auc: 0.997346	valid_1's auc: 0.988845
[('口服耐糖量测试', 4459), ('体重指数', 4096), ('舒张压', 2901), ('出

[200]	training's auc: 0.996418	valid_1's auc: 0.993561
[400]	training's auc: 0.996993	valid_1's auc: 0.994697
[600]	training's auc: 0.997684	valid_1's auc: 0.994845
[800]	training's auc: 0.998277	valid_1's auc: 0.994961
[1000]	training's auc: 0.998696	valid_1's auc: 0.995027
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.998696	valid_1's auc: 0.995027
[('体重指数', 6397), ('口服耐糖量测试', 6170), ('舒张压', 4683), ('出生年份', 4306), ('肱三头肌皮褶厚度', 2664), ('胰岛素释放实验', 2247), ('性别', 933), ('无记录', 798), ('父母有一方患有糖尿病', 322), ('叔叔或姑姑有一方患有糖尿病', 234)]
[0.9925910035972411, 0.9888452526319262, 0.9938120854097225, 0.9949176594831854, 0.9916998781331313, 0.9965580843845723, 0.9909917328151248, 0.9938407825829189, 0.9921939330061592, 0.9950265142781858]
[0.9295039164490861, 0.9360613810741689, 0.9460154241645244, 0.9540816326530611, 0.93734335839599, 0.9538461538461539, 0.9462915601023019, 0.949748743718593, 0.9282051282051282, 0.9556135770234987]
s_scotrainre_list:  [0.9925910035972411, 0.

In [10]:
# Predicted class balance on the test set, using the same cut-off (THR) as the
# cross-validated F1 score instead of a second hard-coded 0.4.
pd.Series((lgb_test > THR).astype(int)).value_counts()

0    598
1    402
dtype: int64

In [11]:
# Binarise the averaged test probabilities with the shared cut-off THR (rather
# than repeating the literal 0.4) and write the submission file.
sub['label'] = (lgb_test > THR).astype(int)
sub.to_csv('./submit/base_9434_407.csv', index=False)

### 伪标签

In [6]:
print(len(x_test))
# x_test was created as a column subset of `test`; take an explicit copy before
# adding a column so the assignment does not write into a derived frame
# (the pattern pandas reports as SettingWithCopyWarning).
x_test = x_test.copy()
x_test["患有糖尿病标识"] = lgb_test.tolist()

def label_test_fc(x, pos_thr=0.9, neg_thr=0.2):
    """Map a predicted probability to a pseudo-label.

    Parameters
    ----------
    x : float
        Predicted probability of the positive class.
    pos_thr : float, default 0.9
        Values strictly above this become label 1.
    neg_thr : float, default 0.2
        Values strictly below this become label 0.

    Returns
    -------
    int
        1 or 0 for confident predictions, 2 as the "not confident" marker
        (this also covers the boundary values and NaN).
    """
    if x > pos_thr:
        return 1
    if x < neg_thr:
        return 0
    return 2
# Replace each stored probability with its pseudo-label (1, 0, or the marker 2).
x_test['患有糖尿病标识'] = x_test['患有糖尿病标识'].map(label_test_fc)
# Keep only the rows that received a real label, i.e. everything except marker 2.
label_test = x_test.loc[x_test['患有糖尿病标识'].ne(2)]
(len(label_test), label_test)

1000


(857,
      性别  出生年份  体重指数   舒张压  口服耐糖量测试  胰岛素释放实验  肱三头肌皮褶厚度  叔叔或姑姑有一方患有糖尿病  无记录  \
 1     0  1998  20.6  68.0    3.861      NaN       NaN              1    0   
 3     0  1999  34.6  66.0    4.684      NaN      3.14              0    1   
 5     0  2003  24.2  58.0    5.939    26.65      0.78              0    0   
 6     0  2002   NaN   NaN    4.028      NaN       NaN              0    1   
 7     0  1999   NaN   NaN    5.116      NaN       NaN              1    0   
 ..   ..   ...   ...   ...      ...      ...       ...            ...  ...   
 995   1  1990  50.1  87.0    5.125      NaN       NaN              0    1   
 996   0  1992  56.3  87.0    7.695      NaN       NaN              0    1   
 997   1  1992  23.8  85.0    3.194     7.50       NaN              0    1   
 998   0  2000  53.1  95.0    8.226     7.55       NaN              0    1   
 999   1  1980  46.9  88.0    4.802      NaN       NaN              0    1   
 
      父母有一方患有糖尿病  患有糖尿病标识  
 1             0        0  


In [7]:
# Columns kept for modelling: everything except '编号' and the un-encoded
# '糖尿病家族史' column. `feas` additionally leaves out the target.
feas_and_label = [col for col in train.columns if col not in ('编号', '糖尿病家族史')]
feas = [col for col in feas_and_label if col != '患有糖尿病标识']

In [8]:
# Augment the labelled rows with the confidently pseudo-labelled test rows.
# The labelled rows are taken from `data` rather than from `train`: `train` is
# overwritten a few lines below, so reading it here would append the
# pseudo-labelled rows again on every re-run of this cell.
labelled_rows = data.loc[data['患有糖尿病标识'].notnull(), feas_and_label]
new_train = pd.concat([labelled_rows, label_test[feas_and_label]], axis=0)
train = new_train[new_train['患有糖尿病标识'].notnull()].reset_index(drop=True)
# The test set is rebuilt from `data`, so it has no pseudo-label column.
test = data[data['患有糖尿病标识'].isnull()].reset_index(drop=True)

x_train = train[feas]
y_train = train['患有糖尿病标识']
x_test = test[feas]

In [9]:
# Retrain on the pseudo-label-augmented training set.
# `lgb_model`, `THR`, `folds` and `seed` are reused from the first training
# cell instead of being defined a second time: an identical copy silently
# shadows the original and the two can drift apart when only one is edited.
# NOTE(review): x_train now contains pseudo-labelled test rows, so they also
# land in the validation folds; the CV AUC / F1 printed by this run is
# therefore not directly comparable with the first run -- confirm how it
# should be interpreted.
lgb_train, lgb_test, Feass = lgb_model(x_train, y_train, x_test)

----------------------------------- 1 -----------------------------------
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.997694	valid_1's auc: 0.987189
[400]	training's auc: 0.997965	valid_1's auc: 0.990619
[600]	training's auc: 0.998368	valid_1's auc: 0.99171
Early stopping, best iteration is:
[514]	training's auc: 0.998175	valid_1's auc: 0.992074
[('口服耐糖量测试', 5298), ('体重指数', 4756), ('舒张压', 3464), ('出生年份', 3152), ('肱三头肌皮褶厚度', 1973), ('胰岛素释放实验', 1747), ('性别', 764), ('无记录', 613), ('父母有一方患有糖尿病', 271), ('叔叔或姑姑有一方患有糖尿病', 209)]
[0.9920736880378136]
[0.9429824561403508]
----------------------------------- 2 -----------------------------------
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.997454	valid_1's auc: 0.998073
[400]	training's auc: 0.997753	valid_1's auc: 0.998061
[600]	training's auc: 0.998214	valid_1's auc: 0.997891
Early stopping, best iteration is:
[434]	training's auc: 0.997824	valid_1's auc: 0.998134

[200]	training's auc: 0.997393	valid_1's auc: 0.994155
[400]	training's auc: 0.997747	valid_1's auc: 0.994957
[600]	training's auc: 0.998184	valid_1's auc: 0.995297
[800]	training's auc: 0.998632	valid_1's auc: 0.995406
[1000]	training's auc: 0.998955	valid_1's auc: 0.995528
Did not meet early stopping. Best iteration is:
[998]	training's auc: 0.998955	valid_1's auc: 0.995516
[('体重指数', 6513), ('口服耐糖量测试', 6461), ('舒张压', 4928), ('出生年份', 4362), ('肱三头肌皮褶厚度', 2691), ('胰岛素释放实验', 2379), ('性别', 928), ('无记录', 819), ('父母有一方患有糖尿病', 376), ('叔叔或姑姑有一方患有糖尿病', 230)]
[0.9920736880378136, 0.9981335595685371, 0.9952369409768513, 0.9874075869591564, 0.9948369894558236, 0.9969458247485153, 0.9933462610592656, 0.9969497004387085, 0.9951632700183504, 0.9955156950672646]
[0.9429824561403508, 0.9644444444444444, 0.9601769911504425, 0.9369369369369369, 0.9357798165137613, 0.9647577092511013, 0.9525959367945824, 0.9455337690631809, 0.9551569506726457, 0.9534368070953437]
s_scotrainre_list:  [0.9920736880378136, 

In [10]:
# Predicted class balance on the test set after pseudo-label retraining, using
# the shared cut-off THR instead of a second hard-coded 0.4.
pd.Series((lgb_test > THR).astype(int)).value_counts()

0    594
1    406
dtype: int64

In [11]:
# Binarise the averaged test probabilities with the shared cut-off THR and
# write the submission. The original filename lacked the '.' before 'csv',
# which produced a file with no extension.
sub['label'] = (lgb_test > THR).astype(int)
sub.to_csv('./submit/base_9511_406.csv', index=False)