In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, sys, shutil
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('./icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('./icr-identify-age-related-conditions/test.csv')
meta = pd.read_csv('./icr-identify-age-related-conditions/greeks.csv')

In [3]:
train_init = train.copy()
train['Alpha'] = meta['Alpha']
train['Beta'] = meta['Beta']
train['Gamma'] = meta['Gamma']
train['Delta'] = meta['Delta']

In [4]:
# 处理greeks
# Alpha为A的时候为0，其余为1
train['Alpha'] = train['Alpha'].apply(lambda x: 0 if x == 'A' else 1)

In [5]:
# 将数据集中唯一的离散特征转为01，这可能表明患者的性别
train['EJ'] = train['EJ'].replace({'A': 0, 'B': 1}).astype(float)
test['EJ']  = test['EJ'].replace({'A': 0, 'B': 1}).astype(float)
train_init['EJ'] = train_init['EJ'].replace({'A': 0, 'B': 1}).astype(float)
# 缺失值处理，用中位数填充
train['BQ'].fillna(0, inplace=True)
train.fillna(train.median(), inplace=True)
test['BQ'].fillna(0, inplace=True)
test.fillna(test.median(), inplace=True)
train_init.fillna(train_init.median(), inplace=True)
# 移除ID列
train_id = train['Id'].copy()
test_id  =  test['Id'].copy()
train = train.drop(['Id'], axis=1)
test  =  test.drop(['Id'], axis=1)

In [6]:
# 将gamma中的M和N都转为0，G，H，E，F，A，B的转为1
train['Gamma'] = train['Gamma'].replace({'M': 0, 'N': 0, 'G': 1, 'H': 1, 'E': 1, 'F': 1, 'A': 1, 'B': 1})
# 将Beta中的C0,B1,A2
train['Beta'] = train['Beta'].replace({'C': 0, 'B': 1, 'A': 2})
# 将Delta中的B0,A1,C1,D2
train['Delta'] = train['Delta'].replace({'B': 0, 'A': 1, 'C': 1, 'D': 2})

# train.head()


In [7]:
# 连续变量归一化处理
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numeric_columns = [_ for _ in train.columns if _ not in ['EJ', 'Alpha', 'Beta', 'Gamma', 'Delta', 'Class']]
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])
# train.head()

In [8]:
# 抛弃高度相关的特征
drop_cols = ['BZ','CL','EH','GL']  # 高度相关
drop_cols2 = ['DY','CB','GB','CH','DL','CU','FS','AZ','GE','EG','EP']  # 无用特征，同一点取得极值
train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

In [9]:
# ytrain为离散
ytrain = train[['Class', 'Alpha', 'Beta', 'Gamma', 'Delta']]
train.drop(['Class', 'Alpha', 'Beta', 'Gamma', 'Delta'], axis=1, inplace=True)

In [10]:
# train.head()

# 模型搭建
先直接对Class预测

In [11]:
# 数据集分割
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
x_train, x_test, y_train, y_test = train_test_split(train, ytrain, test_size=0.2, random_state=42)
scale_pos_weight = 4.712962962962963
# print(x_train.shape)
# print(y_train.shape)

In [12]:
def ScoreMetric(ytrue, ypred):
    nc = np.bincount(ytrue);
    return log_loss(ytrue, ypred, sample_weight = 1 / nc[ytrue], eps=1e-15);

In [13]:
from lightgbm import LGBMClassifier

lgb_p1 = {
    'lambda_l1': 0.0004754363672821333,
    'lambda_l2': 1.088904998340126e-06,
    'num_leaves': 142,
    'feature_fraction': 0.8491732535462826,
    'bagging_fraction': 0.8744449358064078,
    'bagging_freq': 1,
    'min_child_samples': 17,
    'learning_rate': 0.03
}# 0.95  0.28-0.24
lgb_p2 = {
    #  'early_stopping_rounds': 116,
    'n_estimators': 8594,
    'learning_rate': 0.05,   #0.2292825799916429
    'num_leaves': 190,
    'max_depth': 3,
    'reg_alpha': 0.3086813444028655,
    'reg_lambda': 0.08439961817618014,
    'bagging_fraction': 1.0,
    'bagging_freq': 1,
    'feature_fraction': 1.0,
    'min_child_samples': 75,
    'colsample_bytree': 0.2572293361418775,
    'lambda_l1': 1.246275770846192e-06,
    'lambda_l2': 0.011660417895786973,
    'subsample': 0.7301110313724658
 } # 0.95   0.20-0.30

lgb1 = LGBMClassifier(**lgb_p1, random_state=42, n_jobs=-1, class_weight='balanced', scale_pos_weight=scale_pos_weight)
lgb2 = LGBMClassifier(**lgb_p2, random_state=42, n_jobs=-1, class_weight='balanced', scale_pos_weight=scale_pos_weight)

In [14]:
# lgb1.fit(x_train, y_train['Class'])
# lgb_pred = lgb1.predict(x_test)
# y_pred = lgb1.predict_proba(x_test)
# lgb_accuracy = accuracy_score(lgb_pred, y_test['Class'])
# # xgb_pred
# print(lgb_accuracy)
# p0 = y_pred[:,1]
# print(ScoreMetric(y_test['Class'], p0))
# odds = 4.71 * p0 / (1-p0)
# p0 = odds / (1+odds)
# ScoreMetric(y_test['Class'], p0)

In [15]:
from catboost import CatBoost, CatBoostClassifier

cat_p1 = {
    'objective': 'CrossEntropy',
    'learning_rate': 0.025,
    'colsample_bylevel': 0.062247778696758224,
    'depth': 7,
    'boosting_type': 'Plain',
    'bootstrap_type': 'MVS'
}# 0.95   0.20-0.36
cat_p2 = {
    'objective': 'CrossEntropy',
    'colsample_bylevel': 0.08326847511080138,
    'depth': 3,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'learning_rate': 0.20800532359943255,
    'n_estimators': 1389,
    'early_stopping_rounds': 325,
    'subsample': 0.13113402291704018
}# 0.95   0.19-0.32

cat1 = CatBoostClassifier(**cat_p1, random_state=42)
cat2 = CatBoostClassifier(**cat_p2, random_state=42)

In [16]:
# cat2.fit(x_train, y_train['Class'])
# cat_pred = cat2.predict(x_test)
# y_pred = cat2.predict_proba(x_test)
# cat_accuracy = accuracy_score(cat_pred, y_test['Class'])
# # xgb_pred
# print(cat_accuracy)
# p0 = y_pred[:,1]
# print(ScoreMetric(y_test['Class'], p0))
# odds = 4.71 * p0 / (1-p0)
# p0 = odds / (1+odds)
# ScoreMetric(y_test['Class'], p0)

In [17]:
# # xgb
import xgboost as xgb

xgb_p1 = {
    'booster': 'dart',
    'lambda': 0.00012146610908121476,
    'alpha': 0.09188910047137025,
    'subsample': 0.748192621773776,
    'colsample_bytree': 0.597791819904349,
    'learning_rate': 0.02,
    'max_depth': 3,
    'n_estimators': 4199,
    # 'min_child_weight': 2,
    'eta': 9.089678475059372e-06,
    'gamma': 0.0002474690481043904,
    'grow_policy': 'lossguide',
    # 'sample_type': 'weighted',
    # 'normalize_type': 'forest',
    # 'rate_drop': 5.319578466457059e-06,
    # 'skip_drop': 0.03392570004595852
} # 0.18
xgb_p2 = {
 'booster': 'gbtree',
 'lambda': 0.8068353722333764,
 'alpha': 1.894027813634802e-08,
 'subsample': 0.7785041239304065,
 'colsample_bytree': 0.7601163387370006,
 'learning_rate': 0.02,
 'max_depth': 4,
 'n_estimators': 4199,
 'eta': 0.645702469472196,
 'gamma': 3.1878972195087093e-07,
 'grow_policy': 'lossguide'
 } # 0.20
xgb_p3 = {'booster': 'dart',
 'lambda': 0.0067981943191443815,
 'alpha': 1.1158232780616973e-07,
 'subsample': 0.39680328510099894,
 'colsample_bytree': 0.5482579412080295,
 'learning_rate': 0.13779959534970157,
 'max_depth': 3,
#  'n_estimators': 4199,
 'eta': 0.0010017919482385215,
 'gamma': 0.030524056409888256,
 'grow_policy': 'depthwise',
 'sample_type': 'weighted',
 'normalize_type': 'tree',
 'rate_drop': 4.4093900571337765e-06,
 'skip_drop': 1.105656318625032e-05
 } # 0.18 - 0.24      0.94
xgb_p4 = {'booster': 'gbtree',
 'lambda': 0.07422190240236703,
 'alpha': 7.880746325817019e-07,
 'subsample': 0.33813336467190913,
 'colsample_bytree': 0.42650163528146484,
 'learning_rate': 0.05647760311555794,
 'n_estimators': 4199,
 'max_depth': 3,
 'eta': 0.3559784707584994,
 'gamma': 0.35002840740272007,
 'grow_policy': 'lossguide'
 } # 0.95  0.15-0.21
xgb_p1 = {
 'lambda': 0.0067981943191443815,
 'alpha': 1.1158232780616973e-07,
 'learning_rate': 0.13779959534970157,
 'max_depth': 3,
 } # 0.13 - 0.18      0.97
xgb1 = xgb.XGBClassifier(**xgb_p1, random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight, tree_method='gpu_hist')
xgb2 = xgb.XGBClassifier(**xgb_p2, random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight, tree_method='gpu_hist')
xgb3 = xgb.XGBClassifier(**xgb_p3, random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight, tree_method='gpu_hist')
xgb4 = xgb.XGBClassifier(**xgb_p4, random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight, tree_method='gpu_hist')

In [18]:
# xgb1.fit(x_train, y_train['Class'])
# xgb_pred = xgb1.predict(x_test)
# y_pred = xgb1.predict_proba(x_test)
# xgb_accuracy = accuracy_score(xgb_pred, y_test['Class'])
# # xgb_pred
# print(xgb_accuracy)
# p0 = y_pred[:,1]
# print(ScoreMetric(y_test['Class'], p0))
# odds = 4.71 * p0 / (1-p0)
# p0 = odds / (1+odds)
# ScoreMetric(y_test['Class'], p0)

In [19]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42)

* Stacking Model

In [20]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import StackingClassifier

In [21]:
# version1  only xgb lgm cat lr
# hard 不支持概率 0.96 0.20-0.15  0.15-0.19
voting_clf = VotingClassifier(estimators=[('xgb1', xgb1), ('xgb2', xgb2), ('xgb3', xgb3), ('xgb4', xgb4), ('lgb1', lgb1), ('lgb2', lgb2), ('cat1', cat1), ('cat2', cat2)], voting='soft', n_jobs=-1)
voting_clf.fit(x_train, y_train['Class'])
voting_pred = voting_clf.predict(x_test)
voting_pred_proba = voting_clf.predict_proba(x_test)
voting_accuracy = accuracy_score(voting_pred, y_test['Class'])
print('voting_accuracy: ', voting_accuracy)

p0 = voting_pred_proba[:,1]
print(ScoreMetric(y_test['Class'], p0))
odds = 4.71 * p0 / (1-p0)
p0 = odds / (1+odds)
ScoreMetric(y_test['Class'], p0)

XGBoostError: [16:16:53] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0fdc6d574b9c0d168-1\xgboost\xgboost-ci-windows\src\gbm\gbtree.cc:625: Check failed: common::AllVisibleGPUs() >= 1 (0 vs. 1) : No visible GPU is found for XGBoost.

In [None]:
# use_features_in_secondary=True 0.15-0.17   0.92
stackCV_sclf = StackingCVClassifier(classifiers=[xgb1, xgb2, xgb3, xgb4, lgb1, lgb2, cat1, cat2], meta_classifier= lr, cv=10, use_features_in_secondary=True, random_state=42, n_jobs=-1)
stackCV_sclf.fit(x_train, y_train['Class'])
scv_clf_pred = stackCV_sclf.predict(x_test)
scv_clf_pred_proba = stackCV_sclf.predict_proba(x_test)
sclf_accuracy = accuracy_score(scv_clf_pred, y_test['Class'])
print('sclf_accuracy: ', sclf_accuracy)

p0 = scv_clf_pred_proba[:,1]
print(ScoreMetric(y_test['Class'], p0))
odds = 4.71 * p0 / (1-p0)
p0 = odds / (1+odds)
ScoreMetric(y_test['Class'], p0)

NameError: name 'StackingCVClassifier' is not defined

In [None]:
# passthrough = False  0.95  0.22-0.18   0.17-0.19
# passthrough = True   0.9  0.15-0.17
estimators = [('xgb1', xgb1), ('xgb2', xgb2), ('xgb3', xgb3), ('xgb4', xgb4), ('lgb1', lgb1), ('lgb2', lgb2), ('cat1', cat1), ('cat2', cat2)]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=lr, n_jobs=-1, passthrough = True)
stacking_clf.fit(x_train, y_train['Class'])
stacking_pred = stacking_clf.predict(x_test)
stacking_accuracy = accuracy_score(stacking_pred, y_test['Class'])
print(stacking_accuracy)

p0 = stacking_clf.predict_proba(x_test)[:,1]
print(ScoreMetric(y_test['Class'], p0))
odds = 4.71 * p0 / (1-p0)
p0 = odds / (1+odds)
ScoreMetric(y_test['Class'], p0)

0.9516129032258065
0.17143600452894756


0.15408614482098063

In [None]:
# 三个集成模型的再集成  0.97   0.18-0.15
estimators = [('voting_clf_soft', voting_clf), ('stackCV_sclf', stackCV_sclf), ('stacking_clf', stacking_clf)]
final_voting1 = VotingClassifier(estimators=estimators, voting='soft',  n_jobs=-1)
final_voting1.fit(x_train, y_train['Class'])
voting_pred = final_voting1.predict(x_test)
voting_accuracy = accuracy_score(voting_pred, y_test['Class'])
print(voting_accuracy)

p0 = final_voting1.predict_proba(x_test)[:,1]
print(ScoreMetric(y_test['Class'], p0))
odds = 4.71 * p0 / (1-p0)
p0 = odds / (1+odds)
ScoreMetric(y_test['Class'], p0)


0.9596774193548387
0.17266364089222488


0.14491426851071937

In [None]:
y_pred = stackCV_sclf.predict_proba(test)
p0 = y_pred[:,1]
odds = 4.71 * p0 / (1-p0)
p0 = odds / (1+odds)
submission = pd.DataFrame(test_id, columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0
submission.to_csv('./icr-identify-age-related-conditions/submission.csv', index=False)
submission_df = pd.read_csv('./icr-identify-age-related-conditions/submission.csv')
submission_df




Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.365323,0.634677
1,010ebe33f668,0.365323,0.634677
2,02fa521e1838,0.365323,0.634677
3,040e15f562a2,0.365323,0.634677
4,046e85c7cc7f,0.365323,0.634677


# 正式的KFold交叉验证 + Optuna模型调参

In [None]:
# this is how you define custom metric in XGboost
