In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, sys, shutil
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below, so API breakage will only surface as hard errors after an upgrade.
warnings.filterwarnings("ignore")

In [3]:
# Load the training features, the competition test features and the "greeks"
# metadata table, in that order.
train, test, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        './icr-identify-age-related-conditions/train.csv',
        './icr-identify-age-related-conditions/test.csv',
        './icr-identify-age-related-conditions/greeks.csv',
    )
)

In [4]:
# Keep a copy of the training frame as loaded, before any column is added.
train_init = train.copy()
# Attach the four greeks label columns to the training frame.
# NOTE(review): the assignment aligns on the row index, not on an Id key; this
# assumes train.csv and greeks.csv list the samples in the same order — confirm.
for greek in ('Alpha', 'Beta', 'Gamma', 'Delta'):
    train[greek] = meta[greek]

In [5]:
# Greeks handling.
# Alpha becomes binary: 'A' -> 0, every other value -> 1.
train['Alpha'] = (train['Alpha'] != 'A').astype('int64')

In [6]:
# Encode the only categorical feature, EJ ('A'/'B'), as 0.0/1.0 in every frame
# (the original author guesses it may be the patient's sex).
ej_codes = {'A': 0, 'B': 1}
train['EJ'] = train['EJ'].replace(ej_codes).astype(float)
test['EJ'] = test['EJ'].replace(ej_codes).astype(float)
train_init['EJ'] = train_init['EJ'].replace(ej_codes).astype(float)

# Missing values: BQ is filled with 0, every other numeric column with its median.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to update the parent frame (pandas copy-on-write).
train['BQ'] = train['BQ'].fillna(0)
# numeric_only=True is required: the frames still contain string columns
# (Id, and Beta/Gamma/Delta in train), on which pandas >= 2.0 raises otherwise.
train_medians = train.median(numeric_only=True)
train = train.fillna(train_medians)
# Impute the test set with the same rules and the *training* medians, so that no
# statistic is estimated from the test rows (the scaler below is handled the same
# way). Entries of train_medians for columns absent from test are ignored.
test['BQ'] = test['BQ'].fillna(0)
test = test.fillna(train_medians)
train_init = train_init.fillna(train_init.median(numeric_only=True))

# Keep the identifiers, then remove the Id column from both feature frames.
train_id = train['Id'].copy()
test_id = test['Id'].copy()
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [7]:
# Replace the remaining greeks letters with integer codes:
#   Gamma: M, N -> 0;  G, H, E, F, A, B -> 1
#   Beta:  C -> 0;  B -> 1;  A -> 2
#   Delta: B -> 0;  A, C -> 1;  D -> 2
greek_codes = {
    'Gamma': {'M': 0, 'N': 0, 'G': 1, 'H': 1, 'E': 1, 'F': 1, 'A': 1, 'B': 1},
    'Beta': {'C': 0, 'B': 1, 'A': 2},
    'Delta': {'B': 0, 'A': 1, 'C': 1, 'D': 2},
}
for greek, codes in greek_codes.items():
    train[greek] = train[greek].replace(codes)

train.head()


Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,GE,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,72.611063,2003.810319,22.136229,69.834944,0.120343,1,1,0,1,2
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,72.611063,27981.56275,29.13543,32.131996,21.978,0,0,0,0,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,88.609437,13676.95781,28.022851,35.192676,0.196941,0,0,0,0,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,82.416803,2094.262452,39.948656,90.493248,0.155829,0,0,0,0,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,146.109943,8524.370502,45.381316,36.262628,0.096614,1,1,1,1,0


In [8]:
# Standardise the continuous features with StandardScaler.
from sklearn.preprocessing import StandardScaler

# EJ and the label columns are excluded from scaling.
unscaled = {'EJ', 'Alpha', 'Beta', 'Gamma', 'Delta', 'Class'}
numeric_columns = [col for col in train.columns if col not in unscaled]
# Fit on the training frame only, then apply the same transform to both frames.
scaler = StandardScaler().fit(train[numeric_columns])
train[numeric_columns] = scaler.transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])
train.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,GE,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta
0,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,-0.405383,...,-0.41026,-0.655511,-0.948991,0.531241,-0.814049,1,1,0,1,2
1,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,0.048541,...,-0.41026,0.687893,-0.238862,-0.509218,1.30486,0,0,0,0,0
2,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,-0.071089,...,-0.29921,-0.05185,-0.351743,-0.424754,-0.806623,0,0,0,0,0
3,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,-0.391109,...,-0.342195,-0.650833,0.858232,1.101332,-0.810609,0,0,0,0,0
4,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,0.125327,...,0.09992,-0.318309,1.409422,-0.395228,-0.816349,1,1,1,1,0


In [9]:
# Discard highly correlated features.
drop_cols = ['BZ', 'CL', 'EH', 'GL']  # highly correlated
# "Useless" features (extremes reached at the same point) per the original note.
# This list is only defined here; it is not dropped in this cell.
drop_cols2 = ['DY', 'CB', 'GB', 'CH', 'DL', 'CU', 'FS', 'AZ', 'GE', 'EG', 'EP']
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [10]:
# Separate the (discrete) targets — Class plus the four encoded greeks — from
# the feature matrix.
target_cols = ['Class', 'Alpha', 'Beta', 'Gamma', 'Delta']
ytrain = train[target_cols]
train = train.drop(columns=target_cols)

In [11]:
# Preview the feature matrix now that the target columns have been removed.
train.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI
0,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,-0.405383,...,-0.112922,-2.226608,0.162821,-0.035806,-0.250437,-0.940094,-0.41026,-0.655511,-0.948991,0.531241
1,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,0.048541,...,-0.311056,0.084542,-0.457925,-0.060566,0.11364,-1.14507,-0.41026,0.687893,-0.238862,-0.509218
2,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,-0.071089,...,-0.173811,0.51706,0.198663,-0.051023,0.597343,1.637944,-0.29921,-0.05185,-0.351743,-0.424754
3,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,-0.391109,...,0.058201,1.61716,0.060364,-0.060566,-0.104806,-0.219883,-0.342195,-0.650833,0.858232,1.101332
4,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,0.125327,...,0.520331,1.21285,0.237302,0.896815,-0.229632,-0.432313,0.09992,-0.318309,1.409422,-0.395228


# 模型搭建
先直接对Class预测

In [12]:
# Split the data: 20% of the rows are held out for validation.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

x_train, x_test, y_train, y_test = train_test_split(
    train,
    ytrain,
    test_size=0.2,
    random_state=42,
)
# Positive-class weight passed to the boosted models below.
# NOTE(review): hard-coded; presumably the negative/positive ratio of the full
# training set — confirm and consider deriving it from the labels.
scale_pos_weight = 4.712962962962963
for split in (x_train, y_train):
    print(split.shape)

(493, 52)
(493, 5)


In [13]:
def ScoreMetric(ytrue, ypred):
    """Balanced log loss: log loss with every class given the same total weight.

    Each sample is weighted by the inverse of its class frequency, so the result
    is the mean of the per-class average log losses.

    The computation is done directly in numpy instead of calling
    ``sklearn.metrics.log_loss(..., eps=1e-15)``: the ``eps`` argument has been
    removed from recent scikit-learn releases, and clipping is done in float64
    so that float32 predictions equal to 0 or 1 cannot produce an infinite loss.

    Parameters
    ----------
    ytrue : array-like of shape (n,)
        Integer class labels, encoded as 0 .. K-1.
    ypred : array-like of shape (n,) or (n, K)
        Either the predicted probability of class 1 (binary case) or one
        probability column per class.

    Returns
    -------
    float
        The class-balanced log loss.

    Raises
    ------
    ValueError
        If fewer than two classes are present in ``ytrue``.
    """
    raw_labels = np.asarray(ytrue).ravel()
    proba = np.asarray(ypred, dtype=np.float64)
    if proba.ndim == 1:
        # Binary shortcut: expand P(class 1) into the two-column form.
        proba = np.column_stack((1.0 - proba, proba))

    class_counts = np.bincount(raw_labels, minlength=proba.shape[1])
    if np.count_nonzero(class_counts) < 2:
        raise ValueError("ytrue must contain at least two classes")

    labels = raw_labels.astype(np.intp)
    eps = 1e-15
    # Probability assigned to the true class of each sample, kept away from 0 and 1.
    true_class_proba = np.clip(proba[np.arange(labels.size), labels], eps, 1.0 - eps)
    weights = 1.0 / class_counts[labels]
    return float(np.average(-np.log(true_class_proba), weights=weights))

In [14]:
from typing import Tuple
import xgboost as xgb
def balancedlogloss(predt: np.ndarray, dtrain: "xgb.DMatrix") -> Tuple[str, float]:
    """Balanced log loss in the (name, value) form of an XGBoost custom metric.

    Samples labelled 0 are weighted by ``1 / (1 - mean(y))`` and all others by
    ``1 / mean(y)``, so both classes contribute equally to the weighted average.

    The loss is computed in numpy rather than through
    ``sklearn.metrics.log_loss(..., eps=1e-15)``, whose ``eps`` argument has been
    removed from recent scikit-learn releases. Predictions are cast to float64
    before clipping so that values of exactly 0 or 1 stay finite.

    Parameters
    ----------
    predt : np.ndarray
        Predicted probabilities of the positive class, one per row of ``dtrain``.
    dtrain : xgb.DMatrix
        Only ``get_label()`` is used, to obtain the binary labels.

    Returns
    -------
    Tuple[str, float]
        The metric name ``'balancedlogloss'`` and its value.

    Raises
    ------
    ValueError
        If the labels do not contain both classes (the weights are undefined).
    """
    y = np.asarray(dtrain.get_label(), dtype=np.float64)
    proba = np.clip(np.asarray(predt, dtype=np.float64), 1e-15, 1.0 - 1e-15)

    target_mean = y.mean() if y.size else 0.0
    if not 0.0 < target_mean < 1.0:
        raise ValueError("labels must contain both the negative and the positive class")

    is_negative = y == 0
    weights = np.where(is_negative, 1.0 / (1.0 - target_mean), 1.0 / target_mean)
    losses = -np.where(is_negative, np.log(1.0 - proba), np.log(proba))
    loss = float(np.average(losses, weights=weights))

    return 'balancedlogloss', loss

In [15]:
from lightgbm import LGBMClassifier

# LightGBM hyper-parameters currently in use.
params = {
    'lambda_l1': 0.0004754363672821333,
    'lambda_l2': 1.088904998340126e-06,
    'num_leaves': 142,
    'feature_fraction': 0.8491732535462826,
    'bagging_fraction': 0.8744449358064078,
    'bagging_freq': 1,
    'min_child_samples': 17,
    'learning_rate': 0.03
}# 0.95  0.28-0.24
# Alternative configuration, kept for reference (disabled):
# params = {
# #  'early_stopping_rounds': 116,
#  'n_estimators': 8594,
#  'learning_rate': 0.05,   #0.2292825799916429
#  'num_leaves': 190,
#  'max_depth': 3,
#  'reg_alpha': 0.3086813444028655,
#  'reg_lambda': 0.08439961817618014,
#  'bagging_fraction': 1.0,
#  'bagging_freq': 1,
#  'feature_fraction': 1.0,
#  'min_child_samples': 75,
#  'colsample_bytree': 0.2572293361418775,
#  'lambda_l1': 1.246275770846192e-06,
#  'lambda_l2': 0.011660417895786973,
#  'subsample': 0.7301110313724658
#  } # 0.95   0.20-0.30


# Fit on the training split and evaluate on the held-out split (Class target only).
lgbmc = LGBMClassifier(**params, objective='binary', random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight)
lgbmc.fit(x_train, y_train['Class'])
lgbmc_pred = lgbmc.predict(x_test)
lgbmc_pred_proba = lgbmc.predict_proba(x_test)
lgbmc_accuracy = accuracy_score(y_test['Class'], lgbmc_pred)

print(lgbmc_accuracy)

# Multiply the odds of class 1 by 4.71 before scoring.
# The closed form w*p / (w*p + (1 - p)) equals odds / (1 + odds) with
# odds = w*p / (1 - p), but it stays finite when p == 1; the two-step version
# divides by zero there and yields NaN.
odds_scale = 4.71
p0 = lgbmc_pred_proba[:,1]
p0 = odds_scale * p0 / (odds_scale * p0 + (1 - p0))
ScoreMetric(y_test['Class'], p0)

[LightGBM] [Info] Number of positive: 85, number of negative: 408
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6769
[LightGBM] [Info] Number of data points in the train set: 493, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172414 -> initscore=-1.568616
[LightGBM] [Info] Start training from score -1.568616
0.9516129032258065


0.2458758311429148

In [16]:
from catboost import CatBoost, CatBoostClassifier

# CatBoost hyper-parameters:
#   objective: CrossEntropy, learning_rate: 0.025,
#   colsample_bylevel: 0.062247778696758224, depth: 7,
#   boosting_type: Plain, bootstrap_type: MVS
params = dict(
    objective='CrossEntropy',
    learning_rate=0.025,
    colsample_bylevel=0.062247778696758224,
    depth=7,
    boosting_type='Plain',
    bootstrap_type='MVS',
)  # 0.95   0.20-0.36

# Fit on the training split, then score the held-out split (Class target only).
cat = CatBoostClassifier(random_state=42, verbose=False, **params)
cat.fit(x_train, y_train['Class'])
cat_pred = cat.predict(x_test)
cat_pred_proba = cat.predict_proba(x_test)
cat_accuracy = accuracy_score(cat_pred, y_test['Class'])

print(cat_accuracy)

# Class-1 probability, scored as predicted; the 4.71 odds rescaling used for
# the other models is intentionally left disabled here.
p0 = cat_pred_proba[:, 1]
ScoreMetric(y_test['Class'], p0)

0.9516129032258065


0.36323159773627756

In [20]:
# # xgb
import xgboost as xgb

# Earlier tuning candidates, kept for reference only. They used to be assigned
# to `params` and then immediately overwritten; they are not used below.
params_dart_v1 = {
    'booster': 'dart',
    'lambda': 0.00012146610908121476,
    'alpha': 0.09188910047137025,
    'subsample': 0.748192621773776,
    'colsample_bytree': 0.597791819904349,
    'learning_rate': 0.02,
    'max_depth': 5,
    'min_child_weight': 2,
    'eta': 9.089678475059372e-06,
    'gamma': 0.0002474690481043904,
    'grow_policy': 'lossguide',
    'sample_type': 'weighted',
    'normalize_type': 'forest',
    'rate_drop': 5.319578466457059e-06,
    'skip_drop': 0.03392570004595852
} # 0.18
params_gbtree = {'booster': 'gbtree',
 'lambda': 0.8068353722333764,
 'alpha': 1.894027813634802e-08,
 'subsample': 0.7785041239304065,
 'colsample_bytree': 0.7601163387370006,
 'learning_rate': 0.02,
 'max_depth': 4,
 'eta': 0.645702469472196,
 'gamma': 3.1878972195087093e-07,
 'grow_policy': 'lossguide'
 } # 0.20

# Configuration actually used.
# NOTE(review): both 'learning_rate' and 'eta' are set, with different values;
# XGBoost documents them as aliases, so confirm which one is meant to apply.
params = {'booster': 'dart',
 'lambda': 0.0067981943191443815,
 'alpha': 1.1158232780616973e-07,
 'subsample': 0.39680328510099894,
 'colsample_bytree': 0.5482579412080295,
 'learning_rate': 0.13779959534970157,
 'max_depth': 3,
 'eta': 0.0010017919482385215,
 'gamma': 0.030524056409888256,
 'grow_policy': 'depthwise',
 'sample_type': 'weighted',
 'normalize_type': 'tree',
 'rate_drop': 4.4093900571337765e-06,
 'skip_drop': 1.105656318625032e-05
 } # 0.18 - 0.24      0.94

# Fit on the training split, then score the held-out split (Class target only).
xgb_c = xgb.XGBClassifier(**params, n_estimators=300, random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight)
# xgb_c = xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth=4, random_state=42)
xgb_c.fit(x_train, y_train['Class'])
xgb_pred = xgb_c.predict(x_test)
y_pred = xgb_c.predict_proba(x_test)
xgb_accuracy = accuracy_score(xgb_pred, y_test['Class'])
# xgb_pred
print(xgb_accuracy)

# Multiply the odds of class 1 by 4.71 before scoring.
# The probabilities are promoted to float64 first (XGBoost typically returns
# float32 — confirm), and the closed form w*p / (w*p + (1 - p)) is used instead
# of odds / (1 + odds): the latter divides by zero and produces NaN as soon as a
# prediction saturates at exactly 1.
odds_scale = 4.71
p0 = np.asarray(y_pred[:,1], dtype=np.float64)
p0 = odds_scale * p0 / (odds_scale * p0 + (1 - p0))
ScoreMetric(y_test['Class'], p0)

0.9435483870967742


0.18769516107387163

In [18]:
# Predict the competition test set with the CatBoost model, write the
# submission file, and read it back to display what was written.
y_pred = cat.predict_proba(test)
p0 = y_pred[:, 0]  # first probability column, reported as class_0
submission_path = './icr-identify-age-related-conditions/submission.csv'
submission = pd.DataFrame(test_id, columns=["Id"]).assign(class_0=p0, class_1=1 - p0)
submission.to_csv(submission_path, index=False)
submission_df = pd.read_csv(submission_path)
submission_df


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.901326,0.098674
1,010ebe33f668,0.901326,0.098674
2,02fa521e1838,0.901326,0.098674
3,040e15f562a2,0.901326,0.098674
4,046e85c7cc7f,0.901326,0.098674


# 正式的KFold交叉验证 + Optuna模型调参

In [19]:
# this is how you define custom metric in XGboost
