In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, sys, shutil
warnings.filterwarnings("ignore")

In [None]:
# Read the three CSV inputs (train features, test features, greeks metadata)
# in one pass; the tuple order matches the order of the paths below.
train, test, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        './icr-identify-age-related-conditions/train.csv',
        './icr-identify-age-related-conditions/test.csv',
        './icr-identify-age-related-conditions/greeks.csv',
    )
)

In [None]:
# Keep an untouched snapshot of the training frame before the greeks are attached.
train_init = train.copy()

# Copy the four greeks columns from the metadata frame; pandas aligns the
# assignment on the row index.
# NOTE(review): presumably greeks.csv rows are in the same order as train.csv — confirm.
for greek in ('Alpha', 'Beta', 'Gamma', 'Delta'):
    train[greek] = meta[greek]

In [None]:
# Greeks handling.
# Alpha becomes a binary flag: 'A' -> 0, every other value -> 1.
train['Alpha'] = train['Alpha'].ne('A').astype('int64')

In [None]:
# Encode EJ, the only categorical feature, as 0/1 (it may indicate the patient's sex).
ej_codes = {'A': 0, 'B': 1}
train['EJ'] = train['EJ'].replace(ej_codes).astype(float)
test['EJ'] = test['EJ'].replace(ej_codes).astype(float)
train_init['EJ'] = train_init['EJ'].replace(ej_codes).astype(float)

# Missing values: BQ is zero-filled in `train`; every other gap gets the column median.
# The result is assigned back rather than calling fillna(inplace=True) on a column
# selection, which operates on a temporary under pandas copy-on-write.
train['BQ'] = train['BQ'].fillna(0)
# numeric_only=True is required on pandas >= 2.0: the frames still contain string
# columns at this point (Id, and the Beta/Gamma/Delta greeks in `train`), and
# median() raises on them instead of silently skipping them as older pandas did.
train = train.fillna(train.median(numeric_only=True))
test = test.fillna(test.median(numeric_only=True))
train_init = train_init.fillna(train_init.median(numeric_only=True))
# NOTE(review): `test` is imputed with its own medians (and its BQ is not zero-filled),
# unlike the scaler, which is fit on `train` only — confirm this asymmetry is intended.

# Set the Id column aside and remove it from the feature frames.
train_id = train['Id'].copy()
test_id = test['Id'].copy()
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [None]:
# Integer codes for the remaining greeks, one lookup table per column:
#   Gamma: M, N -> 0; G, H, E, F, A, B -> 1
#   Beta:  C -> 0, B -> 1, A -> 2
#   Delta: B -> 0, A -> 1, C -> 1, D -> 2
greek_codes = {
    'Gamma': {'M': 0, 'N': 0, 'G': 1, 'H': 1, 'E': 1, 'F': 1, 'A': 1, 'B': 1},
    'Beta': {'C': 0, 'B': 1, 'A': 2},
    'Delta': {'B': 0, 'A': 1, 'C': 1, 'D': 2},
}
for greek_name, codes in greek_codes.items():
    train[greek_name] = train[greek_name].replace(codes)

train.head()


In [None]:
# Standardise the continuous features.
from sklearn.preprocessing import StandardScaler

# Everything except the EJ flag, the greeks and the target gets scaled.
non_scaled = {'EJ', 'Alpha', 'Beta', 'Gamma', 'Delta', 'Class'}
numeric_columns = [column for column in train.columns if column not in non_scaled]

# Fit on the training frame only, then apply the same statistics to both frames.
scaler = StandardScaler().fit(train[numeric_columns])
train[numeric_columns] = scaler.transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])
train.head()

In [None]:
# Discard highly correlated features.
drop_cols = ['BZ','CL','EH','GL']  # highly correlated
drop_cols2 = ['DY','CB','GB','CH','DL','CU','FS','AZ','GE','EG','EP']  # uninformative features: extremes reached at the same point
drop_cols3 = ['BZ', 'DV', 'EH', 'FD ']
# Only the first list is removed here; drop_cols2 and drop_cols3 are defined but not applied.
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [None]:
# ytrain holds the discrete columns (Class plus the encoded greeks);
# `train` keeps only the feature columns afterwards.
target_columns = ['Class', 'Alpha', 'Beta', 'Gamma', 'Delta']
ytrain = train[target_columns]
train = train.drop(columns=target_columns)

In [None]:
# Preview the first rows of the final feature frame.
train.head(n=5)

# 模型搭建
先直接对Class预测

In [None]:
# Hold-out split: 80% for fitting, 20% for evaluation.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
x_train, x_test, y_train, y_test = train_test_split(train, ytrain, test_size=0.2, random_state=42)

# Class-imbalance weight for the positive class: (#negatives / #positives), computed
# from the labels instead of being hard-coded. The previous literal 4.712962962962963
# equals 509 / 108 — presumably the class counts of the full training set; confirm.
class_labels = ytrain['Class']
scale_pos_weight = (class_labels == 0).sum() / (class_labels == 1).sum()

print(x_train.shape)
print(y_train.shape)

In [None]:
def ScoreMetric(ytrue, ypred):
    """Balanced log loss: log loss with each sample weighted by 1 / (size of its class).

    Args:
        ytrue: non-negative integer class labels (array-like; whole-number floats
            are accepted and cast to int).
        ypred: predicted probabilities, passed on to ``sklearn.metrics.log_loss``.

    Returns:
        The weighted log loss as returned by ``log_loss``.
    """
    # Cast up front: np.bincount and fancy indexing both reject float labels,
    # which is how some callers (e.g. LightGBM eval callbacks) supply them.
    labels = np.asarray(ytrue).astype(int)
    class_counts = np.bincount(labels)
    # Clip explicitly rather than via log_loss(eps=...): the `eps` argument was
    # deprecated and then removed from scikit-learn, so passing it raises there.
    clipped = np.clip(np.asarray(ypred, dtype=float), 1e-15, 1 - 1e-15)
    return log_loss(labels, clipped, sample_weight=1 / class_counts[labels])

In [None]:
from typing import Tuple
import xgboost as xgb
def balancedlogloss(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Balanced log loss in the (predt, dtrain) -> (name, value) custom-metric shape.

    Args:
        predt: predicted probabilities for the rows of ``dtrain``.
        dtrain: matrix whose ``get_label()`` returns the 0/1 targets.

    Returns:
        The tuple ``('balancedlogloss', loss)``.
    """
    labels = dtrain.get_label()
    positive_rate = labels.mean()
    # Inverse class-frequency weights: 1/(1 - mean) for label 0, 1/mean for label 1.
    sample_weight = np.where(labels == 0, 1 / (1 - positive_rate), 1 / positive_rate)
    # Clip in float64 (1 - 1e-15 is not representable in float32) instead of using
    # log_loss(eps=...), an argument that newer scikit-learn releases no longer accept.
    clipped = np.clip(np.asarray(predt, dtype=np.float64), 1e-15, 1 - 1e-15)
    loss = log_loss(labels, clipped, sample_weight=sample_weight)

    return 'balancedlogloss', loss

In [None]:
from lightgbm import LGBMClassifier

# Baseline: one LightGBM classifier fit on the hold-out split, predicting Class directly.
lgbmc = LGBMClassifier(learning_rate=0.005, num_iterations=775, force_col_wise=True)
lgbmc.fit(x_train, y_train['Class'])

# Probabilities first (reused by the scoring cell below), then hard labels for accuracy.
y_pred = lgbmc.predict_proba(x_test)
lgbmc_pred = lgbmc.predict(x_test)
lgbmc_accuracy = accuracy_score(lgbmc_pred, y_test['Class'])

lgbmc_accuracy

In [None]:
# Score the baseline using the second predict_proba column.
p0 = y_pred[:, 1]
ScoreMetric(ytrue=y_test['Class'], ypred=p0)

# 正式的KFold交叉验证 + Optuna模型调参

In [None]:
import optuna, tune

from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, log_loss
from sklearn.metrics import make_scorer, accuracy_score, log_loss
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

# fold
from sklearn.model_selection import KFold, StratifiedKFold


In [None]:
def objective(trial):
    """Optuna objective: cross-validated balanced log loss of a LightGBM classifier.

    Samples one LightGBM configuration from ``trial``, fits it on each of 10
    stratified folds of the module-level ``train`` / ``ytrain['Class']`` data with
    early stopping on the held-out fold, and scores that fold with ``ScoreMetric``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        Mean ``ScoreMetric`` value over the folds (the study minimises it).
    """
    # Local import: the `verbose` argument of LGBMClassifier.fit was removed in
    # LightGBM 4.0; a log_evaluation callback with period=0 silences per-iteration
    # evaluation output instead.
    from lightgbm import log_evaluation

    # Each sampling/regularisation knob is searched under its primary LightGBM name
    # only. The previous space also sampled the aliases subsample, colsample_bytree,
    # reg_alpha and reg_lambda; LightGBM ignores an alias when the primary name is
    # present, so those were dead search dimensions.
    param = {
        "metric": "binary",
        "random_state": 42,
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 100, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 700, 10000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 300, step=3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.8, 1, step=0.1),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100, step=5),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
    }

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    log_loss_scores = []
    # One fit per fold; the held-out fold doubles as the early-stopping set.
    # NOTE(review): the fold that drives early stopping is the same fold that is
    # scored, so the returned loss is optimistic — consider a nested split.
    for train_index, val_index in skf.split(train, ytrain['Class']):
        x_train_, x_val = train.iloc[train_index], train.iloc[val_index]
        y_train_, y_val = ytrain['Class'].iloc[train_index], ytrain['Class'].iloc[val_index]

        # verbose=-1 is LightGBM's documented "silent" verbosity level.
        classifier = LGBMClassifier(**param, scale_pos_weight=scale_pos_weight, verbose=-1)
        classifier.fit(
            x_train_,
            y_train_,
            eval_set=[(x_val, y_val)],
            callbacks=[log_evaluation(period=0)],
        )

        y_pred_proba = classifier.predict_proba(x_val)[:, 1]
        log_loss_scores.append(ScoreMetric(y_val, y_pred_proba))

    return np.mean(log_loss_scores)


if __name__ == "__main__":
    # Seeded TPE search that minimises the value returned by objective().
    # NOTE(review): objective() never calls trial.report(), so the median pruner
    # configured here has nothing to act on — confirm whether pruning is wanted.
    sampler = TPESampler(seed=42)
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
    study.optimize(objective, n_trials=500)

    print(f"Number of finished trials: {len(study.trials)}")

    # `trial` stays bound to the best trial; later cells read trial.params.
    trial = study.best_trial
    print("Best trial:")
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

In [None]:
def lgb_metric(y_true, y_pred):
    """Custom LightGBM eval metric wrapping ``ScoreMetric``.

    Args:
        y_true: target labels supplied by LightGBM.
        y_pred: predicted values supplied by LightGBM.

    Returns:
        ``(metric_name, value, is_higher_better)``; the flag is False because a
        smaller loss is better.
    """
    # LightGBM passes labels to custom metrics as a floating-point array, while
    # ScoreMetric counts and indexes classes with them, which needs integers.
    labels = np.asarray(y_true).astype(int)
    loss = ScoreMetric(labels, y_pred)
    return 'lgb_metric', loss, False

In [None]:
# Refit the best configuration on the hold-out split.
# trial.params only holds the *searched* values, so the settings objective() keeps
# fixed (random_state, scale_pos_weight) are added back to match what was tuned.
model = LGBMClassifier(**trial.params, random_state=42, scale_pos_weight=scale_pos_weight)

# trial.params contains early_stopping_rounds, and LightGBM refuses to early-stop
# without an evaluation set, so one must be supplied.
# NOTE(review): x_test then drives early stopping AND is scored below, which makes
# the reported numbers optimistic — consider carving a separate validation split.
model.fit(
    x_train,
    y_train['Class'],
    eval_set=[(x_test, y_test['Class'])],
    # Labels reach custom metrics as floats; cast so the class-count indexing
    # inside ScoreMetric works.
    eval_metric=lambda labels, preds: lgb_metric(np.asarray(labels).astype(int), preds),
)
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:, 1]

# Display both metrics; previously the balanced log loss was computed and discarded
# because only the last expression of a cell is shown.
{
    'balanced_log_loss': ScoreMetric(y_test['Class'], y_pred_proba),
    'accuracy': accuracy_score(y_test['Class'], y_pred),
}

In [None]:
# Objective value per trial over the course of the study.
plot_optimization_history(study=study)

In [None]:
# Objective value against each searched hyper-parameter.
plot_slice(study=study)

In [None]:
# Estimated importance of each hyper-parameter for the objective.
plot_param_importances(study=study)

In [None]:
# Hyper-parameters of the best trial.
dict(trial.params)

{'early_stopping_rounds': 116,
 'n_estimators': 8594,
 'learning_rate': 0.2292825799916429,
 'num_leaves': 190,
 'max_depth': 3,
 'reg_alpha': 0.3086813444028655,
 'reg_lambda': 0.08439961817618014,
 'bagging_fraction': 1.0,
 'bagging_freq': 1,
 'feature_fraction': 1.0,
 'min_child_samples': 75,
 'colsample_bytree': 0.2572293361418775,
 'lambda_l1': 1.246275770846192e-06,
 'lambda_l2': 0.011660417895786973,
 'subsample': 0.7301110313724658}

 