In [1]:
import os
import shutil
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

In [2]:
# Load the three competition tables: training features (with Class), test
# features, and the auxiliary "greeks" metadata table.
train, test, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        './icr-identify-age-related-conditions/train.csv',
        './icr-identify-age-related-conditions/test.csv',
        './icr-identify-age-related-conditions/greeks.csv',
    )
)

In [3]:
# Keep a snapshot of the training table as loaded, before any greek columns
# are attached to it.
train_init = train.copy()

# Copy the four greek label columns from the metadata table onto the training
# frame (pandas aligns the assigned Series on the row index).
for greek in ('Alpha', 'Beta', 'Gamma', 'Delta'):
    train[greek] = meta[greek]

In [4]:
# Greeks preprocessing.
# Alpha becomes binary: 0 when the value is 'A', 1 for anything else.
train['Alpha'] = train['Alpha'].ne('A').astype('int64')

In [5]:
# Encode the only categorical feature, EJ ('A'/'B'), as 0.0/1.0.
# (The original author notes it may indicate the patient's sex - unconfirmed.)
ej_codes = {'A': 0, 'B': 1}
train['EJ'] = train['EJ'].replace(ej_codes).astype(float)
test['EJ'] = test['EJ'].replace(ej_codes).astype(float)
train_init['EJ'] = train_init['EJ'].replace(ej_codes).astype(float)

# Missing-value handling: BQ is filled with 0, every other numeric column with
# its median.
# - Assign the filled column back instead of calling fillna(inplace=True) on
#   the column selection: the chained in-place form is a silent no-op under
#   pandas copy-on-write.
# - numeric_only=True is required because the frames still contain string
#   columns here (Id, and the not-yet-encoded greeks in `train`);
#   DataFrame.median() raises TypeError on those with pandas >= 2.0.
train['BQ'] = train['BQ'].fillna(0)
train_medians = train.median(numeric_only=True)
train = train.fillna(train_medians)

# Impute the test set with the same rules and the statistics learned from the
# training set (matching how the scaler is fitted on train only), rather than
# with medians computed on the test rows themselves.
test['BQ'] = test['BQ'].fillna(0)
test = test.fillna(train_medians)

# train_init keeps its own plain median fill (no BQ special case), as before.
train_init = train_init.fillna(train_init.median(numeric_only=True))

# Remove the Id column from the feature frames, keeping the ids for later use.
train_id = train['Id'].copy()
test_id = test['Id'].copy()
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [6]:
# Ordinal / binary encodings for the remaining greek label columns.
greek_codes = {
    # Gamma: M and N -> 0; G, H, E, F, A, B -> 1
    'Gamma': {**dict.fromkeys(['M', 'N'], 0),
              **dict.fromkeys(['G', 'H', 'E', 'F', 'A', 'B'], 1)},
    # Beta: C -> 0, B -> 1, A -> 2
    'Beta': {'C': 0, 'B': 1, 'A': 2},
    # Delta: B -> 0, A and C -> 1, D -> 2
    'Delta': {'B': 0, 'A': 1, 'C': 1, 'D': 2},
}
for greek_col, codes in greek_codes.items():
    train[greek_col] = train[greek_col].replace(codes)

train.head()


Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,GE,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,72.611063,2003.810319,22.136229,69.834944,0.120343,1,1,0,1,2
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,72.611063,27981.56275,29.13543,32.131996,21.978,0,0,0,0,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,88.609437,13676.95781,28.022851,35.192676,0.196941,0,0,0,0,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,82.416803,2094.262452,39.948656,90.493248,0.155829,0,0,0,0,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,146.109943,8524.370502,45.381316,36.262628,0.096614,1,1,1,1,0


In [7]:
# Standardise the continuous features. The scaler is fitted on the training
# frame only and then applied unchanged to the test frame.
from sklearn.preprocessing import StandardScaler

# EJ (binary), the greek labels and the Class target are left unscaled.
unscaled_columns = ('EJ', 'Alpha', 'Beta', 'Gamma', 'Delta', 'Class')
numeric_columns = [col for col in train.columns if col not in unscaled_columns]

scaler = StandardScaler()
scaler.fit(train[numeric_columns])
train[numeric_columns] = scaler.transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])
train.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,GE,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta
0,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,-0.405383,...,-0.41026,-0.655511,-0.948991,0.531241,-0.814049,1,1,0,1,2
1,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,0.048541,...,-0.41026,0.687893,-0.238862,-0.509218,1.30486,0,0,0,0,0
2,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,-0.071089,...,-0.29921,-0.05185,-0.351743,-0.424754,-0.806623,0,0,0,0,0
3,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,-0.391109,...,-0.342195,-0.650833,0.858232,1.101332,-0.810609,0,0,0,0,0
4,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,0.125327,...,0.09992,-0.318309,1.409422,-0.395228,-0.816349,1,1,1,1,0


In [8]:
# Feature pruning: discard highly correlated features.
# Only `drop_cols` is applied in this cell; the other two lists are candidate
# sets that are defined but not used here.
drop_cols = ['BZ', 'CL', 'EH', 'GL']  # highly correlated
drop_cols2 = ['DY', 'CB', 'GB', 'CH', 'DL', 'CU', 'FS', 'AZ', 'GE', 'EG', 'EP']  # uninformative features: extremes attained at the same point
drop_cols3 = ['BZ', 'DV', 'EH', 'FD ']

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [9]:
# Split the discrete label columns (Class plus the four encoded greeks) off
# into `ytrain`, leaving `train` with feature columns only.
label_columns = ['Class', 'Alpha', 'Beta', 'Gamma', 'Delta']
ytrain = train[label_columns]
train = train.drop(columns=label_columns)

In [10]:
# Preview the feature frame after the Class and greek label columns were split off.
train.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI
0,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,-0.405383,...,-0.112922,-2.226608,0.162821,-0.035806,-0.250437,-0.940094,-0.41026,-0.655511,-0.948991,0.531241
1,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,0.048541,...,-0.311056,0.084542,-0.457925,-0.060566,0.11364,-1.14507,-0.41026,0.687893,-0.238862,-0.509218
2,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,-0.071089,...,-0.173811,0.51706,0.198663,-0.051023,0.597343,1.637944,-0.29921,-0.05185,-0.351743,-0.424754
3,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,-0.391109,...,0.058201,1.61716,0.060364,-0.060566,-0.104806,-0.219883,-0.342195,-0.650833,0.858232,1.101332
4,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,0.125327,...,0.520331,1.21285,0.237302,0.896815,-0.229632,-0.432313,0.09992,-0.318309,1.409422,-0.395228


# 模型搭建
先直接对Class预测

In [11]:
# Hold-out split: 80% of the rows for fitting, 20% for evaluation.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

x_train, x_test, y_train, y_test = train_test_split(train, ytrain, test_size=0.2, random_state=42)

# Ratio of negative to positive samples, derived from the labels instead of
# being hard-coded. The notebook previously used the literal 4.712962962962963
# here - presumably this same ratio computed by hand; verify on the data.
scale_pos_weight = (ytrain['Class'] == 0).sum() / (ytrain['Class'] == 1).sum()

print(x_train.shape)
print(y_train.shape)

(493, 52)
(493, 5)


In [12]:
def ScoreMetric(ytrue, ypred):
    """Class-balanced log loss.

    Every sample is weighted by the inverse of its class frequency in
    ``ytrue``, so each class contributes equally to the weighted mean.

    Implemented with NumPy only: the previous version passed ``eps=1e-15`` to
    ``sklearn.metrics.log_loss``, an argument that newer scikit-learn releases
    no longer accept. The same 1e-15 clipping is applied here explicitly.

    Parameters
    ----------
    ytrue : array-like of int, shape (n,)
        True class labels, encoded as 0..K-1 (0/1 in this notebook).
    ypred : array-like of float, shape (n,) or (n, K)
        A 1-D input is taken as the probability of class 1; a 2-D input holds
        one probability column per class.

    Returns
    -------
    float
        The weighted mean negative log-likelihood of the true labels.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    eps = 1e-15
    labels = np.asarray(ytrue).astype(int).ravel()
    # Clip first, then build the complementary column, then renormalise rows.
    probs = np.clip(np.asarray(ypred, dtype=float), eps, 1 - eps)
    if probs.ndim == 1:
        probs = np.column_stack([1 - probs, probs])
    if labels.size == 0 or labels.size != probs.shape[0]:
        raise ValueError("ytrue and ypred must be non-empty and of equal length")
    probs = probs / probs.sum(axis=1, keepdims=True)

    # Per-class sample counts; indexing by label gives each sample's class size.
    nc = np.bincount(labels, minlength=probs.shape[1])
    weights = 1.0 / nc[labels]

    # Probability assigned to the true class of each sample.
    true_class_prob = probs[np.arange(labels.size), labels]
    return float(np.average(-np.log(true_class_prob), weights=weights))

In [22]:
# Baseline 1: random forest trained on the Class label only.
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=25, random_state=27, max_depth=11)
random_forest.fit(x_train, y_train['Class'])
random_forest_pred = random_forest.predict(x_test)
random_forest_pred_proba = random_forest.predict_proba(x_test)
# accuracy_score expects (y_true, y_pred)
random_forest_accuracy = accuracy_score(y_test['Class'], random_forest_pred)

print(random_forest_accuracy)

# Probability of class 1 and its balanced log loss as predicted.
p0 = random_forest_pred_proba[:, 1]
print(ScoreMetric(y_test['Class'], p0))

# Scale the odds of class 1 by 4.71 (presumably the rounded negative/positive
# ratio stored in scale_pos_weight) and score again.
# A forest can output a probability of exactly 1.0, which would make the odds
# infinite and the rescaled probability NaN; cap just below 1 first.
p0 = np.minimum(p0, 1 - 1e-15)
odds = 4.71 * p0 / (1 - p0)
p0 = odds / (1 + odds)
ScoreMetric(y_test['Class'], p0)

0.9596774193548387
0.41775254422258257


0.3718849346408435

In [25]:
# Baseline 2: gradient boosting with default hyper-parameters, Class label only.
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train['Class'])
gbc_pred = gbc.predict(x_test)
gbc_pred_proba = gbc.predict_proba(x_test)
# accuracy_score expects (y_true, y_pred)
gbc_accuracy = accuracy_score(y_test['Class'], gbc_pred)

print(gbc_accuracy)

# Probability of class 1 and its balanced log loss as predicted.
p0 = gbc_pred_proba[:, 1]
print(ScoreMetric(y_test['Class'], p0))

# Scale the odds of class 1 by 4.71 (presumably the rounded negative/positive
# ratio stored in scale_pos_weight) and score again.
# Cap probabilities just below 1 so a saturated prediction cannot produce
# infinite odds and a NaN rescaled probability.
p0 = np.minimum(p0, 1 - 1e-15)
odds = 4.71 * p0 / (1 - p0)
p0 = odds / (1 + odds)
ScoreMetric(y_test['Class'], p0)

0.9516129032258065
0.32231418952629415


0.19346205691378707

In [29]:
# Baseline 3: histogram-based gradient boosting, Class label only.
from sklearn.ensemble import HistGradientBoostingClassifier


# Candidate hyper-parameters. They are NOT passed to the model below, which is
# fitted with library defaults; the trailing comments look like the author's
# recorded accuracy / score range with and without them (unconfirmed).
hist_params = {
            'l2_regularization': 0.01,
            'early_stopping': True,
            'learning_rate': 0.01,
            'max_iter': 4000,
            'max_depth': 4,
            'max_bins': 255,
            'min_samples_leaf': 10,
            'max_leaf_nodes':10,
            'random_state': 42
        } # 0.95   0.20-0.35

# without 0.93   0.14-0.31
hgbc = HistGradientBoostingClassifier()
hgbc.fit(x_train,y_train['Class'])
hgbc_pred = hgbc.predict(x_test)
hgbc_pred_proba = hgbc.predict_proba(x_test)
# accuracy_score expects (y_true, y_pred)
hgbc_accuracy = accuracy_score(y_test['Class'], hgbc_pred)

print(hgbc_accuracy)

# Probability of class 1 and its balanced log loss as predicted.
p0 = hgbc_pred_proba[:,1]
print(ScoreMetric(y_test['Class'], p0))

# Scale the odds of class 1 by 4.71 (presumably the rounded negative/positive
# ratio stored in scale_pos_weight) and score again.
# Cap probabilities just below 1 so a saturated prediction cannot produce
# infinite odds and a NaN rescaled probability.
p0 = np.minimum(p0, 1 - 1e-15)
odds = 4.71 * p0 / (1 - p0)
p0 = odds / (1 + odds)
ScoreMetric(y_test['Class'], p0)


0.9354838709677419
0.31444099718399027


0.14628695148245402

# 正式的KFold交叉验证 + Optuna模型调参

In [15]:
import optuna, tune

from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, log_loss
from sklearn.metrics import make_scorer, accuracy_score, log_loss
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

# fold
from sklearn.model_selection import KFold, StratifiedKFold


In [17]:
def objective(trial):
    """Optuna objective: cross-validated balanced log loss of a CatBoost model.

    Samples one CatBoost configuration from ``trial``, fits it on each of 10
    stratified folds of the module-level ``x_train`` / ``y_train['Class']``
    and returns the mean ``ScoreMetric`` over the validation folds (lower is
    better).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean balanced log loss across the 10 validation folds.
    """
    # The suggest_* calls are evaluated in the order written below.
    cb_params = dict(
        objective=trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        depth=trial.suggest_int("depth", 1, 12),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        used_ram_limit="3gb",
        eval_metric="Accuracy",
        random_state=42,
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 8000),
        early_stopping_rounds=trial.suggest_int("early_stopping_rounds", 100, 400),
    )

    # Bootstrap-specific knobs are only sampled for the modes that use them.
    bootstrap_mode = cb_params["bootstrap_type"]
    if bootstrap_mode == "Bayesian":
        cb_params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap_mode == "Bernoulli":
        cb_params["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    class_labels = y_train['Class']
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    def fold_score(fit_idx, val_idx):
        # Fit on one fold's training part and score its validation part.
        # NOTE(review): the validation part is also the eval_set that drives
        # early stopping, so it is not a fully independent score.
        x_fit, x_val = x_train.iloc[fit_idx], x_train.iloc[val_idx]
        y_fit, y_val = class_labels.iloc[fit_idx], class_labels.iloc[val_idx]
        booster = CatBoostClassifier(**cb_params, verbose=False)
        booster.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], verbose=False)
        return ScoreMetric(y_val, booster.predict_proba(x_val)[:, 1])

    return np.mean([
        fold_score(fit_idx, val_idx)
        for fit_idx, val_idx in splitter.split(x_train, class_labels)
    ])

if __name__ == "__main__":
    # Seeded TPE sampler plus a median pruner; the study minimises the value
    # returned by `objective` over 500 trials.
    sampler = TPESampler(seed=42)
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
    study.optimize(objective, n_trials=500)

    # Report how many trials ran and the best one found.
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for param_name, param_value in trial.params.items():
        print(f"    {param_name}: {param_value}")

[I 2023-08-03 00:04:57,120] A new study created in memory with name: no-name-dc8804f5-5695-41d5-bb68-1d8b4bc0cf22
[W 2023-08-03 00:04:59,381] Trial 0 failed with parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.05395030966670229, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.11114989443094977, 'n_estimators': 262, 'early_stopping_rounds': 391, 'subsample': 0.6798962421591127} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\111\AppData\Local\Programs\Python\Python38\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\111\AppData\Local\Temp\ipykernel_21436\3754446640.py", line 31, in objective
    classifier.fit(x_train_, y_train_, eval_set=[(x_val, y_val)], verbose=False)
  File "c:\Users\111\AppData\Local\Programs\Python\Python38\lib\site-packages\catboost\core.py", line 5131, in fit
    self._fit(X, 

KeyboardInterrupt: 

In [None]:
# Refit CatBoost on the whole training split with the hyper-parameters sampled
# by the best trial. `trial.params` holds only the sampled values, so the fixed
# settings used inside `objective` (random_state, eval_metric, used_ram_limit)
# are not applied here.
# verbose=False keeps the per-iteration training log out of the notebook.
model = CatBoostClassifier(**trial.params, verbose=False)
model.fit(x_train, y_train['Class'])
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:, 1]

# Print the balanced log loss explicitly: a bare expression that is not the
# last line of a cell is computed but never displayed.
print(ScoreMetric(y_test['Class'], y_pred_proba))
# accuracy_score expects (y_true, y_pred)
accuracy_score(y_test['Class'], y_pred)

0.9516129032258065

In [None]:
# Plot the optimization history of the study.
# NOTE: displaying the figure failed in the recorded run because rendering
# requires nbformat>=4.2.0 to be installed in the kernel environment.
plot_optimization_history(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Slice plot of the study's sampled hyper-parameters.
# NOTE: displaying the figure failed in the recorded run because rendering
# requires nbformat>=4.2.0 to be installed in the kernel environment.
plot_slice(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Plot the hyper-parameter importances estimated from the study.
# NOTE: displaying the figure failed in the recorded run because rendering
# requires nbformat>=4.2.0 to be installed in the kernel environment.
plot_param_importances(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

1.
Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.08326847511080138
    depth: 3
    boosting_type: Plain
    bootstrap_type: Bernoulli
    learning_rate: 0.20800532359943255
    n_estimators: 1389
    early_stopping_rounds: 325
    subsample: 0.13113402291704018