In [16]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import time
from pathlib import Path

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from skopt.callbacks import DeltaYStopper

from preTrain import calWeight,MakeParams,getPreprocess
from trainer import trainer

#### Import data

In [17]:
# Locations of the raw training and hold-out CSV files.
TRAINFILE="Data/train.csv"
TESTFILE="Data/test.csv"

# Columns used for modelling: the label ('outcome') first, then the predictors.
keep_cols=['outcome','basegrd','baseucva','familyHistory','basesef','wearGlass']
# Subset of the predictors that is treated as categorical (passed to CatBoost).
cat_cols=['basegrd','familyHistory','basesef','wearGlass']

# Training frame: restrict to the modelling columns and drop every row that has
# at least one missing value.
trainDf=pd.read_csv(TRAINFILE)[keep_cols].dropna(axis=0,how='any')
# Hold-out frame: same column selection; rows with missing values are kept.
validDf=pd.read_csv(TESTFILE)[keep_cols]

#### Data split

In [18]:
# Positional split: column 0 is the label, every remaining column is a predictor.
# This relies on the label being the first column of trainDf.
ytrain=trainDf.iloc[:,0]
Xtrain=trainDf.iloc[:,1:]

#### Preprocess

In [12]:
# Build the preprocessing step from the project helper in preTrain. The returned
# object is used below as the first step of the ANN, Logistic and GB pipelines.
# NOTE(review): presumably a scikit-learn transformer — confirm in preTrain.
preprocessor=getPreprocess()

#### Fit model

In [13]:
# Candidate classifiers, keyed by the short names the trainer reports.
#
# Every learner that draws random numbers gets an explicit seed so that a fresh
# "Restart & Run All" reproduces the same fits; without it only the CV splitter
# and the optimizer were seeded and the scores could drift between runs.
modelSeed=1234

# Class-imbalance weight for the boosted-tree libraries, computed once and shared.
# Assumes calWeight is a pure function of its arguments — TODO confirm in preTrain.
posWeight=calWeight(Xtrain,ytrain)

models={}
models['cat']=CatBoostClassifier(eval_metric="BalancedAccuracy",cat_features=cat_cols,random_seed=modelSeed)
# NOTE(review): the three pipelines below share one `preprocessor` instance.
# That is safe as long as every fit goes through a search that clones the
# estimator; confirm how the trainer fits models that skip the search.
models['ANN']=Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('ANN',MLPClassifier(random_state=modelSeed))
        ]
    )
models['Logistic']=Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('Logistic',LogisticRegression(max_iter=1000,solver='lbfgs',class_weight="balanced"))]
    )
models['rf']=RandomForestClassifier(class_weight="balanced_subsample",random_state=modelSeed)
models['xgb']=XGBClassifier(scale_pos_weight=posWeight,random_state=modelSeed)
models['gbm']=LGBMClassifier(scale_pos_weight=posWeight,random_state=modelSeed)
models['ada']=AdaBoostClassifier(random_state=modelSeed)
# The step name 'GaussionNB' is kept exactly as-is: step names are part of the
# parameter keys (e.g. '<step>__<param>'), so renaming it could break callers.
models['GB']=Pipeline(   steps=[
        ('preprocessor', preprocessor),
        ('GaussionNB',GaussianNB())]
)

In [14]:
# Hyper-parameter search spaces come from the project helper in preTrain.
params=MakeParams()
# Shuffled, stratified 10-fold splitter with a fixed seed so the folds are repeatable.
cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=1234)

In [15]:
# Settings handed to the search as `optimizer_kwargs` (forwarded to skopt's Optimizer).
# NOTE(review): BayesSearchCV appears to replace the 'random_state' entry with its
# own `random_state` argument, so the value here may have no effect — confirm
# against the scikit-optimize version in use.
optimizer_kwargs=dict(
    base_estimator="GBRT",             # surrogate model
    n_initial_points=20,               # points drawn before the surrogate is used
    initial_point_generator="random",
    acq_func="LCB",                    # lower-confidence-bound acquisition
    acq_optimizer="auto",
    n_jobs=-1,
    random_state=0,
    acq_func_kwargs={"kappa":1.96},    # exploration weight for LCB
)

In [16]:
# Wrap the candidate models in the project trainer and print which models it holds.
mytrainer=trainer(models)
print(mytrainer)
# Mark 'GB' so that it is excluded from the hyper-parameter search (the recorded
# output confirms this: "GB" is reported as set to skip the search).
mytrainer.setTrained({'GB':False})

Trainer中包含的模型是:['cat', 'ANN', 'Logistic', 'rf', 'xgb', 'gbm', 'ada', 'GB']
GB已经设置为不进行超参数搜索!


In [17]:
# Configure a Bayesian hyper-parameter search for every model and run it.
#
# `random_state=0` is passed to the search itself: BayesSearchCV overwrites
# optimizer_kwargs['random_state'] with its own `random_state` (default None),
# so seeding only the optimizer kwargs leaves the search non-reproducible.
# Assumes trainer.setup forwards extra keyword arguments to the search class,
# as it does for n_iter / optimizer_kwargs — TODO confirm in trainer.
mytrainer.setup(BayesSearchCV,params,cv=cv,scoring="balanced_accuracy",verbose=6,n_jobs=-1,n_iter=50,
                optimizer_kwargs=optimizer_kwargs,random_state=0)
# DeltaYStopper(0.01,10) ends a search early once the 10 best scores lie within 0.01.
# NOTE(review): the recorded log shows 16 rounds for 'cat', fewer than the 20
# initial random points, so the surrogate model may never have been used — verify
# that this early-stopping threshold is intended.
mytrainer.fit(Xtrain,ytrain,callback=[DeltaYStopper(0.01,10)])

共有7个模型,开始训练1个,模型是cat...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [19]:
# Persist the fitted trainer under a name stamped with the current Unix time,
# so successive runs never overwrite each other.
checkpointName=f'Model_{int(time.time())}.pkl'
mytrainer.save_model(file=checkpointName)

In [20]:
# Show the best cross-validated score found for each searched model.
# NOTE(review): in the recorded output 'ANN' (~0.5001) and 'ada' (0.5) sit at what
# looks like chance level for balanced accuracy, unlike the other models (~0.83);
# worth checking class weighting / preprocessing for those two.
mytrainer.get_best_score()

{'cat': 0.832512133751073,
 'ANN': 0.5000914496334639,
 'Logistic': 0.8327342302028704,
 'rf': 0.8332085382157354,
 'xgb': 0.8287023226406859,
 'gbm': 0.8331775682795708,
 'ada': 0.5}

#### Save data

In [19]:
# Persist the exact frames used for modelling so later notebooks can reload them.
pickleTargets=(
    (Xtrain,"./Data/XtrainForModel.pkl"),
    (ytrain,"./Data/ytrainForModel.pkl"),
    (validDf,"./Data/validForModel.pkl"),
)
for frame,target in pickleTargets:
    frame.to_pickle(target)

#### Export the model

In [21]:
# Reload the trainer checkpoint and pull out the tuned CatBoost model.
#
# Checkpoints are written as Model_<unix time>.pkl, so a fixed file name goes
# stale on every new run. Pick the most recently modified checkpoint instead and
# fail with a clear message when none exists.
# NOTE(review): load_model presumably unpickles the file — only load checkpoints
# produced by this project.
savedModels=sorted(Path(".").glob("Model_*.pkl"),key=lambda p:p.stat().st_mtime)
if not savedModels:
    raise FileNotFoundError("No Model_*.pkl checkpoint found in the working directory; save the trainer first.")
mytrainer=trainer.load_model(f"./{savedModels[-1].name}")
catboost=mytrainer.get_best_estimator()['cat']

In [22]:
# Export the tuned CatBoost model to the shinyapp directory.
# NOTE(review): no `format` is given, so the library's default model format is
# used — confirm that this is what the app loads.
catboost.save_model("./shinyapp/catboost")