## Kaggle競賽針對保單是否進行索賠的預測 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score,roc_curve,auc
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder

# Load the competition's training and test tables from the Kaggle input directory.
df_train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
# Show the first 10 training rows (notebook cell output).
df_train.head(10)

In [None]:
# Descriptive statistics of the training table (notebook cell output).
df_train.describe()

## 切分特徵與目標

In [None]:
# Target column first, then the feature matrix without the row id and the target.
y = df_train['claim']
x = df_train.drop(columns=['id', 'claim'])

# The test table has no target; only its row id is removed (in place).
df_test.drop(columns=['id'], inplace=True)

## 新增特徵工程

In [None]:
from scipy import stats

# Row-wise summary features.
#
# Every statistic is computed over the same fixed set of base feature columns.
# Previously each statistic was taken over `frame[frame.columns]`, which is
# re-evaluated after every assignment, so 'std' included 'missing_count',
# 'min' included both, and 'sem' included all three.
#
# The engineered names are excluded explicitly so that re-running this cell
# does not feed already-added columns back into the statistics.
engineered_cols = ['missing_count', 'std', 'min', 'sem']
base_cols = [col for col in x.columns if col not in engineered_cols]

for frame in (x, df_test):
    # Snapshot of the base features; unaffected by the columns added below.
    raw = frame[base_cols]
    frame['missing_count'] = raw.isnull().sum(axis=1)  # number of NaNs per row
    frame['std'] = raw.std(axis=1)                      # row standard deviation
    frame['min'] = raw.min(axis=1)                      # row minimum
    frame['sem'] = raw.sem(axis=1)                      # row standard error of the mean

# Row-wise mean, median and skew were also tried and are left disabled.

In [None]:
# Display the feature frame (notebook cell output).
x

## 針對偏斜資料進行處理

In [None]:
# Pick the training columns whose skewness exceeds 1 in absolute value and
# fill their missing entries with the column median.
skewness = x.skew()
skew_fea = skewness.index[skewness.abs() > 1].tolist()

for col in skew_fea:
    x[col] = x[col].fillna(x[col].median())
    

    

In [None]:
# Give the test frame the same treatment the training frame received.
#
# Previously the skewed-column list and the fill values were re-derived from
# the test set itself, so a column could be median-filled in train but not in
# test (or the reverse), and with different fill values. Here the column list
# selected from the training data (`skew_fea`, built in the training cell) and
# the training medians are reused.
#
# The training columns were already filled with their own median; adding
# copies of the median does not move the median, so `x[feat].median()` still
# equals the value used for the training fill.
for feat in skew_fea:
    median = x[feat].median()
    df_test[feat] = df_test[feat].fillna(median)
    



## 填補缺失數值

In [None]:
# Mean-impute the remaining gaps. The imputer is fitted on the training
# features and then applied unchanged to the test features; the results are
# wrapped back into frames carrying the original column labels.
im = SimpleImputer(strategy='mean')
im_x = pd.DataFrame(im.fit_transform(x), columns=x.columns)
im_test = pd.DataFrame(im.transform(df_test), columns=df_test.columns)

## 進行資料縮放

In [None]:
from sklearn.preprocessing import RobustScaler

# Standardise both matrices with statistics learned from the training data only.
ss = StandardScaler().fit(im_x)
im_x, im_test = ss.transform(im_x), ss.transform(im_test)

## 模型參數設定

In [None]:
# Keyword arguments unpacked into each CatBoostClassifier of the ensemble.
best_params = {
    # boosting schedule / loss
    "iterations": 18000,
    "objective": "CrossEntropy",
    "bootstrap_type": "Bernoulli",
    # "od_wait": 1144,  (disabled)
    "learning_rate": 0.023575206684596582,
    # regularisation and tree shape
    "reg_lambda": 36.30433203563295,
    "random_strength": 43.75597655616195,
    "depth": 5,
    "min_data_in_leaf": 11,
    "leaf_estimation_iterations": 1,
    "subsample": 0.8227911142845009,
    # hardware selection
    "task_type": "GPU",
    "devices": "0",
    # "verbose": 0,  (disabled)
}

In [None]:
# Keyword arguments unpacked into each XGBClassifier of the ensemble.
# The seed is deliberately not set here; it is passed per model instead.
xgb_params = {
    # tree shape and boosting schedule
    "max_depth": 3,
    "learning_rate": 0.021537077920105466,
    "n_estimators": 15000,
    "min_child_weight": 150,
    # regularisation
    "gamma": 0.11611920725914951,
    "alpha": 0.0021839958087869794,
    "lambda": 0.0018567979557499344,
    # column / row sampling
    "colsample_bytree": 0.7139742731494992,
    "subsample": 0.6258627743440968,
    # backend and evaluation
    "tree_method": "gpu_hist",
    "booster": "gbtree",
    # "random_state": 228,  (disabled)
    "use_label_encoder": False,
    "eval_metric": "auc",
}

## XGBoost 與 CatBoost 模型進行投票法(soft voting)模型融合(使用 KFold 切分資料)


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Soft-voting ensemble of XGBoost and CatBoost models, evaluated with K-fold
# cross-validation. For every fold the ensemble is fitted on the training
# part, scored (ROC AUC) on the held-out part, and used to predict the test
# set; the test predictions are averaged over the folds.
n_split = 5

pred_test_lc = 0   # not read anywhere in this cell; kept as originally defined
pred_vote = 0      # running sum of test-set positive-class probabilities
fold_scores = []   # validation AUC of every fold

# Ensemble members: models of one family share hyper-parameters and differ
# only in their random seed.
xgb_0 = XGBClassifier(**xgb_params, random_state=17)
xgb_1 = XGBClassifier(**xgb_params, random_state=19)
xgb_2 = XGBClassifier(**xgb_params, random_state=75)
xgb_3 = XGBClassifier(**xgb_params, random_state=124)
xgb_4 = XGBClassifier(**xgb_params, random_state=39)
xgb_5 = XGBClassifier(**xgb_params, random_state=55)
cbc_6 = CatBoostClassifier(**best_params, random_state=67)
cbc_7 = CatBoostClassifier(**best_params, random_state=41)
cbc_8 = CatBoostClassifier(**best_params, random_state=85)
cbc_9 = CatBoostClassifier(**best_params, random_state=97)

estimators = [
    ('xgb_0', xgb_0), ('xgb_1', xgb_1), ('xgb_2', xgb_2),
    ('xgb_3', xgb_3), ('xgb_4', xgb_4), ('xgb_5', xgb_5),
    ('cbc_6', cbc_6), ('cbc_7', cbc_7), ('cbc_8', cbc_8), ('cbc_9', cbc_9),
]

kfold = KFold(n_splits=n_split, shuffle=True, random_state=1)

for fold, (i, j) in enumerate(kfold.split(im_x, y)):
    # KFold yields positional indices: im_x is indexed directly, while the
    # label Series is indexed with .iloc so the lookup is positional too and
    # does not depend on the Series' index labels.
    x_train, y_train = im_x[i], y.iloc[i]
    x_valid, y_valid = im_x[j], y.iloc[j]

    model = VotingClassifier(estimators=estimators, voting='soft', verbose=True)
    model.fit(x_train, y_train)

    pred = model.predict_proba(x_valid)
    scores_lc = roc_auc_score(y_valid, pred[:, 1])
    fold_scores.append(scores_lc)
    print(f"Fold_lc_{fold}:{scores_lc}")

    # Accumulate this fold's test-set probabilities for the positive class.
    pred_vote = pred_vote + model.predict_proba(im_test)[:, 1]

# Average of the per-fold test predictions.
pred_vote = pred_vote / n_split

# Mean over ALL folds (previously only the last fold's score was reported,
# because the scalar `scores_lc` is overwritten on every iteration).
print(f'Avg xgb_voting score:{np.mean(fold_scores)}')
    

In [None]:
# Display the accumulated test-set predictions (notebook cell output).
pred_vote

In [None]:
# Load the competition's sample submission file.
solution = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

In [None]:
# Submission table: ids taken from the sample submission, predicted
# probabilities stored under 'claim'. The last line shows it as cell output.
result = pd.DataFrame({'id': solution['id'], 'claim': pred_vote})
result

In [None]:
#result.to_csv('submit_25.csv',index=False)