In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression            
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier as xgb
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [4]:
# 분류 score 계산
def score(pred, y_test):
    print(' accuracy  : ', accuracy_score(y_test, pred))
    print(' f1-score  : ', f1_score(y_test, pred))
    print(' recall    : ', recall_score(y_test, pred))
    print(' precision : ', precision_score(y_test, pred))
    return accuracy_score(y_test, pred), f1_score(y_test, pred), recall_score(y_test, pred), precision_score(y_test, pred)


def logistic_reg(x_train, y_train, x_valid, y_valid):
    
    # 모델 분류 수행
    reg = LogisticRegression(random_state=0, max_iter=500)
    reg.fit(x_train, y_train)
    pred = reg.predict(x_valid)   
    
    # 분류 score 계산
    accuracy, f1, recall, precision = score(pred, y_valid)
    return accuracy, f1, recall, precision


def randomforest_clf(x_train, y_train, x_valid, y_valid):
    
    # 모델 분류 수행
    clf = RandomForestClassifier(max_depth=16, random_state=0)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_valid)
    
    # 분류 score 계산
    accuracy, f1, recall, precision = score(pred, y_valid)
    return accuracy, f1, recall, precision


def xgb_clf(x_train, y_train, x_valid, y_valid):
    # 분류 모델 생성
    model = xgb(use_label_encoder=False, 
                          eval_metric='mlogloss',
                          random_state=0)
    
    # 모델 분류 수행    
    model.fit(x_train, y_train) 
    pred = model.predict(x_valid)

    # 분류 score 계산
    accuracy, f1, recall, precision = score(pred, y_valid)
    return accuracy, f1, recall, precision

def cat_clf(x_train, y_train, x_valid, y_valid):
    
    # 분류 모델 생성    
    clf_catboost = CatBoostClassifier(iterations=100, random_state=0)
    
    # 모델 분류 수행    
    clf_catboost.fit(x_train, y_train)
    pred = clf_catboost.predict(x_valid)

    # 분류 score 계산
    accuracy, f1, recall, precision = score(pred, y_valid)
    return accuracy, f1, recall, precision

In [None]:
def data_split(df):
    feature = df.iloc[:,:-1]      # 피처 칼럼: 예측에 사용되는 데이터
    target  = df['phishing']      # 타겟 칼럼: 예측(분류) 대상 데이터
    x_train, x_valid, y_train, y_valid = train_test_split(feature, target, test_size=0.2,
                                                          shuffle=True,      
                                                          stratify=target,  
                                                          random_state=1234) 
    return x_train, x_valid, y_train, y_valid, feature, target

# data loading

In [7]:
# ver1 - label encoding
h1n1_ver1_train = pd.read_csv('./data/ver1/train_h1n1_ver1.csv')
h1n1_ver1_test = pd.read_csv('./data/ver1/test_h1n1_ver1.csv')
seasonal_ver1_train = pd.read_csv('./data/ver1/train_seasonal_ver1.csv')
seasonal_ver1_test = pd.read_csv('./data/ver1/test_seasonal_ver1.csv')

# ver2 - one-hot encoding
h1n1_ver2_train = pd.read_csv('./data/ver2/train_h1n1_ver2.csv')
h1n1_ver2_test = pd.read_csv('./data/ver2/test_h1n1_ver2.csv')
seasonal_ver2_train = pd.read_csv('./data/ver2/train_seasonal_ver2.csv')
seasonal_ver2_test = pd.read_csv('./data/ver2/test_seasonal_ver2.csv')

# ver1
## ver1 - h1n1

In [14]:
target1 = h1n1_ver1_train['h1n1_vaccine']
feature1 = h1n1_ver1_train.iloc[:,1:-1]

x_train, x_valid, y_train, y_valid = train_test_split(feature1, target1, test_size=0.2,
                                                          shuffle=True,      
                                                          stratify=target1,  
                                                          random_state=1234) 

In [15]:
print('----------------logistic regression result-------------------')
logistic_reg(x_train, y_train, x_valid, y_valid)
print('----------------randomforest result-------------------')
randomforest_clf(x_train, y_train, x_valid, y_valid)
print('----------------xgboost result-------------------')
xgb_clf(x_train, y_train, x_valid, y_valid)
print('----------------catboost result-------------------')
cat_clf(x_train, y_train, x_valid, y_valid)

----------------logistic regression result-------------------
 accuracy  :  0.8076923076923077
 f1-score  :  0.39823566477630745
 recall    :  0.29727187206020694
 precision :  0.6030534351145038
----------------randomforest result-------------------
 accuracy  :  0.8097060008054773
 f1-score  :  0.3977055449330784
 recall    :  0.29350893697083724
 precision :  0.616600790513834
----------------xgboost result-------------------
Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


 accuracy  :  0.8042690293999194
 f1-score  :  0.4355400696864112
 recall    :  0.3527751646284102
 precision :  0.5690440060698028
----------------catboost result-------------------
Learning rate set to 0.304923
0:	learn: 0.5443411	total: 155ms	remaining: 15.3s
1:	learn: 0

(0.8052758759565042, 0.411442483262325, 0.3179680150517404, 0.5827586206896552)

## ver1 - seasonal

In [30]:
target2 = seasonal_ver1_train['seasonal_vaccine']
feature2 = seasonal_ver1_train.iloc[:,1:-1]

x_train, x_valid, y_train, y_valid = train_test_split(feature2, target2, test_size=0.2,
                                                          shuffle=True,      
                                                          stratify=target2,  
                                                          random_state=1234) 

In [31]:
print('----------------logistic regression result-------------------')
logistic_reg(x_train, y_train, x_valid, y_valid)
print('----------------randomforest result-------------------')
randomforest_clf(x_train, y_train, x_valid, y_valid)
print('----------------xgboost result-------------------')
xgb_clf(x_train, y_train, x_valid, y_valid)
print('----------------catboost result-------------------')
cat_clf(x_train, y_train, x_valid, y_valid)

----------------logistic regression result-------------------
 accuracy  :  0.743108270075909
 f1-score  :  0.7197035745422843
 recall    :  0.7046521553563807
 precision :  0.7354120267260579
----------------randomforest result-------------------
 accuracy  :  0.7401118657610867
 f1-score  :  0.7158768290019656
 recall    :  0.6995305164319249
 precision :  0.733005366726297
----------------xgboost result-------------------
Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


 accuracy  :  0.7425089892129445
 f1-score  :  0.7186203885614494
 recall    :  0.7025181391378574
 precision :  0.7354781054512958
----------------catboost result-------------------
Learning rate set to 0.305956
0:	learn: 0.5976297	total: 7.83ms	remaining: 776ms
1:	learn: 0.55

(0.7483020375549341, 0.7252507631923245, 0.7097737942808365, 0.741417744092733)

# ver2
## ver2 - h1n1

In [19]:
target3 = h1n1_ver2_train['h1n1_vaccine']
feature3 = h1n1_ver2_train.drop(['h1n1_vaccine','respondent_id'], axis=1)

x_train, x_valid, y_train, y_valid = train_test_split(feature3, target3, test_size=0.2,
                                                          shuffle=True,      
                                                          stratify=target3,  
                                                          random_state=1234) 

In [25]:
print('----------------logistic regression result-------------------')
logistic_reg(x_train, y_train, x_valid, y_valid)
print('----------------randomforest result-------------------')
randomforest_clf(x_train, y_train, x_valid, y_valid)
print('----------------xgboost result-------------------')
xgb_clf(x_train, y_train, x_valid, y_valid)
print('----------------catboost result-------------------')
cat_clf(x_train, y_train, x_valid, y_valid)

----------------logistic regression result-------------------
 accuracy  :  0.8032621828433347
 f1-score  :  0.38359621451104103
 recall    :  0.28598306679209784
 precision :  0.5823754789272031
----------------randomforest result-------------------
 accuracy  :  0.8066854611357229
 f1-score  :  0.3861892583120205
 recall    :  0.28410159924741296
 precision :  0.6027944111776448
----------------xgboost result-------------------
Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


 accuracy  :  0.7994361659283126
 f1-score  :  0.42093023255813955
 recall    :  0.3405456255879586
 precision :  0.5509893455098934
----------------catboost result-------------------
Learning rate set to 0.304923
0:	learn: 0.5442133	total: 11.3ms	remaining: 1.12s
1:	learn

(0.8086991542488925,
 0.4304556354916067,
 0.3377234242709313,
 0.5933884297520661)

## ver2 - seasonal

In [32]:
target4 = seasonal_ver2_train['seasonal_vaccine']
feature4 = seasonal_ver2_train.drop(['seasonal_vaccine','respondent_id'], axis=1)

x_train, x_valid, y_train, y_valid = train_test_split(feature4, target4, test_size=0.2,
                                                          shuffle=True,      
                                                          stratify=target4,  
                                                          random_state=1234) 

In [33]:
print('----------------logistic regression result-------------------')
logistic_reg(x_train, y_train, x_valid, y_valid)
print('----------------randomforest result-------------------')
randomforest_clf(x_train, y_train, x_valid, y_valid)
print('----------------xgboost result-------------------')
xgb_clf(x_train, y_train, x_valid, y_valid)
print('----------------catboost result-------------------')
cat_clf(x_train, y_train, x_valid, y_valid)

----------------logistic regression result-------------------
 accuracy  :  0.7445065920894927
 f1-score  :  0.7210468920392583
 recall    :  0.70550576184379
 precision :  0.7372881355932204
----------------randomforest result-------------------
 accuracy  :  0.7437075509388733
 f1-score  :  0.719440192433851
 recall    :  0.7020913358941528
 precision :  0.7376681614349776
----------------xgboost result-------------------
Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


 accuracy  :  0.7439073112265282
 f1-score  :  0.7211831230969986
 recall    :  0.7076397780623133
 precision :  0.7352549889135255
----------------catboost result-------------------
Learning rate set to 0.305956
0:	learn: 0.5983290	total: 7.08ms	remaining: 701ms
1:	learn: 0.559

(0.7499001198561726, 0.730520878174774, 0.7242851045667947, 0.7368649587494572)

# Model result

- ver1
    - h1n1 : <br>xgboost best (accuracy:0.8042690293999194, f1-score:0.4355400696864112, recall:0.3527751646284102, precision:0.5690440060698028)
    - seasonal : <br>catboost best (accuracy:0.7483020375549341, f1-score:0.7252507631923245, recall:0.7097737942808365, precision:0.741417744092733)

- ver2
    - h1n1 : <br>catboost best (accuracy:0.8086991542488925, f1-score:0.4304556354916067, recall:0.3377234242709313, precision:0.5933884297520661)
    - seasonal : <br>catboost best (accuracy:0.7499001198561726, f1-score:0.730520878174774, recall:0.7242851045667947, precision:0.7368649587494572)

# Best Model Prediction

In [37]:
train_y_h1n1_1 = h1n1_ver1_train['h1n1_vaccine']
train_x_h1n1_1 = h1n1_ver1_train.iloc[:,1:-1]
test_x_h1n1_1 = h1n1_ver1_test.iloc[:,1:]

train_y_seasonal_1 = seasonal_ver1_train['seasonal_vaccine']
train_x_seasonal_1 = seasonal_ver1_train.iloc[:,1:-1]
test_x_seasonal_1 = seasonal_ver1_test.iloc[:,1:]

In [43]:
model1 = xgb(use_label_encoder=False, eval_metric='mlogloss',random_state=0)
model1.fit(train_x_h1n1_1, train_y_h1n1_1) 
pred1 = model1.predict(test_x_h1n1_1)
pred1_proba = model1.predict_proba(test_x_h1n1_1)

model2 = CatBoostClassifier(iterations=100, random_state=0)
model2.fit(train_x_seasonal_1, train_y_seasonal_1) 
pred2 = model2.predict(test_x_seasonal_1)
pred2_proba = model2.predict_proba(test_x_seasonal_1)

Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Learning rate set to 0.336548
0:	learn: 0.5949892	total: 14.8ms	remaining: 1.46s
1:	learn: 0.5520656	total: 32.9ms	remaining: 1.61s
2:	learn: 0.5317258	total: 49.4ms	remaining: 1.6s
3:	learn: 0.5206595	total: 64.8ms	remaining: 1.55s
4:	learn: 0.5130707	total: 81.6ms	remaining: 1.55s
5:	learn: 0.5084427	total: 98.3ms	remaining: 1.54s
6:	learn: 0.5047069	total: 116ms	remaining: 1.54s
7:	learn: 0.5020006	total: 137ms	remaining: 1.57s
8:	learn: 0.5002122	total: 158ms	remaining: 1.59s
9:	learn: 0.4985802	total: 191ms	remaining: 1.72s
10:	learn: 0.4970445	total: 212ms	remaining: 1.72s
11:	learn: 0.4963959	total: 228ms	remaining: 1.67s
12:	learn: 0.4953400	total: 248ms	remaining: 1.66s
13:	learn: 0.494501

In [45]:
ver1_result = pd.DataFrame({'respondent_id' : h1n1_ver1_test['respondent_id'],
                            'h1n1_vaccine' : pred1_proba[:,1],
                            'seasonal_vaccine' : pred2_proba[:,1]})

In [47]:
ver1_result.to_csv('best_ml_ver1_result.csv', index=False)

In [48]:
train_y_h1n1_2 = h1n1_ver2_train['h1n1_vaccine']
train_x_h1n1_2 = h1n1_ver2_train.drop(['h1n1_vaccine','respondent_id'], axis=1)
test_x_h1n1_2 = h1n1_ver2_test.drop('respondent_id', axis=1)

train_y_seasonal_2 = seasonal_ver2_train['seasonal_vaccine']
train_x_seasonal_2 = seasonal_ver2_train.drop(['seasonal_vaccine','respondent_id'], axis=1)
test_x_seasonal_2 = seasonal_ver2_test.drop('respondent_id', axis=1)

In [49]:
model3 = CatBoostClassifier(iterations=100, random_state=0)
model3.fit(train_x_h1n1_2, train_y_h1n1_2) 
pred3 = model3.predict(test_x_h1n1_2)
pred3_proba = model3.predict_proba(test_x_h1n1_2)

model4 = CatBoostClassifier(iterations=100, random_state=0)
model4.fit(train_x_seasonal_2, train_y_seasonal_2) 
pred4 = model4.predict(test_x_seasonal_2)
pred4_proba = model4.predict_proba(test_x_seasonal_2)

Learning rate set to 0.335409
0:	learn: 0.5348053	total: 18.5ms	remaining: 1.83s
1:	learn: 0.4715559	total: 37.4ms	remaining: 1.83s
2:	learn: 0.4465121	total: 56.7ms	remaining: 1.83s
3:	learn: 0.4337070	total: 75.3ms	remaining: 1.81s
4:	learn: 0.4262650	total: 93.4ms	remaining: 1.77s
5:	learn: 0.4227936	total: 110ms	remaining: 1.72s
6:	learn: 0.4203630	total: 126ms	remaining: 1.68s
7:	learn: 0.4186697	total: 146ms	remaining: 1.68s
8:	learn: 0.4168687	total: 166ms	remaining: 1.68s
9:	learn: 0.4149234	total: 185ms	remaining: 1.66s
10:	learn: 0.4135983	total: 205ms	remaining: 1.65s
11:	learn: 0.4123742	total: 227ms	remaining: 1.66s
12:	learn: 0.4113527	total: 255ms	remaining: 1.71s
13:	learn: 0.4102810	total: 274ms	remaining: 1.68s
14:	learn: 0.4090708	total: 296ms	remaining: 1.67s
15:	learn: 0.4077761	total: 314ms	remaining: 1.65s
16:	learn: 0.4064838	total: 333ms	remaining: 1.63s
17:	learn: 0.4058283	total: 353ms	remaining: 1.6s
18:	learn: 0.4049692	total: 371ms	remaining: 1.58s
19:	lea

68:	learn: 0.4529494	total: 1.33s	remaining: 597ms
69:	learn: 0.4522629	total: 1.35s	remaining: 578ms
70:	learn: 0.4514429	total: 1.36s	remaining: 558ms
71:	learn: 0.4505593	total: 1.38s	remaining: 538ms
72:	learn: 0.4500677	total: 1.4s	remaining: 518ms
73:	learn: 0.4493801	total: 1.42s	remaining: 499ms
74:	learn: 0.4484795	total: 1.44s	remaining: 481ms
75:	learn: 0.4478366	total: 1.46s	remaining: 461ms
76:	learn: 0.4471766	total: 1.48s	remaining: 442ms
77:	learn: 0.4463975	total: 1.5s	remaining: 423ms
78:	learn: 0.4457914	total: 1.52s	remaining: 405ms
79:	learn: 0.4451722	total: 1.54s	remaining: 386ms
80:	learn: 0.4446415	total: 1.56s	remaining: 366ms
81:	learn: 0.4439679	total: 1.58s	remaining: 347ms
82:	learn: 0.4435011	total: 1.6s	remaining: 327ms
83:	learn: 0.4428917	total: 1.62s	remaining: 308ms
84:	learn: 0.4422657	total: 1.63s	remaining: 288ms
85:	learn: 0.4417206	total: 1.65s	remaining: 269ms
86:	learn: 0.4411181	total: 1.67s	remaining: 249ms
87:	learn: 0.4405227	total: 1.69s	

In [50]:
ver2_result = pd.DataFrame({'respondent_id' : h1n1_ver2_test['respondent_id'],
                            'h1n1_vaccine' : pred3_proba[:,1],
                            'seasonal_vaccine' : pred4_proba[:,1]})

In [51]:
ver2_result.to_csv('best_ml_ver2_result.csv', index=False)