In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
train_data = pd.read_pickle('train.pkl')
train_data.head()

Unnamed: 0,Purchase_work,Use_A,Use_B,Use_C,Use_D,Total_visit_n(Sequential ID),Work(hash),Personal_i_7,Personal_i_8,Personal_i_9,...,User_ten_i_157,User_ten_i_158,User_ten_i_159,User_ten_i_160,User_ten_i_161,User_ten_i_162,User_ten_i_163,User_ten_i_164,User_ten_i_165,User_ten_i_166
0,0,0,0,0,1,1,001C9D9B,90D8AB70,1,,...,,0.5,,,,,0.5,,,
1,0,0,0,0,1,1,001C9D9B,ABC21E80,1,,...,,0.0187,0.0047,,,,0.0287,0.0055,,
2,0,0,0,0,1,1,001C9D9B,C17967D1,0,69EF2C8F,...,,,,,,,0.0186,,,
3,0,0,0,0,1,1,002B4BDE,AF145784,0,,...,,0.0207,,,,,0.2805,0.0692,,
4,0,0,0,0,1,1,002B4BDE,DC2D76A1,1,,...,,,,,,,0.1245,,,


### Preprocessing
* 유저 성향 부분 NaN : 
    * 0으로 채우기(NaN값이 구매 기록 없다는 뜻 이므로 0으로 채우기)
* Hash값, 개인정보 지우기
    * get_dummies로 처리할 경우 너무많은 feature들이 생겨버리기 때문에 좋지 않음.
    * 그래서 Hash값 지우기
* 주요 작품 구매 여부
    * 10~109까지 주요 작품 구매 여부
        * 구매한 적이 있다면 1 없으면 0

In [3]:
train_data_copy = train_data.copy()
train_data_copy = train_data_copy.drop(['Work(hash)', 
                                        'Personal_i_7',
                                        'Personal_i_8',
                                        'Personal_i_9'], axis=1)
train_data_copy.fillna(0, inplace=True)
train_data_copy.head()

Unnamed: 0,Purchase_work,Use_A,Use_B,Use_C,Use_D,Total_visit_n(Sequential ID),Major_work_purchase_10,Major_work_purchase_11,Major_work_purchase_12,Major_work_purchase_13,...,User_ten_i_157,User_ten_i_158,User_ten_i_159,User_ten_i_160,User_ten_i_161,User_ten_i_162,User_ten_i_163,User_ten_i_164,User_ten_i_165,User_ten_i_166
0,0,0,0,0,1,1,0,0,0,0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
1,0,0,0,0,1,1,0,1,0,0,...,0.0,0.0187,0.0047,0.0,0.0,0.0,0.0287,0.0055,0.0,0.0
2,0,0,0,0,1,1,0,34,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0186,0.0,0.0,0.0
3,0,0,0,0,1,1,0,0,0,0,...,0.0,0.0207,0.0,0.0,0.0,0.0,0.2805,0.0692,0.0,0.0
4,0,0,0,0,1,1,3,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1245,0.0,0.0,0.0


In [4]:
train_data_copy.iloc[:, 6:106] = \
train_data_copy.iloc[:, 6:106].applymap(lambda x : 1 if x != 0 else 0)
train_data_copy.iloc[:, 6:106].head()

Unnamed: 0,Major_work_purchase_10,Major_work_purchase_11,Major_work_purchase_12,Major_work_purchase_13,Major_work_purchase_14,Major_work_purchase_15,Major_work_purchase_16,Major_work_purchase_17,Major_work_purchase_18,Major_work_purchase_19,...,Major_work_purchase_100,Major_work_purchase_101,Major_work_purchase_102,Major_work_purchase_103,Major_work_purchase_104,Major_work_purchase_105,Major_work_purchase_106,Major_work_purchase_107,Major_work_purchase_108,Major_work_purchase_109
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exp.1 : Hash값 제외한 데이터 정보를 다 사용하고 sampling
* Model : XGBClassifier, RandomForest, AdaboostClassifier, BaggingClassifier, ExtraTreeClassifier, LogisticRegression, GradientBoostingClassifier
* Cross-validation : StratifiedKfold


In [5]:
def sampling(df, model_list, model_name_list):
    seed = [333,444,555,666,777]
    for i in seed:
        print("="*20)
        print("seed 값 : %d" % i)
        print("="*20)
        sampling_data = df.loc[np.random.choice(df.index,100000)]
        X_feature = sampling_data.iloc[:, 1:]
        target = sampling_data.iloc[:, 0]
        
        print("Cross Validation을 시작합니다")
        for _, (clf, clf_name) in enumerate(zip(model_list, model_name_list)):
            if clf_name == 'LogisticRegression':
                scl = StandardScaler()
                X_feature_scale = scl.fit_transform(X_feature)
                scores = cross_val_score(clf, X_feature_scale, target, 
                                                scoring='roc_auc', cv=5, 
                                                n_jobs=-1)
                print("- [{}] Model : roc_auc_score : {:.3f} +/- {:.3f}".format(clf_name, 
                                                                         np.mean(scores),
                                                                         np.std(scores)))
            else:
                scores = cross_val_score(clf, X_feature, target, 
                                                scoring='roc_auc', cv=5, 
                                                n_jobs=-1)
                print("- [{}] Model : roc_auc_score : {:.3f} +/- {:.3f}".format(clf_name, 
                                                                       np.mean(scores),
                                                                       np.std(scores)))
model_1 = xgb.XGBClassifier()
model_2 = RandomForestClassifier()
model_3 = AdaBoostClassifier()
model_4 = BaggingClassifier()
model_5 = ExtraTreesClassifier()
model_6 = LogisticRegression()
model_7 = GradientBoostingClassifier()
model_list = [model_1, model_2, model_3, model_4, model_5, model_6, model_7]
model_name_list = ['XGBClassifier', 'RandomForestClassifier', 
                   'AdaBoostClassifier', 'BaggingClassifier', 
                   'ExtraTreesClassifier', 'LogisticRegression',
                   'GradientBoostingClassifier']

sampling(train_data_copy, model_list, model_name_list)

seed 값 : 333
Cross Validation을 시작합니다
- [XGBClassifier] Model : roc_auc_score : 0.831 +/- 0.004
- [RandomForestClassifier] Model : roc_auc_score : 0.875 +/- 0.004
- [AdaBoostClassifier] Model : roc_auc_score : 0.798 +/- 0.003
- [BaggingClassifier] Model : roc_auc_score : 0.879 +/- 0.004
- [ExtraTreesClassifier] Model : roc_auc_score : 0.879 +/- 0.002
- [LogisticRegression] Model : roc_auc_score : 0.789 +/- 0.003
- [GradientBoostingClassifier] Model : roc_auc_score : 0.832 +/- 0.004
seed 값 : 444
Cross Validation을 시작합니다
- [XGBClassifier] Model : roc_auc_score : 0.831 +/- 0.004
- [RandomForestClassifier] Model : roc_auc_score : 0.877 +/- 0.002
- [AdaBoostClassifier] Model : roc_auc_score : 0.796 +/- 0.004
- [BaggingClassifier] Model : roc_auc_score : 0.880 +/- 0.002
- [ExtraTreesClassifier] Model : roc_auc_score : 0.881 +/- 0.001
- [LogisticRegression] Model : roc_auc_score : 0.789 +/- 0.004
- [GradientBoostingClassifier] Model : roc_auc_score : 0.832 +/- 0.004
seed 값 : 555
Cross Validatio

## Exp.2 : Hash값 제외한 데이터, Feature 중요도에 따른 데이터 사용, sampling
* Model : XGBClassifier, RandomForest, AdaboostClassifier, BaggingClassifier, ExtraTreeClassifier, LogisticRegression, GradientBoostingClassifier
* Cross-validation : StratifiedKfold

In [17]:
def sampling_2(df, model_list, model_name_list):
    seed = [333,444,555,666,777]
    for i in seed:
        print("="*20)
        print("seed 값 : %d" % i)
        print("="*20)
        sampling_data = df.loc[np.random.choice(df.index,100000)]
        X_feature = sampling_data.iloc[:, 1:]
        target = sampling_data.iloc[:, 0]
        rf = RandomForestClassifier()
        rf.fit(X_feature, target)
        importance = rf.feature_importances_
        indices = np.argsort(importance)[::-1]
        
        use_column_list=[]
        for f in range(X_feature.shape[1]):
            if importance[indices[f]] > 0.01:
                use_column_list.append(X_feature.columns[indices[f]])
        X_feature_use = X_feature[use_column_list]
        
        print("Cross Validation을 시작합니다")
        for _, (clf, clf_name) in enumerate(zip(model_list, model_name_list)):
            if clf_name == 'LogisticRegression':
                scl = StandardScaler()
                X_feature_scale = scl.fit_transform(X_feature_use)
                scores = cross_val_score(clf, X_feature_scale, target, 
                                                scoring='roc_auc', cv=5, 
                                                n_jobs=-1)
                print("- [{}] Model : roc_auc_score : {:.3f} +/- {:.3f}".format(clf_name, 
                                                                         np.mean(scores),
                                                                         np.std(scores)))
            else:
                scores = cross_val_score(clf, X_feature_use, target, 
                                                scoring='roc_auc', cv=5, 
                                                n_jobs=-1)
                print("- [{}] Model : roc_auc_score : {:.3f} +/- {:.3f}".format(clf_name, 
                                                                       np.mean(scores),
                                                                       np.std(scores)))
model_1 = xgb.XGBClassifier()
model_2 = RandomForestClassifier()
model_3 = AdaBoostClassifier()
model_4 = BaggingClassifier()
model_5 = ExtraTreesClassifier()
model_6 = LogisticRegression()
model_7 = GradientBoostingClassifier()
model_list = [model_1, model_2, model_3, model_4, model_5, model_6, model_7]
model_name_list = ['XGBClassifier', 'RandomForestClassifier', 
                   'AdaBoostClassifier', 'BaggingClassifier', 
                   'ExtraTreesClassifier', 'LogisticRegression',
                   'GradientBoostingClassifier']

sampling(train_data_copy, model_list, model_name_list)

seed 값 : 333
Cross Validation을 시작합니다
- [XGBClassifier] Model : roc_auc_score : 0.831 +/- 0.003
- [RandomForestClassifier] Model : roc_auc_score : 0.874 +/- 0.003
- [AdaBoostClassifier] Model : roc_auc_score : 0.796 +/- 0.003
- [BaggingClassifier] Model : roc_auc_score : 0.878 +/- 0.003
- [ExtraTreesClassifier] Model : roc_auc_score : 0.879 +/- 0.002
- [LogisticRegression] Model : roc_auc_score : 0.788 +/- 0.004
- [GradientBoostingClassifier] Model : roc_auc_score : 0.832 +/- 0.004
seed 값 : 444
Cross Validation을 시작합니다
- [XGBClassifier] Model : roc_auc_score : 0.831 +/- 0.002
- [RandomForestClassifier] Model : roc_auc_score : 0.874 +/- 0.002
- [AdaBoostClassifier] Model : roc_auc_score : 0.798 +/- 0.002
- [BaggingClassifier] Model : roc_auc_score : 0.878 +/- 0.002
- [ExtraTreesClassifier] Model : roc_auc_score : 0.878 +/- 0.003
- [LogisticRegression] Model : roc_auc_score : 0.788 +/- 0.003
- [GradientBoostingClassifier] Model : roc_auc_score : 0.833 +/- 0.001
seed 값 : 555
Cross Validatio

* 실험 결과 ExtraTreesClassifier의 퍼포먼스가 좋기 때문에 ExtraTreesClassifier 모델사용

In [19]:
sampling_data = train_data_copy.loc[np.random.choice(train_data_copy.index,100000)]
X_feature = sampling_data.iloc[:, 1:]
target = sampling_data.iloc[:, 0]
rf = RandomForestClassifier()
rf.fit(X_feature, target)
importance = rf.feature_importances_
indices = np.argsort(importance)[::-1]

use_column_list=[]
for f in range(X_feature.shape[1]):
    if importance[indices[f]] > 0.01:
        use_column_list.append(X_feature.columns[indices[f]])
X_feature_use = X_feature[use_column_list]

# Model Tuning

In [27]:
use_model = ExtraTreesClassifier()

def grid_search(X, y, clf, parameters, n_jobs=-1, cv=3):
    gs_clf = GridSearchCV(clf, parameters, n_jobs = n_jobs, cv = cv, scoring='roc_auc')
    gs_clf = gs_clf.fit(X, y)
    print('Best Parameter :', gs_clf.best_params_)
    print('Model Score : {}'.format(gs_clf.best_score_))
parameters = {
    'n_estimators' : [120,300,500,800,1200],
    'max_depth' : [5, 8, 15, 25, 30, None],
    'max_features' : ['log2', 'sqrt', None]
}
grid_search(X_feature_use, target, use_model, parameters)

Best Parameter : {'n_estimators': 1200, 'max_depth': 30, 'max_features': 'log2'}
Model Score : 0.8762892246892292


# Test Data 적용

In [28]:
test_data = pd.read_csv("lezhin_dataset_v2_test_without_label.tsv.gz", sep='\t')
test_data.head()

Unnamed: 0,0,1,0.1,0.2,960,A01A0380,9359046F,0.3,Unnamed: 8,0.4,...,Unnamed: 156,0.1167,Unnamed: 158,Unnamed: 159,Unnamed: 160,Unnamed: 161,Unnamed: 162,Unnamed: 163,Unnamed: 164,Unnamed: 165
0,1,0,0,0,57,0365FD34,C7D1D97F,1,747D3F97,0,...,,0.2412,,,,,0.3282,0.1485,,
1,0,1,0,0,107,4847D13D,ABC21E80,1,,0,...,,0.4064,,,,,0.1556,0.0675,,
2,0,0,0,1,4,43DC88EC,3000F084,1,1F077A16,0,...,,,,,,,,,,
3,0,1,0,0,388,4847D13D,4A70A3EC,1,,1,...,,0.0736,,,,,0.2358,0.0412,,
4,0,1,0,0,414,20F07591,DE0AB7B1,1,1B668ED8,0,...,,0.307,,,,,0.2577,0.1026,,


In [29]:
test_data.columns = train_data.columns[1:]
test_data.head()

Unnamed: 0,Use_A,Use_B,Use_C,Use_D,Total_visit_n(Sequential ID),Work(hash),Personal_i_7,Personal_i_8,Personal_i_9,Major_work_purchase_10,...,User_ten_i_157,User_ten_i_158,User_ten_i_159,User_ten_i_160,User_ten_i_161,User_ten_i_162,User_ten_i_163,User_ten_i_164,User_ten_i_165,User_ten_i_166
0,1,0,0,0,57,0365FD34,C7D1D97F,1,747D3F97,0,...,,0.2412,,,,,0.3282,0.1485,,
1,0,1,0,0,107,4847D13D,ABC21E80,1,,0,...,,0.4064,,,,,0.1556,0.0675,,
2,0,0,0,1,4,43DC88EC,3000F084,1,1F077A16,0,...,,,,,,,,,,
3,0,1,0,0,388,4847D13D,4A70A3EC,1,,1,...,,0.0736,,,,,0.2358,0.0412,,
4,0,1,0,0,414,20F07591,DE0AB7B1,1,1B668ED8,0,...,,0.307,,,,,0.2577,0.1026,,


### Test data Preprocessing

In [30]:
final_test_data = test_data.drop(['Work(hash)',
                                  'Personal_i_7',
                                  'Personal_i_8',
                                  'Personal_i_9'], axis=1)
final_test_data = final_test_data[X_feature_use.columns]
final_test_data.fillna(0, inplace=True)
final_test_data.head()

Unnamed: 0,User_ten_i_163,User_ten_i_152,Total_visit_n(Sequential ID),User_ten_i_155,User_ten_i_153,User_ten_i_164,Work_issued_time(sequential ID),Total_issued_episode_num(sequential ID),User_ten_i_158,User_ten_i_156,Final_epi_issued time(sequential ID),Genre_i_124,User_ten_i_159,Needed_coin,User_ten_i_151,User_ten_i_154,Genre_i_125,Sche_i_120,Work_tag_i_110,conclusion
0,0.3282,0.2095,57,0.0,0.0,0.1485,17,3,0.2412,0.0726,19,0,0.0,3,0.0,0.0,0,0,0,0
1,0.1556,0.0,107,0.0,0.0,0.0675,18,2,0.4064,0.192,19,0,0.0,3,0.1784,0.0,0,0,0,0
2,0.0,0.3254,4,0.0,0.6746,0.0,12,5,0.0,0.0,18,1,0.0,3,0.0,0.0,0,1,1,0
3,0.2358,0.6494,388,0.0,0.0,0.0412,18,2,0.0736,0.0,19,0,0.0,3,0.0,0.0,0,0,0,0
4,0.2577,0.0,414,0.0,0.0,0.1026,7,17,0.307,0.2589,19,0,0.0,3,0.0738,0.0,0,0,0,0


### 모델에 테스트 데이터 적용

In [41]:
final_clf = ExtraTreesClassifier(n_estimators = 1200, 
                                 max_depth = 30, 
                                 max_features = 'log2')
final_clf.fit(train_data_copy[X_feature_use.columns], 
              train_data_copy.iloc[:, 0])

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=30, max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [42]:
predict = final_clf.predict(final_test_data)
submission = pd.DataFrame({"Purchase" : predict})
submission.head()

Unnamed: 0,Purchase
0,0
1,0
2,0
3,0
4,0
