**현재 세션(1개)의 앞부분 10개 클릭 로그**를 대상으로 MLP, Gaussian Naive Bayes, Decision Tree, XGBoost, Logistic Regression, Linear SVM을 사용해서 현재 세션의 구매 여부를 예측

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import itertools
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# <br>
# 전처리

In [2]:
# Load the preprocessed online click log
import pandas as pd

온라인 = pd.read_csv('온라인_전처리_final.csv')

# Index labels at which each unique_id value first appears
# (drop_duplicates keeps the first occurrence).
# NOTE(review): these are session start points only if rows of one
# unique_id are contiguous in the file -- confirm against the preprocessing.
idx1 = 온라인.unique_id.drop_duplicates().index.tolist()
# Companion list: the next start label for every entry, with len(온라인)
# closing the final one
idx2 = idx1[1:] + [len(온라인)]

# <br>
# 종속변수 생성
현재 세션의 구매 여부

In [3]:
# Per-hit purchase flag (original note: purchase-complete = 1, otherwise 0).
# pop() removes action_type_6 and hands back its values, which are then
# appended as the last column under the name 'buy'.
온라인['buy'] = 온라인.pop('action_type_6')
print(온라인.shape)
온라인.head()

(3196362, 29)


Unnamed: 0,clnt_id,sess_id,trans_id,hit_seq,hit_pss_tm,time_length,holiday_diff,keyword,action_type_1,action_type_2,...,trfc_src_PORTAL_2,trfc_src_PORTAL_3,trfc_src_PUSH,trfc_src_WEBSITE,trfc_src_unknown,dvc_ctg_nm_mobile_app,dvc_ctg_nm_mobile_web,dvc_ctg_nm_unknown,unique_id,buy
0,1,1,,1,12.0,11.0,0,1,0,0,...,0,0,0,0,1,1,0,0,1_1,0
1,1,1,,2,23.0,14.0,0,2,0,0,...,0,0,0,0,1,1,0,0,1_1,0
2,1,1,,3,37.0,14.0,0,3,0,0,...,0,0,0,0,1,1,0,0,1_1,0
3,1,2,,1,42.0,15.0,1,1,0,0,...,0,0,0,0,1,1,0,0,1_2,0
4,1,2,,2,57.0,14.0,1,2,0,0,...,0,0,0,0,1,1,0,0,1_2,0


In [13]:
# Dependent variable for predicting a purchase in the current session:
# sum the per-hit 'buy' flags per (clnt_id, sess_id) and collapse the sum to
# 0 (sum == 0) or 1 (anything else).
구매여부 = 온라인.groupby(['clnt_id', 'sess_id'])[['buy']].sum()
# Vectorised comparison instead of a Python lambda applied row by row;
# item assignment instead of attribute assignment so the column is always
# the thing being written.
구매여부['buy'] = 구매여부['buy'].ne(0).astype('int64')
구매여부 = 구매여부.sort_index().reset_index()

In [15]:
# Session key "<clnt_id>_<sess_id>", the same format as 온라인.unique_id shown above
구매여부['unique_id'] = [str(c) + '_' + str(s) for c, s in zip(구매여부.clnt_id, 구매여부.sess_id)]

In [16]:
구매여부

Unnamed: 0,clnt_id,sess_id,buy,unique_id
0,1,1,0,1_1
1,1,2,0,1_2
2,2,1,1,2_1
3,2,2,0,2_2
4,2,3,0,2_3
...,...,...,...,...
367144,72428,2,0,72428_2
367145,72428,3,0,72428_3
367146,72428,4,0,72428_4
367147,72428,5,0,72428_5


In [17]:
# Collect the sessions with fewer than 10 recorded hits so they can be removed
temp = 온라인.groupby('unique_id')['hit_seq'].count().to_frame()
클릭10개미만 = temp.loc[temp['hit_seq'] < 10].index
클릭10개미만

Index(['10000_1', '10000_2', '10001_1', '10003_1', '10004_1', '10004_2',
       '10005_1', '10005_2', '10005_3', '10005_4',
       ...
       '9_24', '9_28', '9_29', '9_3', '9_30', '9_31', '9_4', '9_7', '9_8',
       '9_9'],
      dtype='object', name='unique_id', length=280908)

In [18]:
# Keep only hits whose session is not in the short-session index
온라인2 = 온라인.loc[~온라인['unique_id'].isin(클릭10개미만)].copy()

In [19]:
# Drop every hit whose hit_seq is 11 or later
# (presumably hit_seq counts 1, 2, ... within a session, leaving the first ten hits)
온라인2 = 온라인2.loc[온라인2['hit_seq'].lt(11)].copy()

In [20]:
# Identifier columns and the per-hit label must not end up among the flattened features
온라인2 = 온라인2.drop(columns=['clnt_id', 'sess_id', 'trans_id', 'buy'])

In [21]:
def to_flat(df):
    """Pivot per-hit rows into one wide row per ``unique_id``.

    Rows are numbered 1, 2, ... inside each ``unique_id`` group in their
    existing order. Every other column is then spread into
    ``<column>_<n>`` columns, ordered by hit number first and column name
    second.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with a ``unique_id`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``unique_id`` (kept as a regular column). Positions
        that a shorter group does not reach are filled with NaN by
        ``unstack``.
    """
    position = df.groupby('unique_id').cumcount() + 1
    wide = df.set_index(['unique_id', position]).unstack()
    # axis has to be passed by keyword: DataFrame.sort_index takes
    # keyword-only arguments from pandas 2.0 on, so sort_index(1, ...) raises.
    wide = wide.sort_index(axis=1, level=1)
    wide.columns = ['_'.join(map(str, col)) for col in wide.columns]
    return wide.reset_index()

In [22]:
온라인2 = to_flat(온라인2)

In [23]:
# Attach clnt_id, sess_id and the session-level buy label to each flattened session
온라인2 = 온라인2.merge(구매여부, on='unique_id')

In [24]:
# Order the sessions by customer, then by session number
온라인2 = 온라인2.sort_values(by=['clnt_id', 'sess_id'])

In [25]:
온라인2.head()

Unnamed: 0,unique_id,action_type_1_1,action_type_2_1,action_type_3_1,action_type_4_1,action_type_5_1,action_type_7_1,biz_unit_A02_1,biz_unit_A03_1,dvc_ctg_nm_mobile_app_1,...,time_length_10,trfc_src_PORTAL_1_10,trfc_src_PORTAL_2_10,trfc_src_PORTAL_3_10,trfc_src_PUSH_10,trfc_src_WEBSITE_10,trfc_src_unknown_10,clnt_id,sess_id,buy
26105,2_1,0,0,0,0,0,0,0,1,0,...,2.0,0,0,0,0,0,0,2,1,1
26110,2_8,0,0,0,0,1,0,0,1,0,...,4.0,0,0,0,0,0,0,2,8,1
26106,2_10,0,0,0,0,0,0,0,1,0,...,5.0,0,0,0,0,0,0,2,10,1
26107,2_15,0,1,0,0,0,0,0,1,0,...,2.0,0,0,0,0,0,0,2,15,0
26108,2_16,1,0,0,0,0,0,0,1,0,...,2.0,0,0,0,0,0,0,2,16,0


# <br>
# [1-3]현재 세션(1개)의 앞 부분의 10개 클릭 로그 사용
(session, sequence, variables) 형태의 3d array를 세션별로 펼쳐 (session, sequence × variables) 형태의 2d array로 변환하여 활용 (각 세션은 1d 벡터가 됨)

In [32]:
from keras.preprocessing import sequence
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [26]:
features = 온라인2.columns[1:-3]

In [28]:
# Feature matrix: everything except the leading unique_id column and the
# trailing clnt_id / sess_id / buy columns
온라인_x = np.array(온라인2.iloc[:, 1:-3])

In [29]:
# session 당 구매 여부
온라인_y = 온라인2.buy

In [33]:
def make_padding_and_oversample(X, Y, length=10):
    """Oversample ``(X, Y)`` with SMOTE.

    Despite the name, no padding is done here; the function only hands the
    data to ``SMOTE.fit_resample``.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``SMOTE.fit_resample``.
    Y : array-like
        Labels passed to ``SMOTE.fit_resample``.
    length : int, default 10
        Unused. Kept so that existing calls passing ``length=`` keep working.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)`` as returned by ``SMOTE.fit_resample``.
    """
    # Fixed seed so the synthetic samples are reproducible between runs
    sampler = SMOTE(random_state=0)
    X_resampled, Y_resampled = sampler.fit_resample(X, Y)
    return X_resampled, Y_resampled

In [34]:
X_resampled, Y_resampled = make_padding_and_oversample(온라인_x, 온라인_y, length=10)



In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, Y_resampled, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

(81713, 240)
(35021, 240)


# <br>
# 모델링

In [36]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.layers import Dropout

import pickle
from joblib import dump, load

In [37]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Counts the rounded, clipped product ``y_true * y_pred`` as true
    positives and the rounded, clipped ``y_true`` as all actual positives.
    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return tp / (actual_pos + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    Counts the rounded, clipped product ``y_true * y_pred`` as true
    positives and the rounded, clipped ``y_pred`` as all predicted
    positives. ``K.epsilon()`` in the denominator avoids division by zero.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return tp / (predicted_pos + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result stays defined
    when precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [38]:
scores = dict()

### Gaussian Naive Bayes

In [39]:
# Gaussian Naive Bayes: fit on the training split, report on the test split
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

for template, metric in (('Accuracy: %.2f', accuracy_score),
                         ('F1 score: %.2f', f1_score),
                         ('Precision: %.2f', precision_score),
                         ('Recall: %.2f', recall_score)):
    print(template % metric(y_test, y_pred))

Accuracy: 0.57
F1 score: 0.65
Precision: 0.55
Recall: 0.80


In [40]:
# Stored as [accuracy, f1, precision, recall] for the summary table
scores['Gaussian Naive Bayes'] = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

In [41]:
# dump(clf, '1-3-Gaussian.joblib')

### Decision Tree

In [42]:
# Decision tree with a fixed seed: fit on the training split, report on the test split
clf2 = DecisionTreeClassifier(random_state=0)
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)

for template, metric in (('Accuracy: %.2f', accuracy_score),
                         ('F1 score: %.2f', f1_score),
                         ('Precision: %.2f', precision_score),
                         ('Recall: %.2f', recall_score)):
    print(template % metric(y_test, y_pred))

Accuracy: 0.72
F1 score: 0.72
Precision: 0.71
Recall: 0.72


In [43]:
# Stored as [accuracy, f1, precision, recall] for the summary table
scores['Decision Tree'] = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

In [44]:
# Map each feature name to the importance the fitted tree assigned to it
feature_check = dict(zip(features, clf2.feature_importances_))

In [45]:
pd.DataFrame.from_dict(feature_check, orient='index', columns=['feature_importance']).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_importance
action_type_5_8,0.115831
action_type_3_2,0.102624
time_length_10,0.031579
time_length_7,0.031324
action_type_5_2,0.030606
...,...
action_type_7_2,0.000000
hit_seq_1,0.000000
action_type_7_4,0.000000
hit_seq_6,0.000000


In [46]:
# dump(clf2, '1-3-DecisionTree.joblib')

### XGBOOST

In [47]:
# Gradient-boosted trees: 300 trees of depth 3, learning rate 0.05, fixed seed;
# verbosity=2 produces the per-tree INFO lines in the output
clf3 = xgb.XGBClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    verbosity=2,
    random_state=0,
)
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)

for template, metric in (('Accuracy: %.2f', accuracy_score),
                         ('F1 score: %.2f', f1_score),
                         ('Precision: %.2f', precision_score),
                         ('Recall: %.2f', recall_score)):
    print(template % metric(y_test, y_pred))

[23:23:24] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[23:23:25] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[23:23:25] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[23:23:26] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[23:23:27] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[23:23:27] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[23:

In [48]:
# Stored as [accuracy, f1, precision, recall] for the summary table
scores['XGboost'] = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

In [49]:
# Map each feature name to the importance the fitted booster assigned to it
feature_check = dict(zip(features, clf3.feature_importances_))

In [50]:
pd.DataFrame.from_dict(feature_check, orient='index', columns=['feature_importance']).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_importance
dvc_ctg_nm_unknown_2,0.132382
action_type_3_2,0.068043
action_type_3_4,0.056334
dvc_ctg_nm_unknown_4,0.050930
action_type_5_9,0.040030
...,...
biz_unit_A02_2,0.000000
action_type_7_2,0.000000
action_type_7_6,0.000000
biz_unit_A02_6,0.000000


In [51]:
# dump(clf3, '1-3-xgb.joblib')

### Logistic Regression

In [52]:
# Logistic regression with a fixed seed.
# NOTE(review): the recorded run below still stopped at the iteration limit
# with max_iter=1000; the solver message suggests raising it or scaling the data.
clf4 = LogisticRegression(max_iter=1000, random_state=0)
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)

for template, metric in (('Accuracy: %.2f', accuracy_score),
                         ('F1 score: %.2f', f1_score),
                         ('Precision: %.2f', precision_score),
                         ('Recall: %.2f', recall_score)):
    print(template % metric(y_test, y_pred))

Accuracy: 0.63
F1 score: 0.65
Precision: 0.62
Recall: 0.68


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [53]:
# Stored as [accuracy, f1, precision, recall] for the summary table
scores['Logistic Regression'] = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

In [54]:
# dump(clf4, '1-3-LogisticRegression.joblib')

### Linear SVM

In [55]:
# Linear SVM with a fixed seed. The stray positional `num` that followed the
# keyword argument was removed: it is a SyntaxError (positional argument after
# keyword argument) and `num` is not defined anywhere in the notebook.
clf5 = LinearSVC(random_state=0).fit(X_train, y_train)
y_pred = clf5.predict(X_test)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('F1 score: %.2f' % f1_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))

Accuracy: 0.50
F1 score: 0.00
Precision: 0.00
Recall: 0.00




In [56]:
# Stored as [accuracy, f1, precision, recall] for the summary table
scores['Linear SVM'] = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
]

In [57]:
# dump(clf5, '1-3-LinearSVM.joblib')

### MLP(DNN)

In [58]:
def models(train):
    """Build and compile the MLP used for the DNN baseline.

    Architecture: Dense(32, relu) -> Dense(16, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy, RMSprop and the metrics
    ``acc``, ``f1_m``, ``precision_m`` and ``recall_m``.

    Parameters
    ----------
    train : array-like with a ``shape`` attribute
        Only ``train.shape[1]`` is read, to size the input layer.

    Returns
    -------
    The compiled ``Sequential`` model (not yet fitted).
    """
    n_features = train.shape[1]
    net = Sequential()
    net.add(Dense(32, activation='relu', input_shape=(n_features,)))
    net.add(Dense(16, activation='relu'))
    net.add(Dense(1, activation='sigmoid'))
    # NOTE(review): newer Keras releases call this argument `learning_rate`;
    # confirm against the installed version before upgrading.
    net.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(lr=0.001, rho=0.9),
        metrics=['acc', f1_m, precision_m, recall_m],
    )
    return net

In [59]:
model = models(X_train)
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                7712      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 8,257
Trainable params: 8,257
Non-trainable params: 0
_________________________________________________________________


In [60]:
# 25 epochs in batches of 1000; (X_test, y_test) doubles as the per-epoch
# validation set, so the val_* figures are measured on the test split
history = model.fit(
    X_train, y_train,
    epochs=25,
    batch_size=1000,
    validation_data=(X_test, y_test),
    verbose=2,
    shuffle=True,
)


Train on 81713 samples, validate on 35021 samples
Epoch 1/25
 - 2s - loss: 4.4503 - acc: 0.4913 - f1_m: 0.4487 - precision_m: 0.4768 - recall_m: 0.5225 - val_loss: 2.4246 - val_acc: 0.5006 - val_f1_m: 0.6463 - val_precision_m: 0.4970 - val_recall_m: 0.9254
Epoch 2/25
 - 0s - loss: 2.2497 - acc: 0.5026 - f1_m: 0.4212 - precision_m: 0.4993 - recall_m: 0.5294 - val_loss: 2.1070 - val_acc: 0.5064 - val_f1_m: 0.6591 - val_precision_m: 0.4999 - val_recall_m: 0.9681
Epoch 3/25
 - 0s - loss: 1.9535 - acc: 0.5110 - f1_m: 0.4033 - precision_m: 0.5084 - recall_m: 0.5196 - val_loss: 1.5476 - val_acc: 0.5286 - val_f1_m: 0.6602 - val_precision_m: 0.5127 - val_recall_m: 0.9277
Epoch 4/25
 - 0s - loss: 1.7139 - acc: 0.5178 - f1_m: 0.4103 - precision_m: 0.5178 - recall_m: 0.5200 - val_loss: 1.3073 - val_acc: 0.5423 - val_f1_m: 0.6452 - val_precision_m: 0.5257 - val_recall_m: 0.8356
Epoch 5/25
 - 0s - loss: 1.5604 - acc: 0.5218 - f1_m: 0.4225 - precision_m: 0.5283 - recall_m: 0.5203 - val_loss: 1.2430 

In [61]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=1)



In [62]:
scores['DNN'] = [accuracy, f1_score, precision, recall]

In [63]:
print(loss)
print(accuracy)
print(f1_score)
print(precision)
print(recall)

0.7612859272016681
0.5455583930015564
0.304965078830719
0.6202800273895264
0.2115107774734497


In [64]:
# model.save('1-3-DNN.h5')

In [65]:
pd.DataFrame.from_dict(scores, orient='index', columns=['Accuracy', 'F1-Score', 'Precision', 'Recall'])

Unnamed: 0,Accuracy,F1-Score,Precision,Recall
Gaussian Naive Bayes,0.574884,0.651775,0.549062,0.801761
Decision Tree,0.715085,0.716212,0.708076,0.724537
XGboost,0.780103,0.738799,0.899711,0.626712
Logistic Regression,0.631621,0.646925,0.616826,0.680113
Linear SVM,0.503755,0.0,0.0,0.0
DNN,0.545558,0.304965,0.62028,0.211511
