**현재 세션(1개)의 모든 클릭 로그**를 대상으로 MLP, Gaussian Naive Bayes, Decision Tree, XGBoost, Logistic Regression, Linear SVM을 사용해서 구매 예측

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import itertools
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# <br>
# 전처리

In [2]:
온라인 = pd.read_csv('제6회 L.POINT Big Data Competition-분석용데이터-01.온라인 행동 정보.csv', encoding='utf-8')
온라인 = 온라인.sort_values(['clnt_id','sess_id','hit_seq']).reset_index(drop=True)
온라인.shape

  interactivity=interactivity, compiler=compiler, result=result)


(3196362, 14)

In [3]:
# {clnt_id}_{sess_id}인 unique_id 생성
온라인['unique_id'] = list(map(lambda x,y: str(x)+'_'+str(y), 온라인.clnt_id, 온라인.sess_id))
온라인.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,unique_id
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,14.0,124.0,unknown,mobile_app,1_1
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,14.0,124.0,unknown,mobile_app,1_1
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,14.0,124.0,unknown,mobile_app,1_1
3,1,2,1,0,A01,20190922,14:09,41584,,초등가을잠바,45.0,424.0,unknown,mobile_app,1_2
4,1,2,2,0,A01,20190922,14:10,56113,,초등가을점퍼,45.0,424.0,unknown,mobile_app,1_2


In [4]:
# 온라인 고객: 72,399명, 세션: 367,149개
온라인.clnt_id.nunique(), 온라인.unique_id.nunique()

(72399, 367149)

In [10]:
# 각 clnt_id별 session이 바뀌는 지점 index 저장
idx1 = 온라인.unique_id.drop_duplicates().index.tolist()
idx2 = idx1[1:] + [len(온라인)]

In [10]:
# session별 체류시간 = 다음 hit_pss_tm - 이전 hit_pss_tm
stay_tm = []
for i, j in tqdm_notebook(zip(idx1, idx2), total=len(idx1)):
    temp = 온라인.iloc[i:j].hit_pss_tm
    if len(temp) == 1:
        stay_tm.append(temp.tolist())
    else:
        temp = temp.diff().tolist()[1:]
        temp.append(temp[-1]) # 마지막은 이전 세션의 체류시간을 활용
        stay_tm.append(temp)
    #stay_tm = stay_tm + 온라인.iloc[i:j].hit_pss_tm.diff().fillna(0).tolist()

HBox(children=(IntProgress(value=0, max=367149), HTML(value='')))




In [14]:
# list를 flatten
# stay_tm = sum(stay_tm, [])
온라인['time_length'] = list(itertools.chain.from_iterable(stay_tm))

In [16]:
# 밀리초 -> 초 변환
온라인['hit_pss_tm'] = np.ceil(온라인['hit_pss_tm']/1000)
온라인['time_length'] = np.ceil(온라인['time_length']/1000)

In [17]:
# 주말 및 공휴일 여부
holiday_diff = []
for i in 온라인.sess_dt.map(lambda x: str(x)[-3:]).tolist():
    if i in (['706','707','713','714','717','720','721','727','728',
               '803','804','810','811','815','817','818','824','825',
               '907','908','912','913','914','915','921','922','928','929']):
        holiday_diff.append(1)
    else:
        holiday_diff.append(0)
온라인['holiday_diff'] = holiday_diff

In [19]:
# keyword: session별 sech_kwd의 cumulative sum
온라인['sech_kwd'] = 온라인.sech_kwd.map(lambda x: 0 if str(x) == 'nan' else 1)
keyword = []
for i, j in tqdm_notebook(zip(idx1, idx2), total=len(idx1)):
    temp = 온라인.iloc[i:j].sech_kwd
    keyword.append(np.cumsum(temp.tolist()))
온라인['keyword'] = list(itertools.chain.from_iterable(keyword))

HBox(children=(IntProgress(value=0, max=367149), HTML(value='')))




In [20]:
온라인['sess_dt'] = 온라인['sess_dt'].map(lambda x: str(x)[:-2]) # day제거하고 month(7,8,9월)만 남김
온라인['dvc_ctg_nm'].fillna('unknown',inplace=True) # device null값을 unknown으로 대체

In [21]:
# dummy variable 생성 - action_type, biz_unit, sess_dt, trfc_src, dvc_ctg_nm
온라인 = pd.concat([온라인, pd.get_dummies(온라인[['biz_unit','sess_dt','trfc_src','dvc_ctg_nm']], drop_first=True)], axis=1)
온라인 = pd.concat([온라인, pd.get_dummies(온라인.action_type, drop_first=True, prefix='action_type')], axis=1)
온라인 = 온라인.drop(['action_type','biz_unit','sess_dt','hit_tm','sech_kwd','tot_pag_view_ct', 'tot_sess_hr_v','trfc_src','dvc_ctg_nm'], axis=1)

In [23]:
온라인 = 온라인[['clnt_id', 'sess_id', 'trans_id', 'hit_seq', 'hit_pss_tm', 
           'time_length', 'holiday_diff', 'keyword', 'action_type_1', 'action_type_2',
           'action_type_3','action_type_4', 'action_type_5', 'action_type_6', 'action_type_7',
           'biz_unit_A02', 'biz_unit_A03', 'sess_dt_201908', 'sess_dt_201909', 'trfc_src_PORTAL_1',
           'trfc_src_PORTAL_2', 'trfc_src_PORTAL_3', 'trfc_src_PUSH', 'trfc_src_WEBSITE', 'trfc_src_unknown', 
           'dvc_ctg_nm_mobile_app','dvc_ctg_nm_mobile_web', 'dvc_ctg_nm_unknown', 'unique_id']]
print(온라인.shape)
온라인.head()

(3196362, 29)


Unnamed: 0,clnt_id,sess_id,trans_id,hit_seq,hit_pss_tm,time_length,holiday_diff,keyword,action_type_1,action_type_2,...,trfc_src_PORTAL_1,trfc_src_PORTAL_2,trfc_src_PORTAL_3,trfc_src_PUSH,trfc_src_WEBSITE,trfc_src_unknown,dvc_ctg_nm_mobile_app,dvc_ctg_nm_mobile_web,dvc_ctg_nm_unknown,unique_id
0,1,1,,1,12.0,11.0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,1_1
1,1,1,,2,23.0,14.0,0,2,0,0,...,0,0,0,0,0,1,1,0,0,1_1
2,1,1,,3,37.0,14.0,0,3,0,0,...,0,0,0,0,0,1,1,0,0,1_1
3,1,2,,1,42.0,15.0,1,1,0,0,...,0,0,0,0,0,1,1,0,0,1_2
4,1,2,,2,57.0,14.0,1,2,0,0,...,0,0,0,0,0,1,1,0,0,1_2


In [24]:
# 저장
온라인.to_csv('온라인_전처리_final.csv', index=False)

In [2]:
# 불러오기
import pandas as pd

온라인 = pd.read_csv('온라인_전처리_final.csv')

# <br>
# 종속변수 생성
다음 세션의 구매 여부

In [3]:
# 각 hit_seq 당 action_type이 구매완료=1, 이외=0
온라인['buy'] = 온라인['action_type_6']
온라인.drop('action_type_6', inplace=True, axis=1)
print(온라인.shape)
온라인.head()

(3196362, 29)


Unnamed: 0,clnt_id,sess_id,trans_id,hit_seq,hit_pss_tm,time_length,holiday_diff,keyword,action_type_1,action_type_2,...,trfc_src_PORTAL_2,trfc_src_PORTAL_3,trfc_src_PUSH,trfc_src_WEBSITE,trfc_src_unknown,dvc_ctg_nm_mobile_app,dvc_ctg_nm_mobile_web,dvc_ctg_nm_unknown,unique_id,buy
0,1,1,,1,12.0,11.0,0,1,0,0,...,0,0,0,0,1,1,0,0,1_1,0
1,1,1,,2,23.0,14.0,0,2,0,0,...,0,0,0,0,1,1,0,0,1_1,0
2,1,1,,3,37.0,14.0,0,3,0,0,...,0,0,0,0,1,1,0,0,1_1,0
3,1,2,,1,42.0,15.0,1,1,0,0,...,0,0,0,0,1,1,0,0,1_2,0
4,1,2,,2,57.0,14.0,1,2,0,0,...,0,0,0,0,1,1,0,0,1_2,0


In [4]:
#다음 세션의 구매를 예측하기 위한 종속변수 생성
구매여부 = 온라인[['clnt_id', 'sess_id', 'buy']].groupby(['clnt_id', 'sess_id']).sum()
구매여부.buy = 구매여부.buy.apply(lambda x:0 if x == 0 else 1)
구매여부 = 구매여부.sort_index()
구매여부 = 구매여부.reset_index()
#각 clnt_id별로 shift로 1행씩 올림
구매여부.buy = 구매여부.buy.shift(-1)
g = 구매여부.groupby('clnt_id')
#1행씩 올리면 각 clnt_id별로 마지막 값은 다음 id의 값, 따라서 마지막 행 제거
구매여부.drop(g.tail(1).index, axis=0, inplace = True)

In [5]:
구매여부

Unnamed: 0,clnt_id,sess_id,buy
0,1,1,0.0
2,2,1,0.0
3,2,2,0.0
4,2,3,0.0
5,2,4,0.0
...,...,...,...
367141,72427,3,0.0
367143,72428,1,0.0
367144,72428,2,0.0
367145,72428,3,0.0


In [6]:
온라인.drop(['buy'], axis =1, inplace= True)
온라인 = pd.merge(온라인, 구매여부, left_on=['clnt_id', 'sess_id'], right_on=['clnt_id', 'sess_id'])

In [7]:
print(온라인.shape)
온라인.head()

(2750164, 29)


Unnamed: 0,clnt_id,sess_id,trans_id,hit_seq,hit_pss_tm,time_length,holiday_diff,keyword,action_type_1,action_type_2,...,trfc_src_PORTAL_2,trfc_src_PORTAL_3,trfc_src_PUSH,trfc_src_WEBSITE,trfc_src_unknown,dvc_ctg_nm_mobile_app,dvc_ctg_nm_mobile_web,dvc_ctg_nm_unknown,unique_id,buy
0,1,1,,1,12.0,11.0,0,1,0,0,...,0,0,0,0,1,1,0,0,1_1,0.0
1,1,1,,2,23.0,14.0,0,2,0,0,...,0,0,0,0,1,1,0,0,1_1,0.0
2,1,1,,3,37.0,14.0,0,3,0,0,...,0,0,0,0,1,1,0,0,1_1,0.0
3,2,1,,1,40.0,8.0,1,1,0,0,...,0,0,0,0,0,0,0,1,2_1,0.0
4,2,1,,2,48.0,1.0,1,1,1,0,...,0,0,0,0,0,0,0,1,2_1,0.0


# <br>
# [1-3]현재 세션(1개)의 앞 부분의 10개 클릭 로그 사용
(session, sequence, variables) 3d array를 1d array로 변환하여 활용

In [8]:
from keras.preprocessing import sequence
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [11]:
# (session, sequence, variables) 3d array 변환
온라인_x = []
for i, j in tqdm_notebook(zip(idx1, idx2), total=len(idx1)):
    온라인_x.append(온라인.iloc[i:j, 3:-2].values)
    
#scaler = StandardScaler() 
#scaler = MinMaxScaler(feature_range=(0, 1))
#온라인_x = scaler.fit_transform(np.array(온라인_x))
온라인_x = np.array(온라인_x)

HBox(children=(IntProgress(value=0, max=294750), HTML(value='')))




In [12]:
# session 당 구매 여부
온라인_y = []
for i,j in tqdm_notebook(zip(idx1,idx2), total=len(idx1)):
    온라인_y.append([int(온라인.buy.iloc[i:j].sum()>0)])

HBox(children=(IntProgress(value=0, max=294750), HTML(value='')))




In [13]:
len(온라인_y)

294750

In [14]:
def make_padding_and_oversample(X, Y, length=70):
    max_len = length
    X_padding = sequence.pad_sequences(X, maxlen=max_len, padding='pre', truncating='post')
    X_padding2 = X_padding.reshape(X.shape[0], max_len* X_padding.shape[2])

    smote = SMOTE(random_state=0)
    X_resampled, Y_resampled = smote.fit_resample(X_padding2, Y)
    X_resampled = X_resampled.reshape(X_resampled.shape[0], max_len, X_padding.shape[2])
    return X_padding, X_resampled, Y_resampled

In [15]:
X_padded, X_resampled, Y_resampled = make_padding_and_oversample(온라인_x, 온라인_y, length=10)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, Y_resampled, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

(361313, 10, 24)
(154849, 10, 24)


In [17]:
num_train = X_train.shape[0]
num_test = X_test.shape[0]

In [18]:
X_train_1d = X_train.reshape((num_train,-1))
X_test_1d = X_test.reshape((num_test,-1))

# <br>
# 모델링

In [41]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.layers import Dropout

import pickle
from joblib import dump, load

In [20]:
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Gaussian Naive Bayes

In [21]:
clf = GaussianNB()
clf.fit(X_train_1d, y_train)
y_pred = clf.predict(X_test_1d)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('F1 score: %.2f' % f1_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))

Accuracy: 0.62
F1 score: 0.70
Precision: 0.58
Recall: 0.88


In [38]:
dump(clf, '1-3-Gaussian.joblib')

['1-3-Gaussian.joblib']

### Decision Tree

In [22]:
clf2 = DecisionTreeClassifier().fit(X_train_1d, y_train)
y_pred = clf2.predict(X_test_1d)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('F1 score: %.2f' % f1_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))

Accuracy: 0.82
F1 score: 0.82
Precision: 0.82
Recall: 0.82


In [37]:
dump(clf2, '1-3-DecisionTree.joblib')

['1-3-DecisionTree.joblib']

### XGBOOST

In [42]:
clf3 = xgb.XGBClassifier(learning_rate = 0.05, n_estimators=300, max_depth=5).fit(X_train_1d, y_train)
y_pred = clf3.predict(X_test_1d)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('F1 score: %.2f' % f1_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))

Accuracy: 0.84
F1 score: 0.83
Precision: 0.91
Recall: 0.75


In [43]:
dump(clf3, '1-3-xgb.joblib')

['1-3-xgb.joblib']

### Logistic Regression

In [26]:
clf4 = LogisticRegression().fit(X_train_1d, y_train)
y_pred = clf4.predict(X_test_1d)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('F1 score: %.2f' % f1_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.65
F1 score: 0.58
Precision: 0.73
Recall: 0.49


In [36]:
dump(clf4, '1-3-LogisticRegression.joblib')

['1-3-LogisticRegression.joblib']

### Linear SVM

In [27]:
clf5 = LinearSVC().fit(X_train_1d, y_train)
y_pred = clf5.predict(X_test_1d)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('F1 score: %.2f' % f1_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))



Accuracy: 0.72
F1 score: 0.61
Precision: 0.97
Recall: 0.44


In [35]:
dump(clf4, '1-3-LinearSVM.joblib')

['1-3-LinearSVM.joblib']

### MLP(DNN)

In [28]:
X_train_1d.shape

(361313, 240)

In [29]:
def models(train):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_shape=(train.shape[1],)))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr= 0.001, rho = 0.9), metrics=['acc', f1_m, precision_m, recall_m])
    return model

In [30]:
model = models(X_train_1d)
model.summary()






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                7712      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 8,257
Trainable params: 8,257
Non-trainable params: 0
_________________________________________________________________


In [31]:
history = model.fit(X_train_1d, y_train, epochs=25, batch_size=1000, validation_data=(X_test_1d, y_test), verbose=2, shuffle=True)




Train on 361313 samples, validate on 154849 samples
Epoch 1/25





 - 3s - loss: 1.2554 - acc: 0.6386 - f1_m: 0.5753 - precision_m: 0.7139 - recall_m: 0.5375 - val_loss: 0.9449 - val_acc: 0.6957 - val_f1_m: 0.6998 - val_precision_m: 0.6906 - val_recall_m: 0.7098
Epoch 2/25
 - 2s - loss: 0.8269 - acc: 0.7005 - f1_m: 0.6595 - precision_m: 0.7816 - recall_m: 0.6138 - val_loss: 0.6803 - val_acc: 0.7223 - val_f1_m: 0.6518 - val_precision_m: 0.8737 - val_recall_m: 0.5201
Epoch 3/25
 - 2s - loss: 0.6715 - acc: 0.7354 - f1_m: 0.7108 - precision_m: 0.7996 - recall_m: 0.6684 - val_loss: 0.9134 - val_acc: 0.7005 - val_f1_m: 0.7331 - val_precision_m: 0.6613 - val_recall_m: 0.8228
Epoch 4/25
 - 2s - loss: 0.5978 - acc: 0.7548 - f1_m: 0.7362 - precision_m: 0.8133 - recall_m: 0.6969 - val_loss: 0.5363 - val_acc: 0.7721 - val_f1_m: 0.7727 - val_precision_m: 0.7706 - val_recall_m: 0.7753
Epoch 5/25
 - 2s - loss: 0.5422 - acc: 0.7705 - f1_m: 0.7539 - precision_m: 0.8269 - recall_m: 0.7128 - val_loss

In [32]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test_1d, y_test, verbose=1)



In [33]:
print(loss)
print(accuracy)
print(f1_score)
print(precision)
print(recall)

0.36411874425146473
0.8382682484226569
0.8188735674313347
0.9045048350632675
0.7566967988074423


In [34]:
model.save('1-3-DNN.h5')