In [34]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import datetime
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from matplotlib import pyplot as plt
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.layers import Masking
from tqdm import tqdm_notebook
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler
from keras.utils import to_categorical

In [35]:
data= pd.read_csv('preprocessed_data.csv', low_memory=False)

In [36]:
## 하루 최대 세션 길이 설정 (Padding)
check_max_session = 55

In [37]:
data['Time'] = pd.to_datetime(data['Time'])

In [38]:
data.head()

Unnamed: 0.1,Unnamed: 0,UID,URL,Site,Domain,Time,Title,keyword_p,keyword_t,category_1,category_2,category_3,category_4,category_5,category_6,ownership_1,ownership_2,PC,session_id
0,0,20120712210FAAE99B43,http://mw.cjmall.com/cjupload/cjmall/planshop/...,오트랜드,cjmall.com/,2014-06-01 14:07:26,CJmall,,,비즈니스/쇼핑,종합쇼핑몰,홈쇼핑,CJ오쇼핑,,,Retailer,홈쇼핑,0,0
1,1,20120712210FAAE99B43,http://page.uplus.co.kr/index.html?v=MjAxMzEwM...,LG유플러스,uplus.co.kr/,2014-06-01 14:25:31,U+Page,,,비즈니스/쇼핑,통신/네트워크,무선/이동통신,LG유플러스,,,Maker,무선/이동통신,0,1
2,2,20120712210FAAE99B43,http://page.uplus.co.kr/,U+Page 모바일,uplus.co.kr/,2014-06-01 14:25:34,http://page.uplus.co.kr/,,,비즈니스/쇼핑,통신/네트워크,무선/이동통신,LG유플러스,,,Maker,무선/이동통신,0,1
3,3,20120712210FAAE99B43,http://page.uplus.co.kr/index.html?v=MjAxMzEwM...,LG유플러스,uplus.co.kr/,2014-06-01 14:25:40,U+Page,,,비즈니스/쇼핑,통신/네트워크,무선/이동통신,LG유플러스,,,Maker,무선/이동통신,0,1
4,4,20120712210FAAE99B43,http://m.naver.com/,네이버 모바일,naver.com/,2014-06-01 14:25:46,NAVER,,,컴퓨터/인터넷,인터넷,포털,네이버,,,Portal,네이버,0,1


In [39]:
data.drop(['Unnamed: 0', 'Site', 'Title', 'keyword_t', 'keyword_p', 'category_3', 'category_4', 'category_5', 'category_6', 
           'ownership_1'], axis = 1, inplace = True)

In [40]:
data.head()

Unnamed: 0,UID,URL,Domain,Time,category_1,category_2,ownership_2,PC,session_id
0,20120712210FAAE99B43,http://mw.cjmall.com/cjupload/cjmall/planshop/...,cjmall.com/,2014-06-01 14:07:26,비즈니스/쇼핑,종합쇼핑몰,홈쇼핑,0,0
1,20120712210FAAE99B43,http://page.uplus.co.kr/index.html?v=MjAxMzEwM...,uplus.co.kr/,2014-06-01 14:25:31,비즈니스/쇼핑,통신/네트워크,무선/이동통신,0,1
2,20120712210FAAE99B43,http://page.uplus.co.kr/,uplus.co.kr/,2014-06-01 14:25:34,비즈니스/쇼핑,통신/네트워크,무선/이동통신,0,1
3,20120712210FAAE99B43,http://page.uplus.co.kr/index.html?v=MjAxMzEwM...,uplus.co.kr/,2014-06-01 14:25:40,비즈니스/쇼핑,통신/네트워크,무선/이동통신,0,1
4,20120712210FAAE99B43,http://m.naver.com/,naver.com/,2014-06-01 14:25:46,컴퓨터/인터넷,인터넷,네이버,0,1


### 파생변수 생성

In [41]:
#세션 시작시간과 끝시간 구하기
se_time_min = data.groupby(['UID', 'session_id'])['Time'].min()
se_time_max = data.groupby(['UID', 'session_id'])['Time'].max()

In [42]:
data = pd.merge(data,pd.DataFrame(se_time_max-se_time_min),
                        left_on = ['UID', 'session_id'], right_on = ['UID', 'session_id'])

In [43]:
#세션당 방문사이트 수
data = pd.merge(data,pd.DataFrame(data.groupby(['UID', 'session_id']).count()['Domain']),
                        left_on = ['UID', 'session_id'], right_on = ['UID', 'session_id'])

In [44]:
# 세션별 쇼핑 사이트 방문 빈도 (Ownership2 : 종합쇼핑몰, 소셜커머스, 지불/결제)
data['쇼핑사이트'] = data['ownership_2']

In [45]:
data['쇼핑사이트'] = data['쇼핑사이트'].apply(lambda x:1 if x in(['종합쇼핑몰', '소셜커머스', '지불/결제']) else 0)

### 구매 데이터 분석

In [46]:
purchase = pd.read_excel('140716_SSK 구매행태 조사 Raw Data_F.xlsx')

In [47]:
#가전제품 구입자(인터넷에서 정보를 얻은 사람들)
purchase_가전제품 = purchase[(purchase['A4_2_1'] == 3) & ((purchase['A4_5'] == 3) | (purchase['A4_5'] == 4))]
#패션의류/잡화 구입자(인터넷에서 정보를 얻은 사람들)
purchase_패션 = purchase[(purchase['A5_2_1'] == 3) & ((purchase['A5_5'] == 3) | (purchase['A5_5'] == 4))]
#화장품 구입자(인터넷에서 정보를 얻은 사람들)
purchase_화장품 = purchase[(purchase['A6_2_1'] == 3) & ((purchase['A6_5'] == 3) | (purchase['A6_5'] == 4))]
#186명이라 중순과 하순만 고려해야 할듯

In [48]:
print(purchase_가전제품.shape)
print(purchase_패션.shape)
print(purchase_화장품.shape)

(98, 151)
(190, 151)
(155, 151)


In [49]:
data = pd.merge(data,purchase[['UID','A4_2_1','A4_2_2', 'A5_2_1','A5_2_2' ,'A6_2_1','A6_2_2' ,'A4_5', 'A5_5', 'A6_5']],
                how = 'inner',left_on = 'UID', right_on= 'UID')

In [50]:
data.head()

Unnamed: 0,UID,URL,Domain_x,Time_x,category_1,category_2,ownership_2,PC,session_id,Time_y,...,쇼핑사이트,A4_2_1,A4_2_2,A5_2_1,A5_2_2,A6_2_1,A6_2_2,A4_5,A5_5,A6_5
0,20120712210FAAE99B43,http://mw.cjmall.com/cjupload/cjmall/planshop/...,cjmall.com/,2014-06-01 14:07:26,비즈니스/쇼핑,종합쇼핑몰,홈쇼핑,0,0,00:00:00,...,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0
1,20120712210FAAE99B43,http://page.uplus.co.kr/index.html?v=MjAxMzEwM...,uplus.co.kr/,2014-06-01 14:25:31,비즈니스/쇼핑,통신/네트워크,무선/이동통신,0,1,00:06:53,...,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0
2,20120712210FAAE99B43,http://page.uplus.co.kr/,uplus.co.kr/,2014-06-01 14:25:34,비즈니스/쇼핑,통신/네트워크,무선/이동통신,0,1,00:06:53,...,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0
3,20120712210FAAE99B43,http://page.uplus.co.kr/index.html?v=MjAxMzEwM...,uplus.co.kr/,2014-06-01 14:25:40,비즈니스/쇼핑,통신/네트워크,무선/이동통신,0,1,00:06:53,...,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0
4,20120712210FAAE99B43,http://m.naver.com/,naver.com/,2014-06-01 14:25:46,컴퓨터/인터넷,인터넷,네이버,0,1,00:06:53,...,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0


In [51]:
data.drop(['URL',  'category_2'], axis = 1, inplace = True)

In [52]:
data = data.fillna(0)

  """Entry point for launching an IPython kernel.


In [53]:
data['category_1'] = data['category_1'].apply(lambda x:'기타' if x == 0 else x)

In [54]:
#category_1 label encoding
encoder = LabelEncoder()
data['category_1'] = encoder.fit_transform(data['category_1'])

In [55]:
print(encoder.classes_)

['게임' '경제/재테크' '교육/학교' '기타' '뉴스/미디어' '문화/예술' '비즈니스/쇼핑' '생활/건강' '스포츠/레저'
 '여행/세계정보' '연예/오락' '인물' '정치/사회' '종교' '컴퓨터/인터넷' '학문/사전']


## 가전제품 구입자 분류(0: 미구입, 1:6월 중순 구입, 2: 6월 하순 구입)

In [56]:
data.shape

(34217873, 19)

In [57]:
data.isna().sum()

UID            0
Domain_x       0
Time_x         0
category_1     0
ownership_2    0
PC             0
session_id     0
Time_y         0
Domain_y       0
쇼핑사이트          0
A4_2_1         0
A4_2_2         0
A5_2_1         0
A5_2_2         0
A6_2_1         0
A6_2_2         0
A4_5           0
A5_5           0
A6_5           0
dtype: int64

In [58]:
data['가전제품구매'] = 0 

In [59]:
#6월 초순 구입자
data.loc[((data.A4_2_1==3) & (data.A4_2_2==1)), '가전제품구매'] = 1

#6월 중순 구입자
data.loc[((data.A4_2_1==3) & (data.A4_2_2==2)), '가전제품구매'] = 2

#6월 하순 구입자
data.loc[((data.A4_2_1==3) & (data.A4_2_2==3)), '가전제품구매'] = 3

In [60]:
data.rename(columns ={'Domain_y': '세션당_방문사이트수'}, inplace= True)

In [61]:
data.drop(['Domain_x', 'ownership_2'], axis =1, inplace = True)

## 패션, 화장품 구입자들

In [62]:
data['패션구매'] = 0
data['화장품구매'] = 0

In [63]:
#6월 초순 구입자
data.loc[((data.A5_2_1==3) & (data.A5_2_2==2)), '패션구매'] = 1

#6월 중순 구입자
data.loc[((data.A5_2_1==3) & (data.A5_2_2==2)), '패션구매'] = 2

#6월 하순 구입자
data.loc[((data.A5_2_1==3) & (data.A5_2_2==3)), '패션구매'] = 3

#6월 초순 구입자
data.loc[((data.A6_2_1==3) & (data.A6_2_2==2)), '화장품구매'] = 1

#6월 중순 구입자
data.loc[((data.A6_2_1==3) & (data.A6_2_2==2)), '화장품구매'] = 2

#6월 하순 구입자
data.loc[((data.A6_2_1==3) & (data.A6_2_2==3)), '화장품구매'] = 3

In [64]:
data.head()

Unnamed: 0,UID,Time_x,category_1,PC,session_id,Time_y,세션당_방문사이트수,쇼핑사이트,A4_2_1,A4_2_2,A5_2_1,A5_2_2,A6_2_1,A6_2_2,A4_5,A5_5,A6_5,가전제품구매,패션구매,화장품구매
0,20120712210FAAE99B43,2014-06-01 14:07:26,6,0,0,00:00:00,1,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
1,20120712210FAAE99B43,2014-06-01 14:25:31,6,0,1,00:06:53,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
2,20120712210FAAE99B43,2014-06-01 14:25:34,6,0,1,00:06:53,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
3,20120712210FAAE99B43,2014-06-01 14:25:40,6,0,1,00:06:53,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
4,20120712210FAAE99B43,2014-06-01 14:25:46,14,0,1,00:06:53,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0


In [65]:
data['Time_y'] = data['Time_y'].dt.total_seconds()

In [66]:
data.head()

Unnamed: 0,UID,Time_x,category_1,PC,session_id,Time_y,세션당_방문사이트수,쇼핑사이트,A4_2_1,A4_2_2,A5_2_1,A5_2_2,A6_2_1,A6_2_2,A4_5,A5_5,A6_5,가전제품구매,패션구매,화장품구매
0,20120712210FAAE99B43,2014-06-01 14:07:26,6,0,0,0.0,1,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
1,20120712210FAAE99B43,2014-06-01 14:25:31,6,0,1,413.0,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
2,20120712210FAAE99B43,2014-06-01 14:25:34,6,0,1,413.0,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
3,20120712210FAAE99B43,2014-06-01 14:25:40,6,0,1,413.0,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0
4,20120712210FAAE99B43,2014-06-01 14:25:46,14,0,1,413.0,6,0,1.0,2.0,3.0,2.0,4.0,1.0,5.0,2.0,1.0,0,2,0


In [69]:
data_초순 = data[data['Time_x']<= '2014-06-10']
data_중순 = data[(data['Time_x']<= '2014-06-20') & (data['Time_x'] > '2014-06-10')]
data_하순 = data[(data['Time_x']<= '2014-06-30') & (data['Time_x'] > '2014-06-20')]

In [104]:
data_초순['가전제품구매'].apply(lambda x:1 if x == 1 else 0)

0           0
1           0
2           0
3           0
4           0
           ..
34202827    0
34202828    0
34202829    0
34202830    0
34202831    0
Name: 가전제품구매, Length: 11969188, dtype: int64

In [105]:
구매_list = ['가전제품구매', '패션구매', '화장품구매']
for i in 구매_list:
    data_초순[i] = data_초순[i].apply(lambda x:1 if x== 1 else 0)
for i in 구매_list:
    data_중순[i] = data_중순[i].apply(lambda x:1 if x== 2 else 0)
for i in 구매_list:
    data_하순[i] = data_하순[i].apply(lambda x:1 if x== 3 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [106]:
print(data.shape)
print(data_초순.shape)

(34217873, 20)
(11969188, 20)


### RNN을 위한 전처리

In [107]:
def make_XY(df, category):
    df = df.set_index('UID')
    X = df[['category_1','세션당_방문사이트수', 'Time_y', '쇼핑사이트']]
    Y = df[[category + '구매']]
    X = X.reset_index()
    X = pd.concat([X,pd.get_dummies(X['category_1'])], axis = 1)
    Y = Y.groupby(Y.index).first()
    new_Y = Y[category + '구매']
    return X,new_Y

In [108]:
X_초순_가전제품,Y_초순_가전제품 = make_XY(data_초순,'가전제품')
X_중순_가전제품,Y_중순_가전제품 = make_XY(data_중순,'가전제품')
X_하순_가전제품,Y_하순_가전제품 = make_XY(data_하순,'가전제품')

X_초순_패션,Y_초순_패션 = make_XY(data_초순,'패션')
X_중순_패션,Y_중순_패션 = make_XY(data_중순,'패션')
X_하순_패션,Y_하순_패션 = make_XY(data_하순,'패션')

X_초순_화장품,Y_초순_화장품 = make_XY(data_초순,'화장품')
X_중순_화장품,Y_중순_화장품 = make_XY(data_중순,'화장품')
X_하순_화장품,Y_하순_화장품 = make_XY(data_하순,'화장품')

In [109]:
X_초순_패션.head()

Unnamed: 0,UID,category_1,세션당_방문사이트수,Time_y,쇼핑사이트,0,1,2,3,4,...,6,7,8,9,10,11,12,13,14,15
0,20120712210FAAE99B43,6,1,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,20120712210FAAE99B43,6,6,413.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,20120712210FAAE99B43,6,6,413.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,20120712210FAAE99B43,6,6,413.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,20120712210FAAE99B43,14,6,413.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


#### 3차원 array로 만드는 과정

In [110]:
def make_3D_array(X):
    new_X = []
    for UID in tqdm_notebook(X['UID'].unique()):
        temp = X[X['UID'] == UID]
        new_X.append(temp.iloc[:,2:].values)
    new_X = np.asarray(new_X)
    return new_X

In [111]:
new_X_초순_화장품 = make_3D_array(X_초순_화장품)
new_X_중순_화장품 = make_3D_array(X_중순_화장품)
new_X_하순_화장품 = make_3D_array(X_하순_화장품)

In [112]:
new_X_초순_패션 = make_3D_array(X_초순_패션)
new_X_중순_패션 = make_3D_array(X_중순_패션)
new_X_하순_패션 = make_3D_array(X_하순_패션)

In [113]:
new_X_초순_가전제품 = make_3D_array(X_초순_가전제품)
new_X_중순_가전제품 = make_3D_array(X_중순_가전제품)
new_X_하순_가전제품 = make_3D_array(X_하순_가전제품)

### 3d화 과정이 오래걸리니 csv파일로 저장

In [114]:
pd.DataFrame(Y_초순_가전제품).to_csv('RecoSystem/data/Y_초순_가전제품.csv')
pd.DataFrame(Y_초순_패션).to_csv('RecoSystem/data/Y_초순_패션.csv')
pd.DataFrame(Y_초순_화장품).to_csv('RecoSystem/data/Y_초순_화장품.csv')

pd.DataFrame(Y_중순_가전제품).to_csv('RecoSystem/data/Y_중순_가전제품.csv')
pd.DataFrame(Y_중순_패션).to_csv('RecoSystem/data/Y_중순_패션.csv')
pd.DataFrame(Y_중순_화장품).to_csv('RecoSystem/data/Y_중순_화장품.csv')

pd.DataFrame(Y_하순_가전제품).to_csv('RecoSystem/data/Y_하순_가전제품.csv')
pd.DataFrame(Y_하순_패션).to_csv('RecoSystem/data/Y_하순_패션.csv')
pd.DataFrame(Y_하순_화장품).to_csv('RecoSystem/data/Y_하순_화장품.csv')

### Oversampling

In [125]:
X_중순_패션_padded.shape

(1047, 5000, 19)

In [124]:
X_초순_화장품_padded.shape

(1038, 5000, 19)

In [119]:
def make_padding_and_oversample(X, Y):
    max_len = 5000
    X_padding = sequence.pad_sequences(X, maxlen = max_len)
    X_padding2 = X_padding.reshape(X.shape[0], max_len* X_padding.shape[2])
    #smote = SMOTE(random_state=0)
    #X_resampled, Y_resampled = smote.fit_resample(X_padding2, Y)
    #X_resampled = X_resampled.reshape(X_resampled.shape[0], max_len, X_padding.shape[2])
    return X_padding#, X_resampled, Y_resampled

In [121]:
X_초순_가전제품_padded =make_padding_and_oversample(new_X_초순_가전제품, Y_초순_가전제품)
X_초순_패션_padded =make_padding_and_oversample(new_X_초순_패션, Y_초순_패션)
X_초순_화장품_padded =make_padding_and_oversample(new_X_초순_화장품, Y_초순_화장품)

X_중순_가전제품_padded =make_padding_and_oversample(new_X_중순_가전제품, Y_중순_가전제품)
X_중순_패션_padded =make_padding_and_oversample(new_X_중순_패션, Y_중순_패션)
X_중순_화장품_padded =make_padding_and_oversample(new_X_중순_화장품, Y_중순_화장품)

X_하순_가전제품_padded =make_padding_and_oversample(new_X_하순_가전제품, Y_하순_가전제품)
X_하순_패션_padded =make_padding_and_oversample(new_X_하순_패션, Y_하순_패션)
X_하순_화장품_padded =make_padding_and_oversample(new_X_하순_화장품, Y_하순_화장품)

In [127]:
df1 = pd.DataFrame(X_초순_패션_padded.reshape(-1, 5000*19))
df1.to_csv('RecoSystem/data/초순_패션_3d_array.csv', index=False)
df2 = pd.DataFrame(X_초순_가전제품_padded.reshape(-1, 5000*19))
df2.to_csv('RecoSystem/data/초순_가전제품_3d_array.csv', index=False)
df3 = pd.DataFrame(X_초순_화장품_padded.reshape(-1, 5000*19))
df3.to_csv('RecoSystem/data/초순_화장품_3d_array.csv', index=False)

df1 = pd.DataFrame(X_중순_패션_padded.reshape(-1, 5000*19))
df1.to_csv('RecoSystem/data/중순_패션_3d_array.csv', index=False)
df2 = pd.DataFrame(X_중순_가전제품_padded.reshape(-1, 5000*19))
df2.to_csv('RecoSystem/data/중순_가전제품_3d_array.csv', index=False)
df3 = pd.DataFrame(X_중순_화장품_padded.reshape(-1, 5000*19))
df3.to_csv('RecoSystem/data/중순_화장품_3d_array.csv', index=False)

df1 = pd.DataFrame(X_하순_패션_padded.reshape(-1, 5000*19))
df1.to_csv('RecoSystem/data/하순_패션_3d_array.csv', index=False)
df2 = pd.DataFrame(X_하순_가전제품_padded.reshape(-1, 5000*19))
df2.to_csv('RecoSystem/data/하순_가전제품_3d_array.csv', index=False)
df3 = pd.DataFrame(X_하순_화장품_padded.reshape(-1, 5000*19))
df3.to_csv('RecoSystem/data/하순_화장품_3d_array.csv', index=False)

## 여기까지 전처리 과정