# Graph Convolution Network 요약

- 상품 체계 정보를 label로 이용
- 배송2.0 으로만 진행
- (***) 중요한 step이 추가되었다 : 그래프 생성시 interaction이 없는 node들을 자기 자신과의 interaction으로 선언해주었다
- 2가지 ver이 존재(여기선 일반적인 경우로 진행)
> - 두 ver의 차이: 서로 다른 카테고리에 속한 상품 간 graph-edge의 weight를 조절하는 장치가 있는지 여부

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import glob
import os
import pickle

%matplotlib inline

In [2]:
from sklearn import cluster, metrics

In [5]:
import pandas as pd
import glob
import os

In [3]:
import networkx as nx
from collections import Counter

In [4]:
pwd

'/home/wmind2/myeonggyulee/SNU_project'

---

### # 상품 체계( 상품 + 카테고리 정보) 
- 저장한거 불러온다! (prod_info.csv)

In [5]:
prod_cate = pd.read_csv('result_data/prod_info.csv', index_col = 0)

  mask |= (ar1 == a)


In [6]:
prod_cate.head()

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd,dcate_nm,scate_nm,mcate_nm,lcate_nm
0,194782399,4103733.0,3100659.0,2100110.0,1100019.0,반상기/식기 홈세트,식기,테이블웨어/식기,주방용품
1,228663807,4102667.0,3100485.0,2100083.0,1100014.0,이외김치,김치,김치/반찬,신선식품
2,320111031,4102071.0,3100386.0,2100064.0,1100011.0,무지/민무늬 셔츠,기본/무지 셔츠,블라우스/셔츠,브랜드 여성의류
3,162068532,4100445.0,3100082.0,2100490.0,1100091.0,싱글(소형)-1인용,전기장판,전기매트/요,계절가전
4,189956810,4104596.0,3100350.0,2100148.0,1100046.0,장지갑,남성 지갑,지갑/벨트,가방/잡화


In [7]:
prod_cate.nunique()

pid         5591911
dcate_cd       6929
scate_cd       1947
mcate_cd        451
lcate_cd         78
dcate_nm       5119
scate_nm       1494
mcate_nm        367
lcate_nm         78
dtype: int64

In [8]:
prod_cate.shape

(5591911, 9)

In [9]:
prod_cate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5591911 entries, 0 to 5591915
Data columns (total 9 columns):
pid         int64
dcate_cd    float64
scate_cd    float64
mcate_cd    float64
lcate_cd    float64
dcate_nm    object
scate_nm    object
mcate_nm    object
lcate_nm    object
dtypes: float64(4), int64(1), object(4)
memory usage: 426.6+ MB


---
---

## # 세션 데이터 load & Graph 생성
- prod_info : 상품 + 카테고리 정보 (클릭 데이터 기준이므로, 구매 상품 대비 손실율을 확인해야 한다!)
- tmp_sess_1206 : 11월 26일부터 이전 6주의 멤버별 클릭 session 데이터
- dlv_cat : 상품 카테고리 정보


In [None]:
# Load the previously built click sessions from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# files that were produced by this project.
with open('result_data/tmp_sess_1206', 'rb') as f:
    tmp_sess = pickle.load(f)

In [19]:
len(tmp_sess) # 6주치

60746812

In [None]:
len(tmp_sess) # 4일

4613890

In [None]:
# Turn every session into directed edges between consecutive items:
# a session [a, b, c] contributes (a, b) and (b, c); sessions with fewer than
# two items contribute nothing.
graph_list = []
for k in range(len(tmp_sess)):
    session = tmp_sess[k]
    graph_list.extend(zip(session, session[1:]))

In [58]:
len(graph_list) # 6주

151916990

In [None]:
len(graph_list)

6635558



---





### - weight 카운트 & 그래프 생성
- (***) 중요한 step이 추가되었다 : 그래프 생성시 interaction이 없는 node들을 자기자신과의 interaction으로 선언해주었다

In [83]:
"""
이런식으로 가자
x = [1,1,2,3]
myset = set(x)
list(myset)
"""
x = [1,1,2,3]
print( list(set(x)) )

[1, 2, 3]


In [11]:
# Load the previously built (weighted) edge list from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# files that were produced by this project.
with open('result_data/graph_list_1206', 'rb') as f:
    graph_list = pickle.load(f)

In [12]:
len(graph_list)

68497234

In [13]:
graph_list[:5]

[(602610157, 600515938, {'weight': 1}),
 (600515938, 600185944, {'weight': 6}),
 (600185944, 601529111, {'weight': 1}),
 (601529111, 600016493, {'weight': 1}),
 (600016493, 601333401, {'weight': 1})]

---

- 연습

In [58]:
x = graph_list[:5].copy()

In [60]:
tmp_list = []
tmp_new = [tmp_list.extend(list(item[:2])) for item in x]

In [61]:
tmp_list

[602610157,
 600515938,
 600515938,
 600185944,
 600185944,
 601529111,
 601529111,
 600016493,
 600016493,
 601333401]

In [62]:
tmp_new # 이렇게되겠지!? - 이해 잘 하기!

[None, None, None, None, None]

---

### 방법2로 가자!

##### 방법 1

In [63]:
%%time
## idea : graph_list에 들어가있는 vertex들은 node로서 선언될 수 있어 - 이 애들을 모아서 set을 만들고(distinct하게 추려주기위해), 없는 애들을 골라낼 수 있도록 작업하자!
tmp_list = []
tmp_new = [tmp_list.extend(list(item[:2])) for item in graph_list] # 실제로 tmp_new에 할당된 애들은 없을거야!

CPU times: user 19.1 s, sys: 843 ms, total: 19.9 s
Wall time: 19.9 s


In [80]:
len(tmp_list)

136994468

##### 방법 2

In [76]:
%%time
## idea : graph_list에 들어가있는 vertex들은 node로서 선언될 수 있어 - 이 애들을 모아서 set을 만들고(distinct하게 추려주기위해), 없는 애들을 골라낼 수 있도록 작업하자!
tmp_list_v2 = []
for _, edge in enumerate(graph_list[:5]):
    print(list(edge[:2]))
    #print(edge[0], edge[1], edge[2]['weight'])

[602610157, 600515938]
[600515938, 600185944]
[600185944, 601529111]
[601529111, 600016493]
[600016493, 601333401]
CPU times: user 547 µs, sys: 0 ns, total: 547 µs
Wall time: 375 µs


In [15]:
%%time
## Idea: every endpoint that appears in graph_list can be declared as a node.
## Collect all endpoints here (duplicates included); turning them into a set
## afterwards lets us pick out the products that never occur in the graph.
tmp_list_v2 = []
for edge in graph_list:
    # edge is (source, target, attrs); only the two endpoints are kept
    tmp_list_v2.extend(edge[:2])

CPU times: user 26.2 s, sys: 1.17 s, total: 27.4 s
Wall time: 27.4 s


In [79]:
len(tmp_list_v2)

136994468

---

### # not_used_nodes 선언
- used_graph_nodes이용

In [16]:
### Go with method 2 (the safe one): keep each collected endpoint exactly once
used_graph_nodes = list(set(tmp_list_v2))

In [17]:
len( used_graph_nodes ) # len(G.nodes) = 5148349와 맞다

5148349

In [90]:
x = [(1,2,{'weight':1})]

In [91]:
x

[(1, 2, {'weight': 1})]

In [92]:
x.append((1,3,{'weight':1}))

In [93]:
x

[(1, 2, {'weight': 1}), (1, 3, {'weight': 1})]

In [96]:
graph_list[:5]

[(602610157, 600515938, {'weight': 1}),
 (600515938, 600185944, {'weight': 6}),
 (600185944, 601529111, {'weight': 1}),
 (601529111, 600016493, {'weight': 1}),
 (600016493, 601333401, {'weight': 1})]

In [18]:
# not_used_nodes 선언

not_used_nodes = prod_cate[~prod_cate.pid.isin(used_graph_nodes)].pid.unique().tolist()

In [19]:
len(not_used_nodes)

443562

In [20]:
not_used_nodes[:5]

[161585551, 283309162, 273754259, 317813144, 301904877]

In [21]:
%%time
# Products that never appear as an edge endpoint are appended to graph_list as
# self-loops of weight 1.
in_graph = prod_cate.pid.isin(used_graph_nodes)
not_used_nodes = prod_cate.loc[~in_graph, 'pid'].unique().tolist()

# A fresh attribute dict is created per edge so that no two edges share one.
graph_list.extend((pid, pid, {'weight': 1}) for pid in not_used_nodes)

CPU times: user 25.2 s, sys: 1.49 s, total: 26.7 s
Wall time: 7.52 s


In [22]:
len(graph_list) # 기존 68497234

68940796

In [23]:
graph_list[-5:] # 이렇게 잘 추가!?

[(146583113, 146583113, {'weight': 1}),
 (319525783, 319525783, {'weight': 1}),
 (125568429, 125568429, {'weight': 1}),
 (102097148, 102097148, {'weight': 1}),
 (100156718, 100156718, {'weight': 1})]

----
---

# 정의한 graph 사용!

#### networkx 너무 많은 메모리 사용하는 문제 : https://stackoverflow.com/questions/35437439/why-is-networkx-consuming-all-my-memory

In [24]:
%%time
### Undirected
# Build a directed graph from the weighted edge list, then collapse it into an
# undirected graph.
# NOTE(review): when both (u, v) and (v, u) are present, to_undirected() keeps a
# single edge; per the networkx documentation its attributes are taken in an
# arbitrary encounter order, so the two weights are presumably NOT summed --
# confirm this is the intended weighting.
G = nx.DiGraph(graph_list)
G = G.to_undirected( ) # reciprocal=True would keep only edges that exist in both directions (the default is False)

CPU times: user 14min 30s, sys: 1min 59s, total: 16min 29s
Wall time: 16min 29s


In [25]:
# not_used_nodes : (146583113, 146583113, {'weight': 1}) 확인해보자
G.get_edge_data(146583113,146583113,default=0)

{'weight': 1}

---

#### 그래프 저장 / 불러오기

In [26]:
# 저장
nx.write_gpickle(G, "result_data/graph_1209.gpickle")

In [10]:
# 불러오기
G = nx.read_gpickle("result_data/graph_1209.gpickle")

In [11]:
len(G.nodes) # (graph 수정하고서)559만개의 상품 다 존재하는지 확인!

5591911

In [25]:
len(G.nodes) #(graph 수정 전)

5148349

In [1]:
5591911 - 5148349

443562

In [None]:
len(G.nodes) # (4일동안의 규모)

1106646



---



---



## - (*)nodes 들의 카테고리 속성 정의해주자
- X_features, labels 로 쓰이는 카테고리 속성값들

In [12]:
### Undirected

undi_nodes = list(G.nodes)
#digraph_nodes = list(G_d.nodes)

In [13]:
undi_nodes[:5]

[602610157, 600515938, 600185944, 601529111, 600016493]

In [14]:
prod_cate.head()

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd,dcate_nm,scate_nm,mcate_nm,lcate_nm
0,194782399,4103733.0,3100659.0,2100110.0,1100019.0,반상기/식기 홈세트,식기,테이블웨어/식기,주방용품
1,228663807,4102667.0,3100485.0,2100083.0,1100014.0,이외김치,김치,김치/반찬,신선식품
2,320111031,4102071.0,3100386.0,2100064.0,1100011.0,무지/민무늬 셔츠,기본/무지 셔츠,블라우스/셔츠,브랜드 여성의류
3,162068532,4100445.0,3100082.0,2100490.0,1100091.0,싱글(소형)-1인용,전기장판,전기매트/요,계절가전
4,189956810,4104596.0,3100350.0,2100148.0,1100046.0,장지갑,남성 지갑,지갑/벨트,가방/잡화


In [15]:
prod_cate.info() # 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5591911 entries, 0 to 5591915
Data columns (total 9 columns):
pid         int64
dcate_cd    float64
scate_cd    float64
mcate_cd    float64
lcate_cd    float64
dcate_nm    object
scate_nm    object
mcate_nm    object
lcate_nm    object
dtypes: float64(4), int64(1), object(4)
memory usage: 426.6+ MB


In [16]:
prod_cate.isnull().sum(axis=0)

pid         0
dcate_cd    1
scate_cd    1
mcate_cd    1
lcate_cd    1
dcate_nm    1
scate_nm    1
mcate_nm    1
lcate_nm    1
dtype: int64

In [17]:
# Attach the category codes to the graph nodes, keeping the node order of G:
# row i of this table has to describe the same product as entry i of undi_nodes.
# The join key is spelled out and the merge is validated, so a repeated pid in
# prod_cate raises instead of silently adding rows and shifting every row after it.
undi_nodes_w_cate = (
    pd.DataFrame({'pid': undi_nodes})
    .merge(
        prod_cate[['pid', 'dcate_cd', 'scate_cd', 'mcate_cd', 'lcate_cd']],
        on='pid',
        how='left',
        validate='many_to_one',
    )
    .reset_index(drop=True)
)

In [18]:
undi_nodes_w_cate # 맞다

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd
0,602610157,4100879.0,3100174.0,2100020.0,1100004.0
1,600515938,4100077.0,3100012.0,2100003.0,1100000.0
2,600185944,4100042.0,3100007.0,2100002.0,1100000.0
3,601529111,4105068.0,3102133.0,2100425.0,1100059.0
4,600016493,4102692.0,3100489.0,2100084.0,1100014.0
...,...,...,...,...,...
5591906,146583113,4101359.0,3100258.0,2100504.0,1100006.0
5591907,319525783,4101359.0,3100258.0,2100504.0,1100006.0
5591908,125568429,4101359.0,3100258.0,2100504.0,1100006.0
5591909,102097148,4101359.0,3100258.0,2100504.0,1100006.0


In [31]:
undi_nodes_w_cate.isnull().sum(axis=0) # null 값이 있다 -> float형으로 들어간 이유

pid         0
dcate_cd    1
scate_cd    1
mcate_cd    1
lcate_cd    1
dtype: int64

In [19]:
# 아까 처음에 카테고리 정보 없는 상품 하나 있었지? 
# 얘는 나중에 처리하자 

undi_nodes_w_cate[undi_nodes_w_cate.pid == 10000033] 

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd
1839763,10000033,,,,


---
---

#### eg. 이런식으로 해보면?
- 흐름 예제

In [84]:
tmp = pd.DataFrame({'x':[3], 'y':[1]})

In [85]:
tmp

Unnamed: 0,x,y
0,3,1


In [86]:
# 이렇게 None 인 상황에 처리해주는거야
tmp.at[0,'x']= None

In [87]:
tmp

Unnamed: 0,x,y
0,,1


In [88]:
# Fill the missing values of column x with 0, then cast every column to int.
# Deliberately no try/except: if the cast fails (for example because another
# column still holds NaN) the error must be visible instead of silently
# leaving float columns behind.
tmp.loc[tmp.x.isnull(), 'x'] = 0
tmp = tmp.astype(int)

In [89]:
tmp

Unnamed: 0,x,y
0,0,1


In [90]:
tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
x    1 non-null int64
y    1 non-null int64
dtypes: int64(2)
memory usage: 144.0 bytes


----
---

## - 위의 경우처럼 해보자

In [20]:
# Fill the category codes of products without category information with the
# sentinel -9999, then cast every column to int.
# Rows are selected by a missing dcate_cd; all four category columns of those
# rows are overwritten.
# Deliberately no try/except: a failed cast (e.g. a NaN left in another column)
# must surface here rather than silently leaving float columns behind.
null_rows = undi_nodes_w_cate.dcate_cd.isnull()
undi_nodes_w_cate.loc[null_rows, ['dcate_cd', 'scate_cd', 'mcate_cd', 'lcate_cd']] = -9999

# Cast all columns (pid included) back to int.
undi_nodes_w_cate = undi_nodes_w_cate.astype(int)


In [21]:
undi_nodes_w_cate.isnull().sum(axis=0)

pid         0
dcate_cd    0
scate_cd    0
mcate_cd    0
lcate_cd    0
dtype: int64

In [22]:
undi_nodes_w_cate[undi_nodes_w_cate.pid == 10000033] # 이렇게 바뀌게 되네!

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd
1839763,10000033,-9999,-9999,-9999,-9999


In [23]:
undi_nodes_w_cate.reset_index(drop=True, inplace=True)

---

- eg. 잠시 확인

In [None]:
import scipy.sparse as sp

In [None]:
# adj - matrix
adj = sp.coo_matrix( nx.adjacency_matrix(G) )

In [None]:
adj.get_shape() # 맞다!

(1106646, 1106646)



---


In [None]:
x = np.array([[0,2],
                [1,0]])

In [None]:
x

array([[0, 2],
       [1, 0]])

In [None]:
x_ = sp.coo_matrix(x)

In [None]:
x_.T.multiply(x_.T > x_).todense()

matrix([[0, 0],
        [2, 0]], dtype=int64)

In [None]:
x_r = x_ + x_.T.multiply(x_.T > x_) - x_.multiply( x_.T > x_)


In [None]:
x_r.todense()

matrix([[0, 2],
        [2, 0]], dtype=int64)



---



---



# 신경망 만들어보기

### - utils.py

In [24]:
import scipy.sparse as sp
import torch

def normalize(mx):
    """Row-normalize a sparse matrix.

    Every row is divided by its row sum. Rows whose sum is zero are left as
    all-zero rows instead of producing inf/nan.

    Args:
        mx: a scipy sparse matrix.

    Returns:
        The sparse product ``diag(1 / rowsum) @ mx``.
    """
    rowsum = np.asarray(mx.sum(1)).flatten()
    # Integer row sums cannot be raised to a negative power (numpy raises
    # ValueError), so promote them to float first; float inputs keep their dtype.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # 1 / 0 is expected for empty rows and is zeroed right below; silence only
    # that specific warning.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)


def accuracy(output, labels):
    """Return the fraction of rows whose top-scoring column equals the label.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of class indices.

    Returns:
        A 0-dim double tensor: number of correct predictions / len(labels).
    """
    # max over dim 1 yields (values, indices); only the indices are needed
    _, predicted = output.max(1)
    hits = predicted.type_as(labels).eq(labels).double().sum()
    return hits / len(labels)


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: any scipy sparse matrix (it is converted to COO internally).

    Returns:
        A torch sparse COO tensor with the same shape and float32 values.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # torch expects the indices as one (2, nnz) int64 tensor: rows on top,
    # columns below.
    indices = torch.from_numpy(
        np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor; the dtype (float32) is taken from `values`.
    return torch.sparse_coo_tensor(indices, values, shape)

# index mapping
#idx = np.array( undi_nodes, dtype=np.int32)
#idx_map = {j: i for i, j in enumerate(idx)}
#######################################


# labels
def encode_onehot(labels): # pd.get_dummies() is usually the more efficient choice than this helper
    """One-hot encode a sequence of hashable labels.

    Columns are ordered by first appearance of each label, so the encoding is
    reproducible from run to run (the previous set-based ordering was
    arbitrary). The identity matrix is built once rather than once per class.

    Args:
        labels: a re-iterable sequence of hashable labels.

    Returns:
        An int32 array of shape (len(labels), number of distinct labels) with
        exactly one 1 per row.
    """
    classes = list(dict.fromkeys(labels))
    column_of = {c: i for i, c in enumerate(classes)}
    identity = np.identity(len(classes), dtype=np.int32)
    rows = np.fromiter((column_of[c] for c in labels), dtype=np.intp)
    # Picking identity rows by class index produces the one-hot rows.
    labels_onehot = identity[rows]
    return labels_onehot

#labels = undi_nodes_w_categories['lcate_cd'] ) # 1st run: label = 'lcate' # this would also work, but get_dummies() seems more efficient
labels = pd.get_dummies(undi_nodes_w_cate['lcate_cd'] ).values # one-hot label matrix, one column per distinct lcate_cd

# adj , features
adj = sp.coo_matrix( nx.adjacency_matrix(G) )  # define adj (sparse adjacency matrix of G)
#adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) # symmetrization; considered already done by the directed -> undirected conversion

features = sp.csr_matrix( pd.get_dummies(undi_nodes_w_cate['scate_cd'] ).values , dtype=np.float32 )  # 1st run: one-hot scate_cd as node features
#features = sp.csr_matrix( pd.get_dummies(undi_nodes_w_categories['lcate_cd'] ).values , dtype=np.float32 ) # run 1.1

# normalize
#features = normalize(features)
# NOTE(review): an identity matrix is added before row-normalizing; nodes that
# were inserted into the graph as explicit self-loops presumably already have a
# diagonal entry, so they would get the self-connection twice -- confirm.
adj = normalize(adj + sp.eye(adj.shape[0]))


## set as input type
features = torch.FloatTensor(np.array(features.todense())) # dense features: using dcate ran out of memory; switching to a sparse tensor was attempted but failed, so the current (1st-run) features are scate
# features = sparse_mx_to_torch_sparse_tensor( features ) # sparse-tensor alternative
labels = torch.LongTensor(np.where(labels)[1]) # column index of the non-zero entry in each one-hot row = class index
adj = sparse_mx_to_torch_sparse_tensor(adj)

#######################################



---



---



## - layers.py

In [25]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    Basic graph-convolution layer in the spirit of https://arxiv.org/abs/1609.02907.

    The forward pass computes ``adj @ (input @ weight)`` and adds the bias
    when the layer has one.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialised; reset_parameters() fills in the values.
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.FloatTensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight and bias uniformly from [-1/sqrt(out_features), 1/sqrt(out_features)]."""
        bound = 1. / math.sqrt(self.weight.size(1))
        for param in (self.weight, self.bias):
            if param is not None:
                param.data.uniform_(-bound, bound)

    def forward(self, input, adj):
        projected = torch.mm(input, self.weight)
        # adj is used exactly as passed in: no degree (D^-1) normalisation is
        # applied inside this layer.
        aggregated = torch.sparse.mm(adj, projected)
        if self.bias is None:
            return aggregated
        return aggregated + self.bias

    def __repr__(self):
        return '{} ({} -> {})'.format(
            self.__class__.__name__, self.in_features, self.out_features)



---



---



 - cf. 참고

In [None]:
x = np.array([[1,2],
              [3,4]])

In [None]:
x

array([[1, 2],
       [3, 4]])

In [None]:
x = sp.csr_matrix(x)

In [None]:
x_s = sparse_mx_to_torch_sparse_tensor( x ) 

In [None]:
x_s.to_dense()

tensor([[1., 2.],
        [3., 4.]])

---

## models.py

In [26]:
import torch.nn as nn
import torch.nn.functional as F


class GCN(nn.Module):
    """Two graph-convolution layers with a PReLU activation and dropout in between.

    forward() returns both the log-softmax class scores and the hidden
    activation (taken after PReLU, before dropout) as the node embedding.
    """

    def __init__(self, nfeat, nhid, nclass, dropout, prelu_init):
        super().__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        # (An optional middle layer gc2: nhid -> nhid is currently disabled.)
        self.gc_last = GraphConvolution(nhid, nclass)
        self.dropout = dropout
        self.prelu_init = prelu_init

    def forward(self, x, adj):
        hidden = self.gc1(x, adj)
        embedding = F.prelu(hidden, self.prelu_init)
        # dropout is only active while the module is in training mode
        regularised = F.dropout(embedding, self.dropout, training=self.training)
        logits = self.gc_last(regularised, adj)
        log_probs = F.log_softmax(logits, dim=1)
        return log_probs, embedding



---



---



## train

In [27]:
from __future__ import division
from __future__ import print_function

import time
import numpy as np

import torch
import torch.nn.functional as F
import torch.optim as optim

In [14]:
torch.cuda.is_available()

True

In [29]:
# 1203 - embedding: the post-activation hidden values (the original note said ELU; prelu_init below indicates PReLU)
# Model and optimizer
# NOTE(review): prelu_init is passed as a plain tensor, not an nn.Parameter, so
# it is presumably not part of model.parameters() and stays fixed at 0.25 --
# confirm a non-trainable slope is intended.
model = GCN(nfeat = features.shape[1],
            nhid = 128,
            nclass=labels.max().item() + 1,
            dropout = 0.5 ,
            prelu_init = torch.tensor(0.25) )

optimizer = optim.Adam(model.parameters(),
                       lr = 0.01) # originally weight_decay = 5e-4; removed because weight decay is reportedly discouraged together with PReLU



def train(epoch, get_embedding = False):
    """Run one full-batch training step and print its loss/accuracy.

    Uses the module-level ``model``, ``optimizer``, ``features``, ``adj`` and
    ``labels``. The two formerly duplicated branches are merged, and the
    progress line is now printed for every call, including the one that
    returns the embedding (it used to return before reaching the print).

    Args:
        epoch: zero-based epoch index; only used in the progress line.
        get_embedding: when True, return the hidden embedding of this step.

    Returns:
        The embedding tensor (second output of ``model``) when
        ``get_embedding`` is True, otherwise None. The embedding comes from
        the forward pass of this step, i.e. before ``optimizer.step()``.
    """
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output, embedding = model(features, adj)
    loss_train = F.nll_loss(output, labels)
    acc_train = accuracy(output, labels )
    loss_train.backward()
    optimizer.step()

    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'time: {:.4f}s'.format(time.time() - t))

    if get_embedding:
        return embedding
    return None




# Training driver: run a fixed number of full-batch epochs and keep the
# embedding handed back by the final one.
epochs = 100  # total number of epochs

t_total = time.time()
for epoch in range(epochs):
    wants_embedding = (epoch + 1 == epochs)  # only the last epoch returns it
    step_result = train(epoch, wants_embedding)
    if wants_embedding:
        embedding = step_result
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

# Testing


Epoch: 0001 loss_train: 4.1606 acc_train: 0.0048 time: 102.3672s
Epoch: 0002 loss_train: 4.1213 acc_train: 0.0467 time: 101.8689s
Epoch: 0003 loss_train: 4.0923 acc_train: 0.1111 time: 95.2567s
Epoch: 0004 loss_train: 4.0636 acc_train: 0.1943 time: 93.1090s
Epoch: 0005 loss_train: 4.0510 acc_train: 0.2674 time: 93.3826s
Epoch: 0006 loss_train: 4.0289 acc_train: 0.3470 time: 91.7709s
Epoch: 0007 loss_train: 3.9638 acc_train: 0.4229 time: 94.8665s
Epoch: 0008 loss_train: 3.8287 acc_train: 0.4769 time: 96.7463s
Epoch: 0009 loss_train: 3.6592 acc_train: 0.5105 time: 97.9121s
Epoch: 0010 loss_train: 3.4885 acc_train: 0.5314 time: 92.2906s
Epoch: 0011 loss_train: 3.3566 acc_train: 0.5450 time: 95.2523s
Epoch: 0012 loss_train: 3.2325 acc_train: 0.5559 time: 95.0424s
Epoch: 0013 loss_train: 3.1098 acc_train: 0.5673 time: 92.3674s
Epoch: 0014 loss_train: 2.9878 acc_train: 0.5811 time: 92.9896s
Epoch: 0015 loss_train: 2.8639 acc_train: 0.5984 time: 92.5419s
Epoch: 0016 loss_train: 2.7314 acc_tra

----
---

# * 학습 후, embedding  파일 만들기

In [30]:
embedding.shape # 

torch.Size([5591911, 128])

In [34]:
undi_nodes_w_cate.head()

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd
0,602610157,4100879,3100174,2100020,1100004
1,600515938,4100077,3100012,2100003,1100000
2,600185944,4100042,3100007,2100002,1100000
3,601529111,4105068,3102133,2100425,1100059
4,600016493,4102692,3100489,2100084,1100014


In [33]:
undi_nodes_w_cate.tail()

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd
5591906,146583113,4101359,3100258,2100504,1100006
5591907,319525783,4101359,3100258,2100504,1100006
5591908,125568429,4101359,3100258,2100504,1100006
5591909,102097148,4101359,3100258,2100504,1100006
5591910,100156718,4101359,3100258,2100504,1100006


In [32]:
embedding

tensor([[ 0.4463,  0.2773,  0.1779,  ...,  0.2993,  0.2003,  0.2330],
        [ 0.5161,  0.2814,  0.3335,  ...,  0.1356,  0.1705,  0.2941],
        [ 0.4172,  0.2520,  0.2921,  ...,  0.1771,  0.1808,  0.3127],
        ...,
        [-0.0223,  0.6326,  0.3706,  ...,  0.5878, -0.0456,  0.3477],
        [-0.0223,  0.6326,  0.3706,  ...,  0.5878, -0.0456,  0.3477],
        [-0.0223,  0.6326,  0.3706,  ...,  0.5878, -0.0456,  0.3477]],
       grad_fn=<PreluBackward>)

In [36]:
undi_nodes_w_cate.shape

(5591911, 5)

In [35]:
embedding_npy = embedding.detach().numpy()

In [41]:
# 혹시 몰라 저장
np.save('result_data/embedding_1209_prelu.npy',embedding_npy)

---

In [None]:
embedding_npy = np.load('embedding_1203.npy')

In [None]:
embedding_npy

array([[-0.06229699,  0.10632266,  0.13854109, ...,  0.1667273 ,
         0.17042798, -0.03680032],
       [-0.03422737,  0.08680156,  0.10839143, ...,  0.13637027,
         0.1410108 , -0.01611423],
       [-0.02145267,  0.08144503,  0.10349715, ...,  0.12403204,
         0.1262565 , -0.02063572],
       ...,
       [ 0.19105117, -0.08257723,  0.18580681, ..., -0.0757308 ,
         0.1456488 , -0.08505833],
       [-0.07341862, -0.0332883 , -0.10092348, ..., -0.12907463,
        -0.045708  ,  0.21251215],
       [ 0.17877957,  0.15571123, -0.08540243, ..., -0.07913971,
         0.13730708, -0.04629767]], dtype=float32)

---

## embedding + (undi_nodes_w_cate + deal_name_1209)

In [37]:
# 1. embedding
embedding_df = pd.DataFrame(embedding_npy )

In [38]:
embedding_df.iloc[0][0]

0.44628644

In [39]:
# 2. undi_nodes_w_cate
undi_nodes_w_cate.head()

Unnamed: 0,pid,dcate_cd,scate_cd,mcate_cd,lcate_cd
0,602610157,4100879,3100174,2100020,1100004
1,600515938,4100077,3100012,2100003,1100000
2,600185944,4100042,3100007,2100002,1100000
3,601529111,4105068,3102133,2100425,1100059
4,600016493,4102692,3100489,2100084,1100014


In [None]:
# 3. deal_name_1209

In [40]:
deal_name_1209 = pd.read_csv('result_data/deal_name_1209.csv', index_col=0)

  mask |= (ar1 == a)


In [41]:
deal_name_1209.head()

Unnamed: 0,pid,dealNm
0,600006061,독차지 강아지 배변판 배변망 대형 중형 애견 개 고양이 화장실
1,600011647,[ANF] 강아지껌 모음전 (묶음가능)
2,600019657,비프월드 제육볶음용 돼지고기500g 프리미엄미국산
3,600027074,[에코스토어] 뉴질랜드친환경/베이비스킨케어/친환경로션/아기로션/아기샴푸/바디워시/배...
4,600031762,빈슨메시프 가성비 캠핑침낭 모음-감성 사계절 극 동계


In [43]:
# Attach the deal names by pid. The key is explicit and the merge is validated:
# a repeated pid in deal_name_1209 raises instead of silently duplicating rows,
# which would break the row-wise pairing with the embedding rows.
undi_nodes_w_cate_ = undi_nodes_w_cate[['pid', 'lcate_cd']].merge(
    deal_name_1209, on='pid', how='left', validate='many_to_one')

In [46]:
undi_nodes_w_cate_.head()

Unnamed: 0,pid,lcate_cd,dealNm
0,602610157,1100004,"쿠폰추가★ 11개구매시 쿠폰가8,170원! 한끼대용 간편선식 국내산100% 7가지맛..."
1,600515938,1100000,[실라리안] 국내산 돼지고기로 만든 가성비 갑 만두 모음!
2,600185944,1100000,[실라리안] 100% 우리콩 안동낫또 골라담기
3,601529111,1100059,"닥터딥 민감성피부관리 여드름성 피부, 아기피부, 수부지피부"
4,600016493,1100014,[쇼핑의모든것] 대천갓바위김 도시락김120봉 가격특가


In [44]:
undi_nodes_w_cate_.shape

(5591911, 3)

In [45]:
undi_nodes_w_cate_.isnull().sum(axis=0)

pid         0
lcate_cd    0
dealNm      0
dtype: int64

---

In [None]:
apply(lambda x: "".join(x.split()))

In [47]:
undi_nodes_w_cate_['dealNm'].apply(lambda x: "".join(x.split()) )

0          쿠폰추가★11개구매시쿠폰가8,170원!한끼대용간편선식국내산100%7가지맛!10+1/...
1                                  [실라리안]국내산돼지고기로만든가성비갑만두모음!
2                                      [실라리안]100%우리콩안동낫또골라담기
3                                닥터딥민감성피부관리여드름성피부,아기피부,수부지피부
4                                 [쇼핑의모든것]대천갓바위김도시락김120봉가격특가
                                 ...                        
5591906                                   LG와인3GT390메탈릭하드케이스
5591907           휴대폰핸드폰악세사리케이스아이폰6플러스아이메탈컬러젤리케이스갤럭시LG아이폰케이스
5591908                             1갤럭시A8(A800)퍼펙트올라운드하이브리드
5591909                        갤럭시S8플러스G955투명스탠드이중보호케이스(거치형)
5591910                     S4198가지디자인LGG6하드케이스/핸드폰케이스/커플케이스
Name: dealNm, Length: 5591911, dtype: object

In [49]:
# id = pid + dealNm
undi_nodes_w_cate_['id'] = undi_nodes_w_cate_['pid'].astype('str') + '_' + undi_nodes_w_cate_['dealNm'].apply(lambda x: "".join(x.split()) )

In [50]:
undi_nodes_w_cate_.head()

Unnamed: 0,pid,lcate_cd,dealNm,id
0,602610157,1100004,"쿠폰추가★ 11개구매시 쿠폰가8,170원! 한끼대용 간편선식 국내산100% 7가지맛...","602610157_쿠폰추가★11개구매시쿠폰가8,170원!한끼대용간편선식국내산100%..."
1,600515938,1100000,[실라리안] 국내산 돼지고기로 만든 가성비 갑 만두 모음!,600515938_[실라리안]국내산돼지고기로만든가성비갑만두모음!
2,600185944,1100000,[실라리안] 100% 우리콩 안동낫또 골라담기,600185944_[실라리안]100%우리콩안동낫또골라담기
3,601529111,1100059,"닥터딥 민감성피부관리 여드름성 피부, 아기피부, 수부지피부","601529111_닥터딥민감성피부관리여드름성피부,아기피부,수부지피부"
4,600016493,1100014,[쇼핑의모든것] 대천갓바위김 도시락김120봉 가격특가,600016493_[쇼핑의모든것]대천갓바위김도시락김120봉가격특가


In [51]:
undi_nodes_w_cate_[['id', 'lcate_cd']].head()

Unnamed: 0,id,lcate_cd
0,"602610157_쿠폰추가★11개구매시쿠폰가8,170원!한끼대용간편선식국내산100%...",1100004
1,600515938_[실라리안]국내산돼지고기로만든가성비갑만두모음!,1100000
2,600185944_[실라리안]100%우리콩안동낫또골라담기,1100000
3,"601529111_닥터딥민감성피부관리여드름성피부,아기피부,수부지피부",1100059
4,600016493_[쇼핑의모든것]대천갓바위김도시락김120봉가격특가,1100014


In [None]:
# 이걸 붙여주자 (label로 lcate_nm)
undi_nodes_w_categories[['id', 'lcate_cd']].merge(dlv_cat[['lcate_cd', 'lcate_nm']].drop_duplicates().reset_index(drop=True),
                                                    how='left' ).drop(columns=['lcate_cd'])

Unnamed: 0,id,lcate_nm
0,600181479_[만반잘부]오바다부드러운국내산오징어,신선식품
1,600186873_[만반잘부]주재원삼겹살/오겹살/목살500g모음전,신선식품
2,600185426_핫한청도아이스홍시&감말랭이모음전,신선식품
3,600118584_사과즙/도라지배즙/배도라지즙/포도즙/배즙/통째로과일즙,커피/음료
4,600186422_악세사리함쥬얼리함보석함시계보관함모음전,쥬얼리/시계
...,...,...
1106364,117164526_[5%적립]ThePrincetonCompaniontoMathema...,도서/교육/음반
1106365,159880060_버블뮤다이노캡미아방지배낭인형가방영유아가방,유아동 신발/잡화
1106366,101361801_[무료배송]키친아트아르떼열센서후라이팬30cm인덕션겸용,주방용품
1106367,156809981_인증점K제트VS20R9044SA무선청소기미세먼지차단베터리패키지,생활가전


In [None]:
undi_nodes_w_categories_  =  undi_nodes_w_categories[['id', 'lcate_cd']]\
                    .merge(dlv_cat[['lcate_cd', 'lcate_nm']].drop_duplicates().reset_index(drop=True), how='left' )\
                    .drop(columns=['lcate_cd'])

In [53]:
embedding_df = pd.concat([ undi_nodes_w_cate_[['id', 'lcate_cd']] , embedding_df ], axis=1);embedding_df.head(2)

Unnamed: 0,id,lcate_cd,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,"602610157_쿠폰추가★11개구매시쿠폰가8,170원!한끼대용간편선식국내산100%...",1100004,0.446286,0.277313,0.177863,0.224614,0.053937,0.116937,0.243198,0.347476,...,0.297818,-0.004374,0.139615,0.317857,0.093901,0.317629,0.224229,0.299337,0.200292,0.232984
1,600515938_[실라리안]국내산돼지고기로만든가성비갑만두모음!,1100000,0.516052,0.281411,0.333511,0.2548,-0.006607,0.032997,0.148115,0.360973,...,0.360331,0.038403,0.039156,0.249721,0.167012,0.295624,0.055712,0.135611,0.170498,0.29412


In [54]:
embedding_df.to_csv('embedding_1209.csv')

In [6]:
# 불러오기
embedding_df = pd.read_csv('embedding_1209.csv')
embedding_df = embedding_df.iloc[:,1:]

In [9]:
embedding_df.head()

Unnamed: 0,id,lcate_cd,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,"602610157_쿠폰추가★11개구매시쿠폰가8,170원!한끼대용간편선식국내산100%...",1100004,0.446286,0.277313,0.177863,0.224614,0.053937,0.116937,0.243198,0.347476,...,0.297818,-0.004374,0.139615,0.317857,0.093901,0.317629,0.224229,0.299337,0.200292,0.232984
1,600515938_[실라리안]국내산돼지고기로만든가성비갑만두모음!,1100000,0.516052,0.281411,0.333511,0.2548,-0.006607,0.032997,0.148115,0.360973,...,0.360331,0.038403,0.039156,0.249721,0.167012,0.295624,0.055712,0.135611,0.170498,0.29412
2,600185944_[실라리안]100%우리콩안동낫또골라담기,1100000,0.417226,0.252049,0.292064,0.212674,0.004232,0.063358,0.146772,0.339002,...,0.345188,0.015622,0.104531,0.276849,0.135741,0.272882,0.071401,0.177105,0.180849,0.31272
3,"601529111_닥터딥민감성피부관리여드름성피부,아기피부,수부지피부",1100059,0.421385,0.28896,0.143851,0.133606,0.082439,0.099255,0.236265,0.350072,...,0.334989,0.111873,0.071139,0.361715,0.05126,0.204689,0.237831,0.210794,0.070109,0.124177
4,600016493_[쇼핑의모든것]대천갓바위김도시락김120봉가격특가,1100014,0.544923,0.384857,0.284561,0.176304,-0.009811,-0.000901,0.03351,0.195942,...,0.45769,-0.0311,0.037204,0.394122,0.054656,0.29235,-0.01487,0.095671,0.238219,0.325416


---

## # word2vec 포맷으로

In [10]:
word2vec_fmt_1209 = [[5591911, 128]]

In [11]:
word2vec_fmt_1209.extend( embedding_df.drop(columns=['lcate_cd']).values.tolist() ) # ram 엄청 잡아먹는다

In [12]:
# 확인
len(word2vec_fmt_1209)

5591912

In [13]:
# 확인
word2vec_fmt_1209[:2]

[[5591911, 128],
 ['602610157_쿠폰추가★11개구매시쿠폰가8,170원!한끼대용간편선식국내산100%7가지맛!10+1/30+5/50+10추가증정!',
  0.44628644,
  0.27731293,
  0.17786291,
  0.22461429,
  0.05393718,
  0.11693694,
  0.24319808,
  0.34747642,
  0.24231112,
  0.08423729,
  0.36705879999999996,
  -0.034921084,
  0.13799348,
  0.20650537,
  0.2674704,
  0.2165,
  0.293976,
  0.35595274,
  0.12915194,
  0.057837263,
  0.38792548,
  0.10130472,
  0.3342236,
  0.17950791,
  0.03536048,
  0.27614784,
  0.2051489,
  0.10869560000000002,
  0.266669,
  0.33471546,
  0.24255787,
  0.087647945,
  0.16672163,
  0.31040156,
  0.19066848,
  0.15910175,
  0.16651805,
  0.27452627,
  0.12763159,
  0.3932799,
  0.0030662715000000004,
  0.15766427,
  0.17603076,
  0.16045116,
  0.14470905,
  0.037425898,
  0.16326863,
  0.21468759,
  0.10761005,
  0.16074449,
  0.12940636,
  0.32159945,
  0.19454089,
  0.16014813,
  -0.02174962,
  0.2377656,
  0.33787647,
  0.26465067,
  0.16954285,
  0.36503136,
  0.19451526,
  0.2825617,
  0.30363214,
  0

In [14]:
# save
# One row per line, fields separated by single spaces, no trailing space.
# A join is used instead of a manual "is this the last element" check:
# comparing the enumerate index with len(item) can never match, which left a
# stray space at the end of every line.
# The encoding is pinned to UTF-8 because the ids contain non-ASCII deal names.
with open('result_data/embedding_w2v_1209.txt', 'w', encoding='utf-8') as f:
    for item in word2vec_fmt_1209:
        f.write(" ".join(map(str, item)))
        f.write("\n") # newline after each row

In [None]:
# name 저장
embedding_df.iloc[:50000,:2].to_csv('1203_title_elu.tsv', header=True, index=False, sep='\t')

In [None]:
# embedding 저장
embedding_df.iloc[:50000,2:].to_csv('1203_embedding_elu.tsv', header=False, index=False, sep='\t')

---