In [5]:
import os
import json
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
pip install dgl dglgo -f https://data.dgl.ai/wheels/repo.html

In [53]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from dgl import DGLGraph
import re
import scipy.sparse as sp
import dgl.function as fn

In [54]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
from numpy import random
import networkx as nx

In [55]:
ratings = []
with open(os.path.join('/content/gdrive/MyDrive/Colab Notebooks/인턴/ml-1m/', 'ratings.dat'), encoding='latin1') as f:
    for l in f:
        user_id, movie_id, rating, timestamp = [int(_) for _ in l.split('::')]
        ratings.append({
            'user_id': user_id,
            'movie_id': movie_id,
            'rating': rating,
            'timestamp': timestamp,
        })
ratings = pd.DataFrame(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [56]:
user_id = ratings['user_id'].unique()
movie_id = ratings['movie_id'].unique()

In [57]:
user_id_len = len(user_id)
movie_id_len = len(movie_id)
totalid = user_id_len + movie_id_len                   #total id 개수는 9746 

totalid, user_id_len, movie_id_len

(9746, 6040, 3706)

In [58]:
min(movie_id), max(movie_id)

(1, 3952)

In [59]:
user_nodeidx = {}   #key는 user_id/ value는 node id 
id = 0         #노드 id는 0부터 6039 (마지막에 id는 테이블에 넣지 않았으니)

for i in range(user_id_len):
  user_nodeidx[user_id[i]] = id
  id += 1

In [60]:
len(user_nodeidx), id

(6040, 6040)

In [61]:
movie_nodeidx = {}  #key는 movie_id/ value는 node id 
id = 0

for i in range(movie_id_len): # 노드 id 6040 ~ 9745는 무비거 
  movie_nodeidx[movie_id[i]]  = id
  id += 1

In [62]:
len(movie_nodeidx), id

(3706, 3706)

In [63]:
# 유저노드와 무비노드의 각각의 id 테이블 

In [64]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [65]:
ratings.iloc[0, 1]

1193

In [66]:
users = []  #user_id에 해당하는 노드 id를 넣음 
movies = []

for i in range(len(ratings.index)):
  user = ratings.iloc[i,0]
  users.append(user_nodeidx[user])

  movie = ratings.iloc[i, 1]
  movies.append(movie_nodeidx[movie])

In [67]:
edges_src = torch.from_numpy(np.array(users))
edges_dst = torch.from_numpy(np.array(movies))

In [68]:
#Bipartite가 아닌 일단 그냥 hetero graph

graph_data = {
   ('user_id', 'score', 'movie_id'): (edges_src,edges_dst),
   ('movie_id', 'scored-by', 'user_id'): (edges_dst,edges_src)
}
he_g = dgl.heterograph(graph_data)

In [69]:
print(he_g)

Graph(num_nodes={'movie_id': 3706, 'user_id': 6040},
      num_edges={('movie_id', 'scored-by', 'user_id'): 1000209, ('user_id', 'score', 'movie_id'): 1000209},
      metagraph=[('movie_id', 'user_id', 'scored-by'), ('user_id', 'movie_id', 'score')])


In [70]:
edges_feature = torch.from_numpy(ratings['rating'].to_numpy()) #엣지 속성 부여 
edges_feature

tensor([5, 3, 3,  ..., 5, 4, 4])

In [71]:
he_g.edges['score'].data['rating'] = edges_feature

In [72]:
print(he_g)

Graph(num_nodes={'movie_id': 3706, 'user_id': 6040},
      num_edges={('movie_id', 'scored-by', 'user_id'): 1000209, ('user_id', 'score', 'movie_id'): 1000209},
      metagraph=[('movie_id', 'user_id', 'scored-by'), ('user_id', 'movie_id', 'score')])


In [73]:
featsize = 1000
#np.random.randint(start, end, number)  -> start~end범위 사이의 숫자 number개 

In [74]:
user_featsize = he_g.num_nodes('user_id')
movie_featsize = he_g.num_nodes('movie_id')
user_featsize, movie_featsize

(6040, 3706)

In [75]:
user_feat = torch.randn(user_featsize, featsize)
user_feat.shape

torch.Size([6040, 1000])

In [76]:
movie_feat = torch.randn(movie_featsize, featsize)
movie_feat.shape

torch.Size([3706, 1000])

In [77]:
he_g.nodes['user_id'].data['feature'] = user_feat
he_g.nodes['movie_id'].data['feature'] = movie_feat

In [78]:
he_g

Graph(num_nodes={'movie_id': 3706, 'user_id': 6040},
      num_edges={('movie_id', 'scored-by', 'user_id'): 1000209, ('user_id', 'score', 'movie_id'): 1000209},
      metagraph=[('movie_id', 'user_id', 'scored-by'), ('user_id', 'movie_id', 'score')])

In [79]:
class HeteroDotProductPredictor(nn.Module):
    def forward(self, graph, h, etype):
        # h contains the node representations for each node type computed from
        # the GNN defined in the previous section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
            return graph.edges[etype].data['score']

In [80]:
def construct_negative_graph(graph, k, etype):
    utype, _, vtype = etype
    src, dst = graph.edges(etype=etype)
    neg_src = src.repeat_interleave(k)
    neg_dst = torch.randint(0, graph.num_nodes(vtype), (len(src) * k,))
    return dgl.heterograph(
        {etype: (neg_src, neg_dst)},
        num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

In [81]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = dgl.nn.HeteroGraphConv({
            rel: dgl.nn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dgl.nn.HeteroGraphConv({
            rel: dgl.nn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')

    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [82]:
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()
    def forward(self, g, neg_g, x, etype):
        h = self.sage(g, x)
        return self.pred(g, h, etype), self.pred(neg_g, h, etype)

In [83]:
def compute_loss(pos_score, neg_score):
    # Margin loss
    n_edges = pos_score.shape[0]
    return (1 - pos_score + neg_score.view(n_edges, -1)).clamp(min=0).mean()

k = 5
model = Model(featsize, 200, 100, he_g.etypes)
user_feats = he_g.nodes['user_id'].data['feature']
item_feats =he_g.nodes['movie_id'].data['feature']
node_features = {'user_id': user_feats, 'movie_id': item_feats}


opt = torch.optim.Adam(model.parameters())
for epoch in range(10):
    negative_graph = construct_negative_graph(he_g, k,  ('user_id', 'score', 'movie_id')) 
    pos_score, neg_score = model(he_g, negative_graph, node_features,  ('user_id', 'score', 'movie_id'))
    loss = compute_loss(pos_score, neg_score)
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(loss.item())



1.001032829284668
0.9103068709373474
0.8020084500312805
0.673152506351471
0.5630406737327576
0.51024329662323
0.513797402381897
0.5463196635246277
0.5770770907402039
0.5869935750961304


In [87]:
node_embeddings = model.sage(he_g, node_features)
node_embeddings

{'movie_id': tensor([[ 3.3609e-01,  2.1784e-01, -3.3297e-01,  ...,  2.1663e-01,
           4.9422e-01, -3.3710e-01],
         [ 1.8973e-01,  1.3951e-01, -1.6907e-01,  ...,  1.3156e-01,
           2.5287e-01, -2.0049e-01],
         [ 1.9714e-01,  1.0286e-01, -1.7757e-01,  ...,  1.4408e-01,
           2.7083e-01, -2.0726e-01],
         ...,
         [ 1.5565e-02,  7.3231e-03, -2.1240e-02,  ...,  1.9399e-02,
           1.8284e-02, -6.7852e-03],
         [ 1.1214e-02,  3.2417e-04, -2.5979e-03,  ...,  5.1333e-03,
           1.3954e-02, -6.9160e-03],
         [ 7.8178e-03,  1.1520e-02, -2.5206e-02,  ...,  6.6804e-03,
           8.6156e-03, -1.4852e-02]], grad_fn=<SumBackward1>),
 'user_id': tensor([[ 0.0833, -0.0152, -0.0747,  ...,  0.0684,  0.0907, -0.0764],
         [ 0.1186, -0.0162, -0.1223,  ...,  0.0916,  0.1414, -0.1085],
         [ 0.0790, -0.0133, -0.0932,  ...,  0.0658,  0.0908, -0.0738],
         ...,
         [ 0.0561, -0.0153, -0.0597,  ...,  0.0396,  0.0641, -0.0413],
         

In [88]:
node_embeddings['movie_id'].shape

torch.Size([3706, 100])

In [84]:
movie_nodeidx[3702], movie_nodeidx[3703], movie_nodeidx[3704], movie_nodeidx[1608], movie_nodeidx[1721], movie_nodeidx[111], movie_nodeidx[215]

(211, 664, 666, 895, 27, 486, 232)

In [90]:
# 벡터 유사도 측정 함수
def cos_sim(A, B):
  return dot(A, B)/(norm(A)*norm(B))

In [93]:
cos_sim(node_embeddings['movie_id'][664].detach().numpy() , node_embeddings['movie_id'][211].detach().numpy())

0.9998912

In [95]:
cos_sim(node_embeddings['movie_id'][664].detach().numpy(), node_embeddings['movie_id'][666].detach().numpy())

0.9996429

In [96]:
cos_sim(node_embeddings['movie_id'][664].detach().numpy(), node_embeddings['movie_id'][895].detach().numpy())

0.9978579

In [97]:
cos_sim(node_embeddings['movie_id'][664].detach().numpy(), node_embeddings['movie_id'][27].detach().numpy())

0.99756974

In [98]:
cos_sim(node_embeddings['movie_id'][664].detach().numpy(), node_embeddings['movie_id'][486].detach().numpy())

0.9983871

In [99]:
cos_sim(node_embeddings['movie_id'][664].detach().numpy(), node_embeddings['movie_id'][232].detach().numpy()) #비포선라이즈가 낮기는 한듯? 

0.99510753

In [92]:
cos_sim(node_embeddings['movie_id'][211].detach().numpy(), node_embeddings['user_id'][0].detach().numpy() ) #유저-영화가 구분이 되기한다. 아주 살짝. 

0.9105413

In [100]:
cos_sim(node_embeddings['movie_id'][211].detach().numpy(), node_embeddings['user_id'][100].detach().numpy() )

0.91069055