<a href="https://colab.research.google.com/github/BonanYang/git_Graph/blob/master/GNN_DeepWalk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import networkx as nx
import pandas as pd
import matplotlib as plt
import random
from tqdm import tqdm
import torch
from torch import nn as nn
import torch.optim as optim
import numpy as np
from gensim.models import Word2Vec

In [4]:
df = pd.read_csv('seealsology-data.tsv',sep='\t')
G = nx.from_pandas_edgelist(df)


In [5]:
def get_random_walk(node,length):
  random_walk = [node]
  for _ in range(length):
    temp = list(G.neighbors(node))
    temp = list(set(temp)-set(random_walk))
    if len(temp) == 0:
      break
    rn = random.choice(temp)
    random_walk.append(rn)
    node = rn
  return random_walk

get_random_walk('support-vector machine', 5)

['support-vector machine',
 'fisher kernel',
 'support vector machine',
 'kernel machines',
 'similarity learning',
 'kernel trick']

In [6]:
gamma = 10
walk_len= 5
data = []
n = list(G.nodes())

for i in tqdm(n):
  for _ in range(gamma):
    data.append(get_random_walk(i,walk_len))


100%|██████████| 8560/8560 [00:02<00:00, 2873.59it/s]


In [7]:
data[101]

['space mapping',
 'machine learning',
 'gene expression programming',
 'artificial intelligence',
 'informatics (academic field)',
 'robotics']

In [8]:
model = Word2Vec(vector_size=256,
                 window=4,
                 sg=1,
                 hs=0,
                 negative=10,
                 alpha=0.03,
                 min_alpha=0.0007,
                 seed=14
                )

In [9]:
model.build_vocab(data, progress_per=2)
model.train(data, total_examples=model.corpus_count, epochs=50, report_delay=1)

(17825702, 17829400)

In [6]:
class W2v:
  def __init__(self,lr=0.001,embedding_dim=100,window_size=2):
    self.window_size = window_size
    self.embedding_dim = embedding_dim
    self.lr =lr
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  def build_vocab(self,data):
    vocab = set()
    for i in data:
      vocab.update(i)
    vocab = list(vocab)
    self.word2idx = {j:i for i,j in enumerate(vocab)}
    self.idx2word = {i:j for i,j in enumerate(vocab)}
    self. vocab_size = len(vocab)
    return vocab

  def training_data(self,data):
    training_d = []
    for d in data:
      for i in range(self.window_size,len(d)-self.window_size):
        center_word = d[i]
        for j in range(i-self.window_size,i+self.window_size+1):
          if j!= i:
            context_word = d[j]
            training_d.append((d[i],d[j]))
    return training_d

  def build_model(self):
    class skipGram(nn.Module):
      def __init__(self,vocab_size, embedding_dim):
        super().__init__()
        self.center_embedding = nn.Embedding(vocab_size,embedding_dim)
        self.context_embedding = nn.Embedding(vocab_size,embedding_dim)

      def forward(self,center_words,context_words):
        center_embeds = self.center_embedding(center_words)
        context_embeds = self.context_embedding(context_words)
        score = torch.sum(center_embeds * context_embeds, dim=1)
        return score

    self.model = skipGram(self.vocab_size,self.embedding_dim).to(self.device)
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
    self.loss_function = nn.BCEWithLogitsLoss()

  def train(self,data,epochs=5,batch_size=10,negative_samples=5):
    self.build_vocab(data)
    t_data = self.training_data(data)
    self.build_model()
    self.model.train()
    for e in range(epochs):
      total_loss = 0
      np.random.shuffle(t_data)
      for j in range(0,len(t_data),batch_size):
        batch = t_data[j:j+batch_size]
        center_words = []
        context_words = []
        labels = []
        for cent,ctx in batch:
          cent_idx = self.word2idx[cent]
          ctx_idx = self.word2idx[ctx]
          center_words.append(cent_idx)
          context_words.append(ctx_idx)
          labels.append(1.0)

          for _ in range(negative_samples):
              neg_context = np.random.randint(0, self.vocab_size)
              center_words.append(cent_idx)
              context_words.append(neg_context)
              labels.append(0.0)

        center_words = torch.tensor(center_words, dtype=torch.long).to(self.device)
        context_words = torch.tensor(context_words, dtype=torch.long).to(self.device)
        labels = torch.tensor(labels, dtype=torch.float).to(self.device)
        self.optimizer.zero_grad()
        scores = self.model(center_words, context_words)
        loss = self.loss_function(scores, labels)
        loss.backward()
        self.optimizer.step()
        total_loss += loss.item()


      avg_loss = total_loss / (len(t_data) // batch_size)
      print(f'Epoch {e+1}/{epochs}, Loss: {avg_loss:.4f}')



  def get_word_emb():
    pass



In [7]:
model = W2v()
model.train(data)
# a,b,c = model.train(data)
# # model.build_vocab(data)
# # model.training_data(data)
# # model.build_model()
# a.shape,b.shape,c.shape
# a[2]


Epoch 1/5, Loss: 3.1828
Epoch 2/5, Loss: 1.7133
Epoch 3/5, Loss: 0.5726
Epoch 4/5, Loss: 0.2574
Epoch 5/5, Loss: 0.1501
