In [1]:
"""
Prepares random walk from the training edges 
Returns 10 random walk per edge for length 10
"""
import argparse
import csv
import gzip
from typing import List
from ast import literal_eval
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from pprint import pprint
from tqdm import tqdm

In [2]:
import networkx
import scipy as sc

In [40]:
from typing import Any

In [44]:
import random
import pickle

In [3]:
read_path = '../data/interim/meta_Electronics_edges_train.edgelist'

In [4]:
graph = networkx.read_weighted_edgelist(read_path)

In [5]:
print(f'No. of edges = {graph.number_of_edges()}')
print(f'No. of nodes = {graph.number_of_nodes()}')

No. of edges = 2121160
No. of nodes = 464978


In [6]:
node_dict = {i : key for i, key in tqdm(enumerate(graph.nodes.keys()))}

464978it [00:00, 2498816.85it/s]


### Creating Transition Matrix

In [8]:
## creating transition matrix
adjacency_matrix = networkx.adj_matrix(graph)

Use `adjacency_matrix` instead

  adjacency_matrix = networkx.adj_matrix(graph)
  return adjacency_matrix(G, nodelist, dtype, weight)


In [10]:
print(f'Shape of the adjacency matrix = {adjacency_matrix.shape}')

Shape of the adjacency matrix = (464978, 464978)


In [11]:
degree_vector = sc.sparse.csr_matrix(1/np.sum(adjacency_matrix, axis = 0))

In [18]:
transition_matrix = adjacency_matrix.multiply(degree_vector).T

In [22]:
transition_dict = {}
rows, cols = transition_matrix.nonzero()

prev_row = -1
for row, col in tqdm(zip(rows, cols)):
    if row != prev_row:
        transition_dict.setdefault(row, {})
        transition_dict[row].setdefault('product', [])
        transition_dict[row].setdefault('probability', [])
    transition_dict[row]['product'].append(col)
    transition_dict[row]['probability'].append(transition_matrix[row, col])
    prev_row = row

4242320it [01:16, 55182.16it/s]


In [25]:
print(f'No. of keys in the transtion dict = {len(transition_dict)}')

No. of keys in the transtion dict = 464978


In [26]:
graph = None
transition_matrix = None

### Create the random walk 

In [33]:
n_nodes = len(node_dict)
samples_per_node = 10
sequence_len = 10

sample_array = np.zeros((n_nodes*samples_per_node, sequence_len), dtype = int)
print(f'Shape of the sample array = {sample_array.shape}') # 10 samples per node, random walk of length 10 per node

Shape of the sample array = (4649780, 10)


In [34]:
for node_idx in tqdm(range(n_nodes)):
    if node_idx % 100000 == 0:
        print(f'Getting samples for the node {node_idx}/{n_nodes}')
    for sample_idx in range(samples_per_node):
        node = node_idx
        
        for seq_idx in range(sequence_len):
            sample_array[node_idx*samples_per_node + sample_idx, seq_idx] = node
            node = random.choices(population=transition_dict[node]['product'],
                                  weights=transition_dict[node]['probability'], k = 1)[0]
        

  0%|                                                                                                                               | 232/464978 [00:00<06:39, 1164.19it/s]

Getting samples for the node 0/464978


 22%|██████████████████████████▋                                                                                                 | 100174/464978 [01:13<04:12, 1446.08it/s]

Getting samples for the node 100000/464978


 43%|█████████████████████████████████████████████████████▍                                                                      | 200199/464978 [02:21<02:52, 1533.50it/s]

Getting samples for the node 200000/464978


 65%|████████████████████████████████████████████████████████████████████████████████                                            | 300241/464978 [03:26<01:43, 1593.84it/s]

Getting samples for the node 300000/464978


 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 400260/464978 [04:29<00:39, 1628.45it/s]

Getting samples for the node 400000/464978


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464978/464978 [05:09<00:00, 1503.61it/s]


In [35]:
print(f'Shape of the random walk array - {sample_array.shape}')

Shape of the random walk array - (4649780, 10)


In [36]:
sample_array = np.vectorize(node_dict.get)(sample_array)

In [37]:
sample_array[0]

array(['b00f37z8q6', 'b01lzhryjs', 'b00hmbms50', 'b01bse454s',
       'b00xks3t9i', 'b01bse454s', 'b00f19q7yi', 'b00pq4acq2',
       'b00f19q7yi', 'b0156mfbi2'], dtype='<U10')

In [42]:
node_dict

{0: 'b00f37z8q6',
 1: 'b00priwjay',
 2: 'b005p9g7dk',
 3: 'b00fzpdg1k',
 4: 'b003sw13wq',
 5: 'b004yi8eso',
 6: 'b004rym2he',
 7: 'b00v7cbh6g',
 8: 'b00idan8h6',
 9: 'b009tnagda',
 10: 'b000bmsyby',
 11: 'b00hapuc88',
 12: 'b01g5w2i8s',
 13: 'b01kh81hcy',
 14: 'b00im3p8gs',
 15: 'b00rvh0wbw',
 16: 'b000efijta',
 17: 'b01cw4cems',
 18: 'b00x9m1cpi',
 19: 'b01enxvw2y',
 20: 'b00cwlshuk',
 21: 'b00jwue3es',
 22: 'b01b0c5lq4',
 23: 'b07cyqxwg9',
 24: 'b001a1poiq',
 25: 'b013js87ko',
 26: 'b00q7385jw',
 27: 'b009fuf6dm',
 28: 'b0017k6bdw',
 29: 'b013skibl8',
 30: 'b009xdyuba',
 31: 'b014z7ily0',
 32: 'b006tvqu6c',
 33: 'b01b72vxe6',
 34: 'b00shzycg8',
 35: 'b01gv9h1rs',
 36: 'b00t85ph2y',
 37: 'b00ia9lvoc',
 38: 'b00d5yk6mk',
 39: 'b01it0tdy6',
 40: 'b0178ytel6',
 41: 'b00l6c8pn0',
 42: 'b00e3d8lqu',
 43: 'b000dzry9c',
 44: 'b01d2hixpq',
 45: 'b01kaks5pg',
 46: 'b01hcwa576',
 47: 'b00nez6ow6',
 48: 'b00p9ftc6o',
 49: 'b01018db2e',
 50: 'b013g4egvu',
 51: 'b00c7rnoxo',
 52: 'b00w4zbfjo',
 53

In [46]:
def save_model(model: Any, model_path: str) -> None:
    """
    Saves model in gzip format

    Args:
        model: Model to be saved
        model_path: Path to save model to
        
    Returns:
        (None)
    """
    with gzip.open(model_path, "wb") as f:
        pickle.dump(model, f)

    print(f'Model saved to {model_path}')

In [48]:
graph_name = 'meta_Electronics'
save_model(node_dict, f"../data/processed/{graph_name}_node_dict.tar.gz")
node_dict = None

Model saved to ../data/processed/meta_Electronics_node_dict.tar.gz


In [49]:
save_model(transition_dict, f'../data/processed/{graph_name}_transition_dict.tar.rz')
transition_dict = None

Model saved to ../data/processed/meta_Electronics_transition_dict.tar.rz
