# Parse dataset into dict

In [90]:
def parse_txt(filename, oriented=True):
    '''
    Parse a whitespace-separated edge list into an adjacency dict (via sets).

    Every data line must hold exactly two integers: "parent child".
    Blank lines and lines whose first token starts with '#' are skipped.

    filename: path to the edge-list text file
    oriented: when False, every edge is stored in the reverse direction too

    Returns a dict mapping a vertex id to
        {'linked': set of neighbour ids, 'degree': int, 'marked': False}
    where 'degree' always equals len('linked'), even if rows are duplicated.
    In oriented mode a vertex that only ever appears as a child gets no entry.
    The neighbour sets make the result not JSON serializable.

    Raises ValueError if a data line does not consist of two integers.
    '''
    data = {}

    def add_edge(tail, head):
        # Store head as a neighbour of tail, counting each distinct edge once.
        record = data.get(tail)
        if record is None:
            data[tail] = {
                'linked': {head},
                'degree': 1,
                'marked': False
            }
        elif head not in record['linked']:
            # rows in data file can be duplicated: only new neighbours
            # may raise the degree
            record['linked'].add(head)
            record['degree'] += 1

    with open(filename) as handle:
        for line in handle:
            fields = line.split()

            # skip blank lines and comments
            if not fields or fields[0].startswith('#'):
                continue

            parent, child = fields
            parent = int(parent)
            child = int(child)

            add_edge(parent, child)

            # add reversed link on child vertex for not oriented graph
            if not oriented:
                add_edge(child, parent)

    return data


def parse(filename, oriented=True):
    '''
    Pick a parser by file extension.

    Only the 'txt' extension is handled (delegated to parse_txt);
    for any other file name the function returns None.
    '''
    extension = filename.rsplit('.', 1)[-1]
    if extension != 'txt':
        return None
    return parse_txt(filename, oriented)

In [1]:
def parse_txt(filename, oriented=True):
    '''
    Parse a whitespace-separated edge list into an adjacency dict (via lists).

    Every data line must hold exactly two integers: "parent child".
    Blank lines and lines whose first token starts with '#' are skipped.

    filename: path to the edge-list text file
    oriented: when False, every edge is stored in the reverse direction too

    Returns a dict mapping a vertex id to
        {'linked': list of neighbour ids (first-seen order, no duplicates),
         'distances': {}, 'degree': len('linked'), 'marked': False}
    In oriented mode a vertex that only ever appears as a child gets no entry.

    Raises ValueError if a data line does not consist of two integers.
    '''
    data = {}
    # (tail, head) pairs already stored; gives O(1) duplicate detection
    # instead of scanning the neighbour list for every row
    seen_edges = set()

    def add_edge(tail, head):
        # Append head to tail's neighbour list unless the edge is a duplicate.
        if (tail, head) in seen_edges:
            return
        seen_edges.add((tail, head))

        record = data.get(tail)
        if record is None:
            data[tail] = {
                'linked': [head],
                'distances': {},
                'degree': 1,
                'marked': False
            }
        else:
            record['linked'].append(head)
            record['degree'] += 1

    with open(filename) as handle:
        for line in handle:
            fields = line.split()

            # skip blank lines and comments
            if not fields or fields[0].startswith('#'):
                continue

            parent, child = fields
            parent = int(parent)
            child = int(child)

            add_edge(parent, child)

            # add reversed link on child vertex for not oriented graph
            if not oriented:
                add_edge(child, parent)

    return data
def parse(filename, oriented=True):
    '''
    Route a dataset file to the parser matching its extension.

    'txt' files are handed to parse_txt; every other name yields None.
    '''
    suffix = filename.rpartition('.')[2]
    return parse_txt(filename, oriented) if suffix == 'txt' else None

In [21]:
from collections import deque 

def count_distance(vertice, data):
    '''
    Count distances from the given vertice to all others in the connectivity
    component that vertice belongs to. Based on level-by-level BFS.

    vertice: index of source vertice
    data: dict with information about graph; every record needs a 'linked'
          iterable of neighbour ids

    For every reached vertice v the result is stored in place as
    data[v]['distances'][vertice] = number of edges from vertice to v
    (the 'distances' dict is created when a record lacks it).
    Neighbours that have no record in data (e.g. vertices that only appear
    as edge targets of an oriented graph) are skipped: there is nowhere to
    store their distance and they have no outgoing links to follow.

    Visited vertices are tracked in a local set, so the 'marked' flags in
    data are neither read nor written and cannot be left dirty by an error.

    Returns None. Raises KeyError if vertice itself has no record in data.
    '''
    visited = {vertice}
    level = [vertice]
    current_distance = 0

    while level:
        next_level = []

        for v in level:
            record = data[v]
            record.setdefault('distances', {})[vertice] = current_distance

            for neighbour in record['linked']:
                if neighbour in visited:
                    continue
                visited.add(neighbour)
                # only vertices with a record can hold a distance / be expanded
                if neighbour in data:
                    next_level.append(neighbour)

        level = next_level
        current_distance += 1

    
def count_distances(data):
    '''
    Run count_distance once from every vertice that has a record in data,
    so each record's 'distances' ends up filled for all sources.
    '''
    for vertice in data:
        count_distance(vertice, data)
        

def bfs(source, stock, data):
    '''
    Count distance from source to stock with a level-by-level BFS.

    source: index of the start vertice
    stock: index of the target vertice
    data: dict with information about graph; every record needs a 'linked'
          iterable of neighbour ids

    Returns the number of edges on a shortest path from source to stock,
    0 when source == stock, and -1 when stock cannot be reached.
    Neighbours without a record in data (e.g. vertices that only appear as
    edge targets of an oriented graph) are treated as having no outgoing
    links, so stock may be such a vertice.

    Visited vertices are tracked in a local set, so the 'marked' flags in
    data are neither read nor written and cannot be left dirty by an error.

    Raises KeyError if source has no record in data.
    '''
    if source not in data:
        raise KeyError(source)
    if source == stock:
        return 0

    visited = {source}
    level = [source]
    current_distance = 0

    while level:
        # everything discovered from this level is one edge further away
        current_distance += 1
        next_level = []

        for v in level:
            for neighbour in data[v]['linked']:
                if neighbour in visited:
                    continue
                if neighbour == stock:
                    return current_distance
                visited.add(neighbour)
                # vertices without a record cannot be expanded any further
                if neighbour in data:
                    next_level.append(neighbour)

        level = next_level

    return -1

In [2]:
# Load the edge list from test.txt as a not oriented graph
# (oriented=False makes the parser store every link in both directions).
data = parse('test.txt', oriented=False)

In [None]:
# Dump the whole parsed graph dict for inspection.
print(data)

# Select landmarks randomly

In [20]:
import random

# Share of the graph's vertices (in percent) to use as landmarks.
percent_of_landmarks = 10

data_items = data.items()
graph_size = len(data_items)
vertices = [vertex for vertex, _ in data_items]
number_of_landmarks = int(graph_size * (percent_of_landmarks / 100))

# Draw distinct positions first, then translate them back to vertex ids.
random_indexes = random.sample(range(graph_size), number_of_landmarks)
landmarks = [vertices[position] for position in random_indexes]

print(graph_size, number_of_landmarks, landmarks, sep='\n')

186
18
[125345, 63769, 50991, 45098, 95531, 62496, 37861, 69582, 276, 45009, 32432, 112243, 95860, 72233, 68320, 55528, 94235, 28787]


# Select landmarks using their degree

In [3]:
# Rank vertices from the highest 'degree' to the lowest.
data_sorted = sorted(
    data.items(),
    key=lambda item: item[1]['degree'],
    reverse=True,
)
graph_size = len(data_sorted)

# Share of the graph's vertices (in percent) to use as landmarks.
percent_of_landmarks = 10
number_of_landmarks = int(graph_size * (percent_of_landmarks / 100))

# The top-ranked vertices become the landmarks.
landmarks = [vertex for vertex, _ in data_sorted[:number_of_landmarks]]

print(graph_size, number_of_landmarks, landmarks, sep='\n')

186
18
[94138, 84424, 89308, 63225, 73543, 21937, 61571, 64124, 89994, 104802, 109016, 276, 1662, 5089, 6058, 6229, 10639, 16442]


# Select landmarks using their closeness centrality

In [23]:
import random

# Share of the graph's vertices (in percent) to draw as seeds.
percent_of_landmarks = 10

data_items = data.items()
graph_size = len(data_items)
vertices = [vertex for vertex, _ in data_items]
number_of_landmarks = int(graph_size * (percent_of_landmarks / 100))

# select random seeds from input graph to calculate their closeness centrality
random_indexes = random.sample(range(graph_size), number_of_landmarks)
seeds = [vertices[position] for position in random_indexes]

# Record, on every reached vertex, its BFS distance from each seed.
for seed in seeds:
    count_distance(seed, data)

print(data)

{84424: {'linked': [276, 1662, 5089, 6058, 6229, 10639, 16442, 19325, 19834, 20113, 21937, 25452, 26902, 29829, 30222, 32432, 33040, 39238, 39521, 41418, 45009, 45098, 45242, 47005, 47968, 47999, 49934, 50220, 50897, 51730, 53681, 57537, 58458, 59326, 61571, 63552, 64124, 64568, 66200, 69839, 72391, 73543, 76259, 77098, 77915, 78627, 83560, 85420, 88768, 89131, 89308, 89994, 90506, 91060, 92387, 93296, 94138, 94329, 95070, 95531, 96570, 97101, 98506, 99104, 104802, 106611, 107829, 109016, 112605, 117751, 122908, 124023, 125190, 130825, 132445], 'distances': {118958: 2, 19325: 1, 41418: 1, 11580: 2, 122003: 2, 21937: 1, 50991: 2, 47999: 1, 87237: 2, 59151: 2, 53890: 2, 78627: 1, 1662: 1, 15829: 2, 95531: 1, 59326: 1, 118859: 2, 132104: 2, 33777: 2, 96570: 1, 77915: 1, 92552: 2, 56117: 2, 26902: 1, 11050: 2, 120454: 2, 20113: 1, 57537: 1, 64568: 1, 2175: 2, 6058: 1}, 'degree': 75, 'marked': False}, 276: {'linked': [84424], 'distances': {118958: 3, 19325: 2, 41418: 2, 11580: 3, 122003: 3,

In [29]:
# Small hand-built graph with symmetric links, used to try out bfs().
# Every record starts with no stored distances and a cleared 'marked' flag.
d = {
    vertex: {'linked': neighbours, 'distances': {}, 'marked': False}
    for vertex, neighbours in {
        1: [2, 3, 4, 10],
        2: [1, 5, 6, 7],
        3: [1, 8],
        4: [1, 9, 10],
        5: [2],
        6: [2],
        7: [2],
        8: [3],
        9: [4],
        10: [4, 1],
    }.items()
}

# Distance between vertex 1 and vertex 3.
print(bfs(1, 3, d))

0
