In [1]:
def parse_txt(filename, oriented=True):
    '''
    Parse data from txt file into dict python type.
    JSON serializable.
    '''
    data = {}
    with open(filename) as file:
        
        line = file.readline()
        while line:
            
            # skip comments
            if line[0] == '#':
                line = file.readline()
                continue
            
            parent, child = line.split()
            parent = int(parent)
            child = int(child)
            
            # rows in data file can be duplicated
            if parent in data:
                if child not in data[parent]['linked']:
                    data[parent]['linked'].append(child)
                    data[parent]['degree'] += 1
            else:
                data[parent] = { 
                    'linked': [child],
                    'distances': {},
                    'degree': 1,
                    'centrality': 0,
                    'marked': False,
                    'active': True
                }
                
            if not oriented:
                if child in data:
                    if parent not in data[child]['linked']:
                        data[child]['linked'].append(parent)
                        data[child]['degree'] += 1

                else:    
                    data[child] = {
                        'linked': [parent],
                        'distances': {},
                        'degree': 1,
                        'centrality': 0,
                        'marked': False,
                        'active': True
                    }

            line = file.readline()
            
    return data
def parse(filename, oriented=True):
    if filename.split('.')[-1] == 'txt':
        return parse_txt(filename, oriented)

In [2]:
from collections import deque 

def count_distance(vertice, data, h = -1, full=False):
    '''
    Counts distances form given vertice to all other in connectivity component that vertice belongs to.
    Also, of h parameter is provided, this method finds list of vertices which are h or less away from provided vertice.
    (As only distance from provided vertive becomes more then h method stops.)
    Based on BFS.
    vertice: index of source vertice
    data: dict with information about graph
    h: distance to closest vertices
    fill: complete BFS in spite of current distance > h
    '''
    current_distance = 0
    centrality = 0
    vertices_number = 1
    nearest_vertices = []
    d0 = deque()
    d1 = deque()
    
    source_active = data[vertice]['active']
    
    d0.append(vertice)
    data[vertice]['marked'] = True
    
    while True:
        
        if (not d0 and not d1) or (h != -1 and current_distance > h and not full):
            break
        
        if current_distance % 2 == 0:
            
            v = d0.pop()
                
            for i in data[v]['linked']:
                if not data[i]['marked']:
                    d1.append(i)
                    data[i]['marked'] = True
            
            data[v]['distances'][vertice] = current_distance
            vertices_number += 1
            centrality += current_distance
            
            if h != -1 and current_distance <= h:
                data[v]['active'] = False
                nearest_vertices.append(v)
                
            if not d0:
                current_distance += 1
                
        else:
            
            v = d1.pop()
                
            for i in data[v]['linked']:
                if not data[i]['marked']:
                    d0.append(i)
                    data[i]['marked'] = True
            
            data[v]['distances'][vertice] = current_distance
            vertices_number += 1
            centrality += current_distance
            if h != -1 and current_distance <= h:
                data[v]['active'] = False
                nearest_vertices.append(v)
            
            if not d1:
                current_distance += 1
    
    # rollback data
    for key, value in data.items():
        value['marked'] = False
    
    # set initial status
    data[vertice]['active'] = source_active
    
    if h == -1 or full:
        data[vertice]['centrality'] = centrality / vertices_number
    else:
        return nearest_vertices

    
def count_distances(data):
    """
    Counts distances between all nodes in graph.
    """
    for key, value in data.items():
        count_distance(key, data)
        

def bfs(source, stock, data):
    '''
    Count distance from source to stock without using landmarks.
    '''
    current_distance = 0
    d0 = deque()
    d1 = deque()
    
    d0.append(source)
    data[source]['marked'] = True
    
    while True:
        
        if not d0 and not d1:
            current_distance = -1
            break
        
        if current_distance % 2 == 0:
            
            v = d0.pop()
            
            if v == stock:
                break
                
            for i in data[v]['linked']:                
                if not data[i]['marked']:
                    d1.append(i)
                    data[i]['marked'] = True
            
            if not d0:
                current_distance += 1
                
        else:
            
            v = d1.pop()
            
            if v == stock:
                break
                
            for i in data[v]['linked']:
                if not data[i]['marked']:
                    d0.append(i)
                    data[i]['marked'] = True
            
            if not d1:
                current_distance += 1 
        
    for key, value in data.items():
        value['marked'] = False
        
    return current_distance

In [16]:
data = parse('test.txt', oriented=False)

In [11]:
print(data)

{84424: {'linked': [276, 1662, 5089, 6058, 6229, 10639, 16442, 19325, 19834, 20113, 21937, 25452, 26902, 29829, 30222, 32432, 33040, 39238, 39521, 41418, 45009, 45098, 45242, 47005, 47968, 47999, 49934, 50220, 50897, 51730, 53681, 57537, 58458, 59326, 61571, 63552, 64124, 64568, 66200, 69839, 72391, 73543, 76259, 77098, 77915, 78627, 83560, 85420, 88768, 89131, 89308, 89994, 90506, 91060, 92387, 93296, 94138, 94329, 95070, 95531, 96570, 97101, 98506, 99104, 104802, 106611, 107829, 109016, 112605, 117751, 122908, 124023, 125190, 130825, 132445], 'distances': {}, 'degree': 75, 'centrality': 0, 'marked': False, 'active': True}, 276: {'linked': [84424], 'distances': {}, 'degree': 1, 'centrality': 0, 'marked': False, 'active': True}, 1662: {'linked': [84424], 'distances': {}, 'degree': 1, 'centrality': 0, 'marked': False, 'active': True}, 5089: {'linked': [84424], 'distances': {}, 'degree': 1, 'centrality': 0, 'marked': False, 'active': True}, 6058: {'linked': [84424], 'distances': {}, 'deg

In [17]:
import random

def select_landmarks(data: dict, number_of_landmarks = 0.1, ranking: str = 'degree', h: int = 1):
    """
    Select landmarks using constratined strategy with provided h and ranking parameters.
    Set h to -1 to get top 'number_of_landmarks' ranked vertices.
    Possible ranking: degree, random, closeness.
    """
    
    data_items = data.items()
    graph_size = len(data_items)
    print('Graph size: ' + str(graph_size))
    
    number_of_landmarks = int(
        graph_size * (number_of_landmarks / 100)
    ) if number_of_landmarks >= 1 else int(
        graph_size * number_of_landmarks
    )
    
    landmarks = []
    
    if ranking == 'degree':
        data_sorted = sorted(data_items, key=lambda x: x[1]['degree'], reverse=True)
        
        if h == -1:
            return [i[0] for i in data_sorted[:number_of_landmarks]]
        
        while len(landmarks) < number_of_landmarks and data_sorted:
            v = data_sorted.pop(0)[0]
            
            # check if 'v' is less than 'h' away from some landmark
            if not data[v]['active']:
                continue
                
            landmarks.append(v)
            count_distance(vertice=v, data=data, h=h)
            
    
    vertices = [i[0] for i in data_items]
    if h == -1:
        return [vertices[i] for i in random.sample(range(0,graph_size), number_of_landmarks)]
    
    
    if ranking == 'random':
        
        random.shuffle(vertices)
        
        while len(landmarks) < number_of_landmarks and vertices:
            v = vertices.pop(0)
            
            if not data[v]['active']:
                continue
            
            landmarks.append(v)
            count_distance(vertice=v, data=data, h=h)
    
    if ranking == 'closeness':

        number_of_seeds = int(
            number_of_landmarks + (graph_size - number_of_landmarks) / 2
        ) if number_of_landmarks >= (graph_size / 2) else number_of_landmarks * 2
        print('Number of seeds: ' + str(number_of_seeds))
        
        random_indexes = random.sample(range(0,graph_size), number_of_seeds)
        seeds = [vertices[i] for i in random_indexes]
            
        for seed in seeds:
            count_distance(vertice=seed, data=data, h=h, full=True)

        seeds_sorted = [
            i[0] for i in sorted(
                [(j[0], j[1]['centrality']) for j in data.items() if j[0] in seeds],
                key= lambda x: x[1],
                reverse=True
            )
        ]
        
        landmarks.append(seeds_sorted.pop(0))
        print(seeds_sorted)
        print('======================')
        print([data[i]['active'] for i in seeds_sorted])
        print('======================')
        print([(key, value['active']) for key, value in data.items()])
        while len(landmarks) < number_of_landmarks:
            
            if not seeds_sorted:
                for key, value in data.items():
                    if len(landmarks) >= number_of_landmarks:
                        break
                    if value['active'] and key not in landmarks:
                        count_distance(vertice=key, data=data, h=h, full=True)
                        landmarks.append(key)
                break
            
            v = seeds_sorted.pop(0)
            print(str(v) + ': ' + str(data[v]['active']))
            if not data[v]['active']:
                continue
            
            landmarks.append(v) 
            
            
                    
        
    # roll back data
#     for key, value in data.items():
#         value['active'] = True
                
    return landmarks    

In [18]:
landmarks = select_landmarks(data, 0.05, 'closeness')
print(landmarks)

Graph size: 438
Number of seeds: 42
[59444, 106833, 15406, 31334, 38861, 69935, 93661, 102617, 106954, 60310, 125345, 4893, 31235, 11050, 35729, 42512, 42994, 44011, 50991, 52870, 65114, 68639, 115344, 19325, 50897, 53681, 89131, 93296, 107829, 130825, 30422, 39696, 69493, 76706, 77019, 113726, 122495, 125226, 21937, 26713, 16400]
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[(84424, False), (276, True), (1662, True), (5089, True), (6058, True), (6229, True), (10639, True), (16442, True), (19325, False), (19834, True), (20113, True), (21937, False), (25452, True), (26902, True), (29829, True), (30222, True), (32432, True), (33040, True), (39238, True), (39521, True), (41418, True), (45009, True), (45098, True), (45242, True), (47005, True), (47968

In [14]:
print([(key, value['active']) for key, value in data.items() if key in landmarks])

[(276, False), (5089, False), (6058, False), (6229, False), (10639, False), (19325, False), (19834, False), (21937, False), (25452, False), (26902, False), (29829, False), (30222, False), (32432, False), (33040, False), (39238, False), (39521, False), (41418, False), (45009, False), (45098, False), (45242, False), (51631, False)]


In [20]:
import random

data_items = data.items()
vertices = [i[0] for i in data_items]

graph_size = len(data_items)
percent_of_landmarks = 10
number_of_landmarks = int(graph_size * (percent_of_landmarks / 100))

random_indexes = random.sample(range(0,graph_size), number_of_landmarks)
landmarks = [vertices[i] for i in random_indexes]

print(graph_size)
print(number_of_landmarks)
print(landmarks)

186
18
[125345, 63769, 50991, 45098, 95531, 62496, 37861, 69582, 276, 45009, 32432, 112243, 95860, 72233, 68320, 55528, 94235, 28787]


# Select landmarks using their degree

In [3]:
data_sorted = sorted(data.items(), key=lambda x: x[1]['degree'], reverse=True)
graph_size = len(data_sorted)

percent_of_landmarks = 10
number_of_landmarks = int(graph_size * (percent_of_landmarks / 100))

landmarks = [i[0] for i in data_sorted[:number_of_landmarks]]

print(graph_size)
print(number_of_landmarks)
print(landmarks)
# print(data_sorted)

186
18
[94138, 84424, 89308, 63225, 73543, 21937, 61571, 64124, 89994, 104802, 109016, 276, 1662, 5089, 6058, 6229, 10639, 16442]


# Select landmarks using their closeness centrality

In [5]:
import random

data_items = data.items()
vertices = [i[0] for i in data_items]

graph_size = len(data_items)

percent_of_seeds = 20
number_of_seeds = int(graph_size * (percent_of_seeds / 100))
print('Number of seeds: ' + str(number_of_seeds))

percent_of_landmarks = 10
assert percent_of_landmarks <= percent_of_seeds
number_of_landmarks = int(graph_size * (percent_of_landmarks / 100))
print('Number of landmarks: ' + str(number_of_landmarks))

# select random seeds from input graph to calculate their closeness centrality
random_indexes = random.sample(range(0,graph_size), number_of_seeds)
seeds = [vertices[i] for i in random_indexes]

for seed in seeds:
    count_distance(seed, data)

landmarks = [
    i[0] for i in sorted(
        [(j[0], j[1]['centrality']) for j in data.items() if j[0] in seeds],
        key= lambda x: x[1],
        reverse=True
    )[:number_of_landmarks]
]
print('Total number of landmarks: ' + str(len(landmarks)))
print(landmarks)
# for l in landmarks:
#     print(str(l) + ":")
#     print(data[l])

Number of seeds: 37
Number of landmarks: 18
Total number of landmarks: 18
[68320, 92552, 113138, 125345, 10639, 30222, 39238, 45242, 58458, 132445, 44011, 56117, 61472, 70618, 77580, 120454, 109016, 23579]


In [15]:
print(data)

{84424: {'linked': [276, 1662, 5089, 6058, 6229, 10639, 16442, 19325, 19834, 20113, 21937, 25452, 26902, 29829, 30222, 32432, 33040, 39238, 39521, 41418, 45009, 45098, 45242, 47005, 47968, 47999, 49934, 50220, 50897, 51730, 53681, 57537, 58458, 59326, 61571, 63552, 64124, 64568, 66200, 69839, 72391, 73543, 76259, 77098, 77915, 78627, 83560, 85420, 88768, 89131, 89308, 89994, 90506, 91060, 92387, 93296, 94138, 94329, 95070, 95531, 96570, 97101, 98506, 99104, 104802, 106611, 107829, 109016, 112605, 117751, 122908, 124023, 125190, 130825, 132445], 'distances': {39238: 1, 77580: 2, 61472: 2, 45242: 1, 132445: 1, 44011: 2, 125345: 2, 120454: 2, 70618: 2, 58458: 1, 92552: 2, 10639: 1, 68320: 2, 30222: 1, 113138: 2, 109016: 1, 56117: 2}, 'degree': 75, 'marked': False, 'centrality': 0}, 276: {'linked': [84424], 'distances': {39238: 2, 77580: 3, 61472: 3, 45242: 2, 132445: 2, 44011: 3, 125345: 3, 120454: 3, 70618: 3, 58458: 2, 92552: 3, 10639: 2, 68320: 3, 30222: 2, 113138: 3, 109016: 2, 56117:

In [27]:
print(data[84424]['distances'])
print(data[44011]['distances'])

{39238: 1, 77580: 2, 61472: 2, 45242: 1, 132445: 1, 44011: 2, 125345: 2, 120454: 2, 70618: 2, 58458: 1, 92552: 2, 10639: 1, 68320: 2, 30222: 1, 113138: 2, 109016: 1, 56117: 2}
{39238: 3, 77580: 2, 61472: 2, 45242: 3, 132445: 3, 44011: 0, 125345: 3, 120454: 2, 70618: 2, 58458: 3, 92552: 3, 10639: 3, 68320: 3, 30222: 3, 113138: 3, 109016: 2, 56117: 2}


In [25]:
def shortest_path(source, stock, landmarks, data):
    """
    Counts distance from source to stock using landmarks.
    For distance estimation geometric mean is used.
    """
    source_distances = data[source]['distances']
    stock_distances = data[stock]['distances']
    
    L = -1
    U = 3 * graph_size
    for key, value in source_distances.items():
        
        temp = stock_distances.get(key, -1)
        # cheks if there are distance from stock to 'key' landmark
        if temp == -1:
            continue
        
        l = abs(value - temp)
        u = value + temp
        
        if l > L:
            L = l
        if u < U:
            U = u
            
    if L == -1 and U == 3 * graph_size:
        # this mean that source and stock are in different connectivity components
        return -1
            
    return (L * U) ** 0.5

In [29]:
print(shortest_path(84424, 276, landmarks, data))

2.0


In [28]:
print(bfs(84424, 276, data))

2


In [None]:
d = {
    1: {
        'linked': [2,3,4,10],
        'distances':{},
        'marked': False
    },
    2: {
        'linked': [1,5,6,7],
        'distances':{},
        'marked': False
    },
    3: {
        'linked': [1,8],
        'distances':{},
        'marked': False
    },
    4: {
        'linked': [1,9,10],
        'distances':{},
        'marked': False
    },
    5: {
        'linked': [2],
        'distances':{},
        'marked': False
    },
    6: {
        'linked': [2],
        'distances':{},
        'marked': False
    },
    7: {
        'linked': [2],
        'distances':{},
        'marked': False
    },
    8: {
        'linked': [3],
        'distances':{},
        'marked': False
    },
    9: {
        'linked': [4],
        'distances':{},
        'marked': False
    },
    10: {
        'linked': [4,1],
        'distances':{},
        'marked': False
    }
}

print(bfs(1, 3, d))