In [6]:
import csv

def parse_txt(filename, oriented=True):
    """
    Parse a whitespace-separated edge list into an adjacency dictionary.

    Every data line must hold exactly two integer ids, ``parent child``.
    Lines that are empty (or whitespace only) and lines whose first
    non-blank character is ``#`` are skipped. Duplicate edges are stored once.

    Parameters:
        filename: path of the text file to read.
        oriented: when True the edge is stored on the parent only; when
            False the parent is also added to the child's 'linked' list.

    Returns:
        dict mapping each vertex id to a record with the keys 'linked',
        'distances', 'degree', 'centrality', 'marked' and 'active'.
        The structure is built only from dicts, lists, ints and bools.

    Raises:
        ValueError: if a data line does not consist of exactly two integers.
    """
    def _new_node(linked):
        # Fresh record for a vertex seen for the first time.
        # NOTE(review): 'degree' starts at 1 even when `linked` is empty
        # (oriented mode, vertex first seen as a child) -- kept as in the
        # original; confirm this is the intended meaning of 'degree'.
        return {
            'linked': linked,
            'distances': {},
            'degree': 1,
            'centrality': 0,
            'marked': False,
            'active': True
        }

    data = {}
    with open(filename) as file:
        for raw_line in file:
            line = raw_line.strip()

            # skip blank lines and comments
            if not line or line[0] == '#':
                continue

            parent, child = line.split()
            parent = int(parent)
            child = int(child)

            # rows in data file can be duplicated
            if parent not in data:
                data[parent] = _new_node([child])
            elif child not in data[parent]['linked']:
                data[parent]['linked'].append(child)
                data[parent]['degree'] += 1

            if child not in data:
                # the child always becomes a vertex; it only points back
                # to the parent when the graph is not oriented
                data[child] = _new_node([] if oriented else [parent])
            elif not oriented and parent not in data[child]['linked']:
                data[child]['linked'].append(parent)
                data[child]['degree'] += 1

    return data

def parse_csv(filename, oriented=True):
    """
    Parse a CSV edge list into an adjacency dictionary.

    The first row is treated as a header and skipped. In every following
    row the first two columns are read as integer ids ``parent, child``;
    any further columns are ignored. Empty rows are skipped and duplicate
    edges are stored once.

    Parameters:
        filename: path of the CSV file to read.
        oriented: when True the edge is stored on the parent only; when
            False the parent is also added to the child's 'linked' list.

    Returns:
        dict mapping each vertex id to a record with the keys 'linked',
        'distances', 'degree', 'centrality', 'marked' and 'active'.
        An empty file yields an empty dict.

    Raises:
        ValueError: if one of the first two columns is not an integer.
        IndexError: if a non-empty row has fewer than two columns.
    """
    def _new_node(linked):
        # Fresh record for a vertex seen for the first time.
        # NOTE(review): 'degree' starts at 1 even when `linked` is empty
        # (oriented mode, vertex first seen as a child) -- kept as in the
        # original; confirm this is the intended meaning of 'degree'.
        return {
            'linked': linked,
            'distances': {},
            'degree': 1,
            'centrality': 0,
            'marked': False,
            'active': True
        }

    data = {}

    # newline='' lets the csv module handle line endings itself
    with open(filename, newline='') as file:
        reader = csv.reader(file)
        # skip the header row; the default avoids StopIteration on an empty file
        next(reader, None)

        for row in reader:
            # csv.reader yields [] for blank lines
            if not row:
                continue

            parent = int(row[0])
            child = int(row[1])

            if parent not in data:
                data[parent] = _new_node([child])
            elif child not in data[parent]['linked']:
                data[parent]['linked'].append(child)
                data[parent]['degree'] += 1

            if child not in data:
                # the child always becomes a vertex; it only points back
                # to the parent when the graph is not oriented
                data[child] = _new_node([] if oriented else [parent])
            elif not oriented and parent not in data[child]['linked']:
                data[child]['linked'].append(parent)
                data[child]['degree'] += 1

    return data

def parse(filename, oriented=True):
    """
    Parse an edge-list file, choosing the parser from the file extension.

    Files ending in '.txt' go to parse_txt and files ending in '.csv' go to
    parse_csv; the extension is matched case-insensitively.

    Parameters:
        filename: path of the file to read.
        oriented: forwarded unchanged to the selected parser.

    Returns:
        The adjacency dictionary produced by the selected parser.

    Raises:
        ValueError: if the extension is neither '.txt' nor '.csv'
            (previously this case silently returned None).
    """
    lowered = str(filename).lower()
    if lowered.endswith('.txt'):
        return parse_txt(filename, oriented)
    if lowered.endswith('.csv'):
        return parse_csv(filename, oriented)
    raise ValueError(
        f"Unsupported file type for {filename!r}: expected a .txt or .csv file"
    )

In [56]:
# Input edge-list file; parse() picks the parser from its extension.
FILENAME = 'test-google.txt'
# Passed to parse() as `oriented`; False stores every edge on both endpoints.
ORIENTED = False

In [57]:
# Build the adjacency dictionary (vertex id -> node record) from FILENAME.
data = parse(FILENAME, ORIENTED)
# NOTE(review): this prints every node record, so the output grows with the graph.
print(data)

{0: {'linked': [11342, 824020, 867923, 891835], 'distances': {}, 'degree': 4, 'centrality': 0, 'marked': False, 'active': True}, 11342: {'linked': [0, 27469, 38716, 309564, 322178, 387543, 427436, 538214, 638706, 645018, 835220, 856657, 867923, 891835], 'distances': {}, 'degree': 14, 'centrality': 0, 'marked': False, 'active': True}, 824020: {'linked': [0, 91807, 322178, 387543, 417728, 438493, 500627, 535748, 695578, 867923, 891835], 'distances': {}, 'degree': 11, 'centrality': 0, 'marked': False, 'active': True}, 867923: {'linked': [0, 11342, 824020, 136593, 414038, 500627, 523684, 760842, 815602, 835220, 846213, 857527, 891835], 'distances': {}, 'degree': 13, 'centrality': 0, 'marked': False, 'active': True}, 891835: {'linked': [0, 11342, 824020, 867923, 112028, 235849, 302284, 417728, 451592, 693969, 857527], 'distances': {}, 'degree': 11, 'centrality': 0, 'marked': False, 'active': True}, 27469: {'linked': [11342], 'distances': {}, 'degree': 1, 'centrality': 0, 'marked': False, 'a

In [58]:
def count_vertices(graph):
    """Return the number of vertices, i.e. the number of top-level keys in `graph`."""
    vertex_total = len(graph)
    return vertex_total

def count_edges(graph):
    """
    Return half of the sum of all 'degree' values in `graph`.

    The result is always a float because true division is used.
    """
    degree_total = sum(record['degree'] for record in graph.values())
    return degree_total / 2

# Basic size statistics of the parsed graph.
vertices = count_vertices(data)
edges = count_edges(data)
# A simple undirected graph on n vertices has at most n * (n - 1) / 2 edges.
complete_graph_edges = vertices * (vertices - 1) / 2

print(f'Number of vertices in {FILENAME}: {vertices}')
print(f'Number of edges in {FILENAME}: {edges}')
print(f'Number of edges in complete graph: {complete_graph_edges}')
# Density = edges present / edges of the complete graph.
# NOTE(review): this divides by zero when the graph has fewer than two vertices.
print(f'Density: {edges / complete_graph_edges}')

Number of vertices in test-google.txt: 4183
Number of edges in test-google.txt: 8032.0
Number of edges in complete graph: 8746653.0
Density: 0.0009182941177613883


In [59]:
def dfs(graph, start, visited=None):
    """
    Collect every vertex reachable from `start` by following 'linked' lists.

    The traversal is iterative (explicit stack), so long paths cannot hit
    Python's recursion limit.

    Parameters:
        graph: dict mapping a vertex to a record with a 'linked' list.
        start: vertex to start from; it must be a key of `graph`.
        visited: optional set of vertices to treat as already seen. It is
            updated in place; a new set is created when omitted.

    Returns:
        The `visited` set, now also containing `start` and every vertex
        reached from it.

    Raises:
        KeyError: if a vertex that gets expanded is not a key of `graph`.
    """
    if visited is None:
        visited = set()
    visited.add(start)

    pending = [start]
    while pending:
        vertex = pending.pop()
        for neighbour in graph[vertex]["linked"]:
            # mark on push so each vertex is expanded at most once
            if neighbour not in visited:
                visited.add(neighbour)
                pending.append(neighbour)
    return visited

def get_weak_connectivity_components(graph):
    """
    Split the graph into its weakly connected components.

    Edge direction is ignored: two vertices belong to the same component
    when they are joined by a chain of 'linked' entries followed in either
    direction. Each vertex is expanded once, instead of running a full
    traversal from every vertex and de-duplicating the results.

    Parameters:
        graph: dict mapping a vertex to a record with a 'linked' list.

    Returns:
        list of sets of vertices, one set per component, ordered by the
        first vertex of each component in the iteration order of `graph`.
    """
    # Symmetric adjacency so that direction does not matter.
    undirected = {vertex: set(record["linked"]) for vertex, record in graph.items()}
    for vertex, record in graph.items():
        for neighbour in record["linked"]:
            undirected.setdefault(neighbour, set()).add(vertex)

    weak_connectivity_components = []
    assigned = set()
    for vertex in graph:
        if vertex in assigned:
            continue

        # iterative flood fill from the first unassigned vertex
        component = {vertex}
        pending = [vertex]
        while pending:
            current = pending.pop()
            for neighbour in undirected[current]:
                if neighbour not in component:
                    component.add(neighbour)
                    pending.append(neighbour)

        assigned |= component
        weak_connectivity_components.append(component)
    return weak_connectivity_components

def get_max_weak_connectivity_component_size(components):
    """Return the size of the largest component, or 0 when `components` is empty."""
    return max((len(component) for component in components), default=0)

# Weak-connectivity statistics for the parsed graph.
weak_connectivity_components = get_weak_connectivity_components(data)
max_weak_connectivity_component_size = get_max_weak_connectivity_component_size(weak_connectivity_components)
print(f'Number of weak connectivity components in {FILENAME}: {len(weak_connectivity_components)}')
# `vertices` comes from the statistics cell above.
# NOTE(review): this divides by zero when the graph is empty.
print(f'Proportion of vertices in max weak connectivity component: {max_weak_connectivity_component_size/vertices}')

Number of weak connectivity components in test-google.txt: 59
Proportion of vertices in max weak connectivity component: 0.3385130289266077


In [None]:
def find_path(graph, start, end, path=None):
    """
    Find a simple path from `start` to `end` with a depth-first search.

    Parameters:
        graph: dict mapping a vertex either to a node record holding a
            'linked' list (the format produced by the parsers in this
            notebook) or directly to an iterable of neighbours.
        start: vertex to start from.
        end: vertex to reach.
        path: vertices already on the current path; used by the recursive
            calls and normally left at its default.

    Returns:
        list of vertices from `start` to `end` (inclusive), or None when
        `start` is not in `graph` or no path exists.
    """
    path = ([] if path is None else path) + [start]
    if start == end:
        return path
    if start not in graph:
        return None

    record = graph[start]
    # Node records keep their neighbours under 'linked'; iterating the record
    # itself would walk over its field names instead of over vertices.
    if isinstance(record, dict) and 'linked' in record:
        neighbours = record['linked']
    else:
        neighbours = record

    for node in neighbours:
        if node not in path:
            newpath = find_path(graph, node, end, path)
            if newpath:
                return newpath
    return None
    
def connect(x1, x2):
    """
    Return True when neither argument is None, otherwise False.

    Identity comparison is used so that objects with a custom equality
    operator cannot be mistaken for None.
    """
    return x1 is not None and x2 is not None

def Kosarai(graph):
    """
    Compute the strongly connected components with Kosaraju's algorithm.

    The previous version looked vertices up as str(1)..str(len(graph)),
    which never matches the integer ids produced by the parsers, and it
    returned paths rather than components.

    Both passes are iterative, so large graphs cannot hit Python's
    recursion limit.

    Parameters:
        graph: dict mapping a vertex either to a node record holding a
            'linked' list or directly to an iterable of successors.

    Returns:
        list of lists of vertices, one inner list per strongly connected
        component (single-vertex components included).
    """
    def successors(vertex):
        # Vertices that are only referenced, never defined, have no successors.
        record = graph.get(vertex, ())
        if isinstance(record, dict) and 'linked' in record:
            return record['linked']
        return record

    # Pass 1: depth-first search recording vertices in order of completion.
    visited = set()
    finish_order = []
    for root in graph:
        if root in visited:
            continue
        visited.add(root)
        stack = [(root, iter(successors(root)))]
        while stack:
            vertex, remaining = stack[-1]
            for successor in remaining:
                if successor not in visited:
                    visited.add(successor)
                    stack.append((successor, iter(successors(successor))))
                    break
            else:
                # every successor handled: the vertex is finished
                stack.pop()
                finish_order.append(vertex)

    # Reverse every edge.
    predecessors = {vertex: [] for vertex in visited}
    for vertex in visited:
        for successor in successors(vertex):
            predecessors[successor].append(vertex)

    # Pass 2: search the reversed graph in decreasing completion order;
    # each search tree is one strongly connected component.
    assigned = set()
    graphs_list = []
    for root in reversed(finish_order):
        if root in assigned:
            continue
        assigned.add(root)
        component = [root]
        pending = [root]
        while pending:
            vertex = pending.pop()
            for predecessor in predecessors[vertex]:
                if predecessor not in assigned:
                    assigned.add(predecessor)
                    component.append(predecessor)
                    pending.append(predecessor)
        graphs_list.append(component)
    return graphs_list

# Run Kosarai on the parsed graph; `data` is the dict built by parse() in an earlier cell.
strong_connectivity_components = Kosarai(data)