In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import itertools

In [3]:
os.path.exists("outputacm.txt")
 
test = open(r"outputacm.txt", 'r',  encoding="utf8") 
test.readlines()[0:100]

['629814\n',
 '#*Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)\n',
 '#@Hoon Hong,Dongming Wang\n',
 '#t2006\n',
 '#c\n',
 '#index0\n',
 '\n',
 '#*A+ Certification Core Hardware (Text & Lab Manual)\n',
 '#@Charles J. Brooks\n',
 '#t2003\n',
 '#c\n',
 '#index1\n',
 '\n',
 '#*Performance engineering in industry: current practices and adoption challenges\n',
 '#@Ahmed E. Hassan,Parminder Flora\n',
 '#t2007\n',
 '#cProceedings of the 6th international workshop on Software and performance\n',
 '#index2\n',
 '#!This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling research by large industrial software systems.\n',
 '\n',
 '#*Dude, You Can Do It!

In [4]:
G = nx.DiGraph()

In [6]:
def create_graph(reference):
    G.add_node(reference['index']) # add node
    G.add_node(reference['index'], title=reference['title']) # add title attribute to node
    G.add_node(reference['index'], journal=reference['journal']) # add journal attribute to node
    for citation in reference['citations']:
        G.add_edge(reference['index'], citation) # add edge to node

In [9]:
def parse_data():
    with open(r'outputacm.txt', 'r',  encoding="utf8") as f:
        reference={}
        citations=[]
        readFile = f.readlines()
        for line in readFile:
            
            if '#*' in line: # article title
                if (bool(reference)):
                    reference['citations'] = citations
                    citations=[]
                    create_graph(reference)
                    try:
                        yield reference
                    except IndexError as e:
                        continue
                    reference={}
                reference['title'] = line[2:].rstrip()
            elif '#@' in line: # authors
                reference['author'] = line[2:].rstrip().rsplit(";")
            elif '#t' in line: # year published
                reference['year'] = line[2:].rstrip()
            elif '#c' in line: # journal
                reference['journal'] = line[2:].rstrip()
            elif '#index' in line: # index
                reference['index'] = line[6:].rstrip()
            elif '#%' in line: # id of cited paper
                citations.append(line[2:].rstrip())
            elif '#!' in line: # abstract
                reference['abstract'] = line[2:].rstrip()
        return reference

In [10]:
data = pd.DataFrame(parse_data(), columns =('index', 'title', 'author',
                                            'year', 'journal', 'citations', 'abstract'))

In [11]:
print(G.number_of_nodes(), 'nodes')
print(G.size(), 'edges')

629813 nodes
632751 edges


In [12]:
data_top = data.head() 
data_top  

Unnamed: 0,index,title,author,year,journal,citations,abstract
0,0,Automated Deduction in Geometry: 5th Internati...,"[Hoon Hong,Dongming Wang]",2006,,[],
1,1,A+ Certification Core Hardware (Text & Lab Man...,[Charles J. Brooks],2003,,[],
2,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
3,3,"Dude, You Can Do It! How to Build a Sweeet PC","[Darrel Creacy,Carlito Vicencio]",2005,,[],Whether you're frustrated with current PC offe...
4,4,What Every Programmer Needs to Know about Secu...,"[Neil Daswani,Anita Kesavan]",2006,,[],


In [9]:
G.is_directed()

True

In [10]:
data.shape

(629813, 7)

In [28]:
data.dropna(subset=['abstract'])

Unnamed: 0,index,title,author,year,journal,citations,abstract
2,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
3,3,"Dude, You Can Do It! How to Build a Sweeet PC","[Darrel Creacy,Carlito Vicencio]",2005,,[],Whether you're frustrated with current PC offe...
5,5,Interpreting Kullback-Leibler divergence with ...,"[Shinto Eguchi,John Copas]",2006,Journal of Multivariate Analysis,[436405],Kullback-Leibler divergence and the Neyman-Pea...
7,7,TOPP---the OpenMS proteomics pipeline,"[Oliver Kohlbacher,Knut Reinert,Clemens Gröpl,...",2007,Bioinformatics,[],Motivation: Experimental techniques in proteom...
12,12,"Webbots, Spiders, and Screen Scrapers","[Michael Schrenk,Michael Shrenk]",2007,,[],The Internet is bigger and better than what a ...
...,...,...,...,...,...,...,...
629804,629804,SENTINEL: a semantic business process monitori...,"[Carlos Pedrinaci,Dave Lambert,Branimir Wetzst...",2008,Proceedings of the first international worksho...,"[12156, 29272, 29779, 88763, 261856, 340817, 4...",Business Activity Monitoring (BAM) aims to sup...
629806,629806,Effectiveness and usability of an online help ...,"[Jérôme Simonin,Noëlle Carbonell,Danielle Pelé]",2008,Proceedings of the 10th international conferen...,"[8543, 327540, 395578, 397153, 398612]",An empirical study is presented which aims at ...
629807,629807,Busy period analysis of finite QBD processes,"[Chaitanya Garikiparthi,Appie van de Liefvoort...",2008,ACM SIGMETRICS Performance Evaluation Review,[340965],We present the number of customers served and ...
629808,629808,The Grid as a Single Entity: Towards a Behavio...,"[Jesús Montes,Alberto Sánchez,Julio J. Valdés,...",2008,Proceedings of the OTM 2008 Confederated Inter...,[],Grids emerged in the last decade as large dist...


In [13]:
G.remove_nodes_from(list(nx.isolates(G)))
print(G.number_of_nodes())

217335


In [27]:
k = 2500

communities=nx.algorithms.community.centrality.girvan_newman(G)

tuple(sorted(c) for c in next(communities))



KeyboardInterrupt: 

In [15]:
print(limited)

<itertools.takewhile object at 0x7f3a3c63f690>


In [36]:
def check_distance(G, start, end, distance):
    edges=list(nx.bfs_edges(G, start, depth_limit=distance))
    nodes = [start] + [v for u, v in edges]
    return end in nodes

In [43]:
x= nx.path_graph(10)

print(x)
print(check_distance(x, 0, 5, 5))


True
