In [29]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
from textblob import TextBlob
import itertools

In [12]:
# Location of the raw citation dump that the rest of the notebook parses.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
lauri_path = "D:\\graphs\\outputacm.txt"

# Peek at the first 100 lines to see the record layout. islice stops reading
# after 100 lines, and the context manager closes the file handle afterwards.
with open(lauri_path, 'r', encoding="utf8") as test:
    preview_lines = list(itertools.islice(test, 100))
preview_lines

['629814\n',
 '#*Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)\n',
 '#@Hoon Hong,Dongming Wang\n',
 '#t2006\n',
 '#c\n',
 '#index0\n',
 '\n',
 '#*A+ Certification Core Hardware (Text & Lab Manual)\n',
 '#@Charles J. Brooks\n',
 '#t2003\n',
 '#c\n',
 '#index1\n',
 '\n',
 '#*Performance engineering in industry: current practices and adoption challenges\n',
 '#@Ahmed E. Hassan,Parminder Flora\n',
 '#t2007\n',
 '#cProceedings of the 6th international workshop on Software and performance\n',
 '#index2\n',
 '#!This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling research by large industrial software systems.\n',
 '\n',
 '#*Dude, You Can Do It!

In [13]:
# Notebook-level directed graph; create_graph() adds nodes and edges to it.
G = nx.DiGraph()

In [60]:
def create_graph(reference, graph=None):
    """Add one parsed paper record to the citation graph.

    The paper becomes a node keyed by ``reference['index']``. Whichever of
    ``title``, ``journal`` and ``abstract`` are present in the record are
    stored as node attributes, and one directed edge (paper -> cited paper)
    is added for every entry of ``reference['citations']``.

    Parameters
    ----------
    reference : dict
        Parsed record. Must contain ``'index'``; ``'title'``, ``'journal'``,
        ``'abstract'`` and ``'citations'`` are optional.
    graph : optional
        Object exposing ``add_node(node, **attrs)`` and ``add_edge(u, v)``.
        Defaults to the notebook-level graph ``G``.

    Raises
    ------
    KeyError
        If ``reference`` has no ``'index'`` entry.
    """
    if graph is None:
        graph = G

    node_id = reference['index']
    # Only forward the attributes that were actually parsed, so a record that
    # lacks e.g. a journal line does not abort the whole load with a KeyError.
    attributes = {
        key: reference[key]
        for key in ('title', 'journal', 'abstract')
        if key in reference
    }
    # A single add_node call both creates the node and sets its attributes.
    graph.add_node(node_id, **attributes)

    for citation in reference.get('citations', ()):
        graph.add_edge(node_id, citation)

In [61]:
def parse_data(path=None, on_record=None):
    """Stream paper records out of the citation dump.

    Each line of the file starts with a tag: ``#*`` title, ``#@`` authors,
    ``#t`` year, ``#c`` journal/venue, ``#index`` paper id, ``#%`` id of a
    cited paper, ``#!`` abstract. A title line starts a new record. A record
    is emitted when its abstract line is read, so records without an abstract
    are not emitted.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the notebook-level ``lauri_path``.
    on_record : callable, optional
        Called with every emitted record just before it is yielded. Defaults
        to ``create_graph``.

    Yields
    ------
    dict
        Record with the keys that were present in the file among ``title``,
        ``author`` (list of names), ``year``, ``journal``, ``index``, plus
        ``abstract`` and ``citations`` (list of cited paper ids). All scalar
        values are strings.
    """
    if path is None:
        path = lauri_path
    if on_record is None:
        on_record = create_graph

    reference = {}
    citations = []
    with open(path, 'r', encoding="utf8") as f:
        # Iterate over the handle so the whole dump is never held in memory.
        for line in f:
            # Tags are matched at the start of the line only: free text such
            # as an abstract may itself contain "#c" or "#t".
            if line.startswith('#*'):  # article title: start of a new record
                reference = {'title': line[2:].rstrip()}
                citations = []
            elif line.startswith('#@'):  # authors, comma-separated
                reference['author'] = [
                    name.strip()
                    for name in line[2:].split(',')
                    if name.strip()
                ]
            elif line.startswith('#t'):  # year published
                reference['year'] = line[2:].rstrip()
            elif line.startswith('#c'):  # journal
                reference['journal'] = line[2:].rstrip()
            elif line.startswith('#index'):  # paper id
                reference['index'] = line[6:].rstrip()
            elif line.startswith('#%'):  # id of cited paper
                cited = line[2:].rstrip()
                if cited:  # skip empty citation lines
                    citations.append(cited)
            elif line.startswith('#!'):  # abstract: completes the record
                reference['abstract'] = line[2:].rstrip()
                reference['citations'] = citations
                on_record(reference)
                yield reference
                reference = {}
                citations = []

In [62]:
# Materialise every record yielded by parse_data() into a DataFrame with a
# fixed column order; keys missing from a record become NaN in that column.
data = pd.DataFrame(
    data=parse_data(),
    columns=[
        'index',
        'title',
        'author',
        'year',
        'journal',
        'citations',
        'abstract',
    ],
)

In [63]:
# Report the size of the citation graph.
print(f"{G.number_of_nodes()} nodes")
print(f"{G.size()} edges")

629814 nodes
632751 edges


In [64]:
# Keep the first five rows around for a quick look at the parsed columns.
data_top = data.head(5)
data_top

Unnamed: 0,index,title,author,year,journal,citations,abstract
0,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
1,3,"Dude, You Can Do It! How to Build a Sweeet PC","[Darrel Creacy,Carlito Vicencio]",2005,,[],Whether you're frustrated with current PC offe...
2,5,Interpreting Kullback-Leibler divergence with ...,"[Shinto Eguchi,John Copas]",2006,Journal of Multivariate Analysis,[436405],Kullback-Leibler divergence and the Neyman-Pea...
3,7,TOPP---the OpenMS proteomics pipeline,"[Oliver Kohlbacher,Knut Reinert,Clemens Gröpl,...",2007,Bioinformatics,[],Motivation: Experimental techniques in proteom...
4,12,"Webbots, Spiders, and Screen Scrapers","[Michael Schrenk,Michael Shrenk]",2007,,[],The Internet is bigger and better than what a ...


In [65]:
# Sanity check: confirm the citation graph is directed.
G.is_directed()

True

In [66]:
# (rows, columns) of the parsed DataFrame.
data.shape

(281079, 7)

In [67]:
# Display the frame with rows lacking an 'index' value removed.
# NOTE(review): the result is not assigned, so `data` itself is unchanged by
# this cell — assign it to a new name if the filtered frame is needed later.
data.dropna(subset=['index'])

Unnamed: 0,index,title,author,year,journal,citations,abstract
0,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
1,3,"Dude, You Can Do It! How to Build a Sweeet PC","[Darrel Creacy,Carlito Vicencio]",2005,,[],Whether you're frustrated with current PC offe...
2,5,Interpreting Kullback-Leibler divergence with ...,"[Shinto Eguchi,John Copas]",2006,Journal of Multivariate Analysis,[436405],Kullback-Leibler divergence and the Neyman-Pea...
3,7,TOPP---the OpenMS proteomics pipeline,"[Oliver Kohlbacher,Knut Reinert,Clemens Gröpl,...",2007,Bioinformatics,[],Motivation: Experimental techniques in proteom...
4,12,"Webbots, Spiders, and Screen Scrapers","[Michael Schrenk,Michael Shrenk]",2007,,[],The Internet is bigger and better than what a ...
...,...,...,...,...,...,...,...
281074,629806,Effectiveness and usability of an online help ...,"[Jérôme Simonin,Noëlle Carbonell,Danielle Pelé]",2008,Proceedings of the 10th international conferen...,"[8543, 327540, 395578, 397153, 398612]",An empirical study is presented which aims at ...
281075,629807,Busy period analysis of finite QBD processes,"[Chaitanya Garikiparthi,Appie van de Liefvoort...",2008,ACM SIGMETRICS Performance Evaluation Review,[340965],We present the number of customers served and ...
281076,629808,The Grid as a Single Entity: Towards a Behavio...,"[Jesús Montes,Alberto Sánchez,Julio J. Valdés,...",2008,Proceedings of the OTM 2008 Confederated Inter...,[],Grids emerged in the last decade as large dist...
281077,629811,Multimodal system evaluation using modality ef...,"[Manolis Perakakis,Alexandros Potamianos]",2008,Proceedings of the 10th international conferen...,"[294663, 302639, 572828]","In this paper, we propose two new objective me..."


In [68]:
# Re-check (rows, columns) of `data` after the dropna display above.
data.shape

(281079, 7)

In [69]:
# Rank all nodes by degree, highest first, as a list of (node, degree) pairs.
G_sorted = list(G.degree)
G_sorted.sort(key=lambda node_and_degree: node_and_degree[1], reverse=True)

In [70]:
# Number of ranked entries (one per node in G).
len(G_sorted)

629814

In [71]:
# The five highest-degree nodes as (node id, degree) pairs.
G_sorted[0:5]

[('453387', 816),
 ('162585', 791),
 ('214951', 648),
 ('151297', 588),
 ('326368', 568)]

In [78]:
# Node id of the entry at position 2 of the degree ranking.
# NOTE(review): the saved output of this cell matches position 1 of the
# ranking printed earlier, so the stored outputs look stale — re-run the
# notebook top to bottom to confirm.
G_sorted[2][0]

'162585'

In [81]:
# Attribute dict stored on the node at position 3 of the degree ranking.
G.nodes[G_sorted[3][0]]

{'title': 'Smalltalk-80: the language and its implementation',
 'journal': '',
 'abstract': "From the Preface (See Front Matter for full Preface) Advances in the design and production of computer hardware have brought many more people into direct contact with computers. Similar advances in the design and production of computer software are required in order that this increased contact be as rewarding as possible. The Smalltalk-80 system is a result of a decade of research into creating computer software that is appropriate for producing highly functional and interactive contact with personal computer systems. This book is the first detailed account of the Smalltalk-80 system. It is divided into four major parts: Part One -- an overview of the concepts and syntax of the programming language. Part Two -- an annotated and illustrated specification of the system's functionality. Part Three -- an example of the design and implementation of a moderate-size application. Part Four -- a specifi

In [83]:
# Abstract text of that same node (position 3 of the degree ranking).
G.nodes[G_sorted[3][0]]['abstract']

"From the Preface (See Front Matter for full Preface) Advances in the design and production of computer hardware have brought many more people into direct contact with computers. Similar advances in the design and production of computer software are required in order that this increased contact be as rewarding as possible. The Smalltalk-80 system is a result of a decade of research into creating computer software that is appropriate for producing highly functional and interactive contact with personal computer systems. This book is the first detailed account of the Smalltalk-80 system. It is divided into four major parts: Part One -- an overview of the concepts and syntax of the programming language. Part Two -- an annotated and illustrated specification of the system's functionality. Part Three -- an example of the design and implementation of a moderate-size application. Part Four -- a specification of the Smalltalk-80 virtual machine."

In [86]:
# Extract noun phrases from that abstract with TextBlob.
TextBlob(G.nodes[G_sorted[3][0]]['abstract']).noun_phrases

WordList(['preface', 'front matter', 'preface', 'advances', 'computer hardware', 'direct contact', 'similar advances', 'computer software', 'smalltalk-80', 'computer software', 'interactive contact', 'personal computer systems', 'smalltalk-80', 'major parts', 'part', 'part', "system 's functionality", 'part', 'moderate-size application', 'part', 'smalltalk-80', 'virtual machine'])