In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
from textblob import TextBlob
import itertools

In [125]:
os.path.exists("D:\\graphs\\outputacm.txt")

True

In [2]:
lauri_path = "outputacm.txt"
test = open(lauri_path, 'r',  encoding="utf8") 
test.readlines()[0:100]

['629814\n',
 '#*Automated Deduction in Geometry: 5th International Workshop, ADG 2004, Gainesville, FL, USA, September 16-18, 2004, Revised Papers (Lecture Notes in Computer ... / Lecture Notes in Artificial Intelligence)\n',
 '#@Hoon Hong,Dongming Wang\n',
 '#t2006\n',
 '#c\n',
 '#index0\n',
 '\n',
 '#*A+ Certification Core Hardware (Text & Lab Manual)\n',
 '#@Charles J. Brooks\n',
 '#t2003\n',
 '#c\n',
 '#index1\n',
 '\n',
 '#*Performance engineering in industry: current practices and adoption challenges\n',
 '#@Ahmed E. Hassan,Parminder Flora\n',
 '#t2007\n',
 '#cProceedings of the 6th international workshop on Software and performance\n',
 '#index2\n',
 '#!This panel session discusses performance engineering practices in industry. Presentations in the session will explore the use of lightweight techniques and approaches in order to permit the cost effective and rapid adoption of performance modeling research by large industrial software systems.\n',
 '\n',
 '#*Dude, You Can Do It!

In [3]:
G = nx.DiGraph()

In [4]:
def create_graph(reference):
    G.add_node(reference['index']) # add node
    G.add_node(reference['index'], title=reference['title']) # add title attribute to node
    G.add_node(reference['index'], journal=reference['journal']) # add journal attribute to node
    if 'abstract' in reference:
        G.add_node(reference['index'], abstract=reference['abstract']) # add abstract attribute to node
    for citation in reference['citations']:
        G.add_edge(reference['index'], citation) # add edge to node

In [5]:
def parse_data():
    with open(lauri_path, 'r',  encoding="utf8") as f:
        reference={}
        citations=[]
        readFile = f.readlines()
        for line in readFile:
            
            if '#*' in line: # article title
                if (bool(reference)):
                    reference['citations'] = citations
                    citations=[]
                    create_graph(reference)
                    try:
                        yield reference
                    except IndexError as e:
                        continue
                    reference={}
                reference['title'] = line[2:].rstrip()
            elif '#@' in line: # authors
                reference['author'] = line[2:].rstrip().rsplit(";")
            elif '#t' in line: # year published
                reference['year'] = line[2:].rstrip()
            elif '#c' in line: # journal
                reference['journal'] = line[2:].rstrip()
            elif '#index' in line: # index
                reference['index'] = line[6:].rstrip()
            elif '#%' in line: # id of cited paper
                citations.append(line[2:].rstrip())
            elif '#!' in line: # abstract
                reference['abstract'] = line[2:].rstrip()
                if (bool(reference)):
                    reference['citations'] = citations
                    citations=[]
                    create_graph(reference)
                    try:
                        yield reference
                    except IndexError as e:
                        continue
        create_graph(reference)
        return reference

In [6]:
data = pd.DataFrame(parse_data(), columns =('index', 'title', 'author',
                                            'year', 'journal', 'citations', 'abstract'))

In [7]:
print(G.number_of_nodes(), 'nodes')
print(G.size(), 'edges')

629814 nodes
632751 edges


In [8]:
data_top = data.head() 
data_top  

Unnamed: 0,index,title,author,year,journal,citations,abstract
0,0,Automated Deduction in Geometry: 5th Internati...,"[Hoon Hong,Dongming Wang]",2006,,[],
1,1,A+ Certification Core Hardware (Text & Lab Man...,[Charles J. Brooks],2003,,[],
2,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
3,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
4,3,"Dude, You Can Do It! How to Build a Sweeet PC","[Darrel Creacy,Carlito Vicencio]",2005,,[],Whether you're frustrated with current PC offe...


In [133]:
G.is_directed()

True

In [134]:
data.shape

(910892, 7)

In [9]:
data.dropna(subset=['index'])

Unnamed: 0,index,title,author,year,journal,citations,abstract
0,0,Automated Deduction in Geometry: 5th Internati...,"[Hoon Hong,Dongming Wang]",2006,,[],
1,1,A+ Certification Core Hardware (Text & Lab Man...,[Charles J. Brooks],2003,,[],
2,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
3,2,Performance engineering in industry: current p...,"[Ahmed E. Hassan,Parminder Flora]",2007,Proceedings of the 6th international workshop ...,[],This panel session discusses performance engin...
4,3,"Dude, You Can Do It! How to Build a Sweeet PC","[Darrel Creacy,Carlito Vicencio]",2005,,[],Whether you're frustrated with current PC offe...
...,...,...,...,...,...,...,...
910887,629810,Review article,[],2008,Communications of the ACM,[],
910888,629811,Multimodal system evaluation using modality ef...,"[Manolis Perakakis,Alexandros Potamianos]",2008,Proceedings of the 10th international conferen...,[],"In this paper, we propose two new objective me..."
910889,629811,Multimodal system evaluation using modality ef...,"[Manolis Perakakis,Alexandros Potamianos]",2008,Proceedings of the 10th international conferen...,[],"In this paper, we propose two new objective me..."
910890,629812,Computer System Architecture,[V. K. Jain],2007,,[],


In [10]:
data.shape

(910892, 7)

In [11]:
G_sorted = sorted(G.degree, key=lambda x: x[1], reverse=True)

In [12]:
len(G_sorted)

629814

In [13]:
G_sorted[0:5]

[('453387', 816),
 ('162585', 791),
 ('214951', 648),
 ('151297', 588),
 ('326368', 568)]

In [140]:
G_sorted[2][0]

'214951'

In [141]:
print(G.number_of_nodes(), 'nodes')
print(G.size(), 'edges')

629814 nodes
632751 edges


In [58]:

print(G.in_degree('151297'))


588
3054
5329
25416
49640
50960
53648
53720
54995
62185
69282
74786
75636
75854
76229
77108
80365
80467
82571
84027
84802
85273
86059
86261
86330
88638
90703
91278
91368
91398
91645
92541
93212
94339
94862
96497
96719
99934
108081
119771
121208
123456
126248
137230
141172
142837
143056
143122
143849
144348
144431
144448
144562
144746
145670
145921
146286
146403
146561
146606
147074
147375
147402
147552
147625
148158
149334
149714
149776
150506
151117
151602
151679
152032
152045
152224
152299
152334
152884
152936
153269
153375
153485
153873
154025
154514
154983
155065
155187
155411
155941
156198
156734
156916
157380
157829
157831
157923
158278
158306
158361
158718
158818
158902
159010
159420
159507
159971
160120
160433
160469
160734
161447
161519
161793
162499
162605
162915
163088
163238
163293
163298
163393
163419
164772
165077
165195
165397
165625
165894
166010
166393
166833
167134
167155
167236
167495
168230
168340
168580
168601
168954
169183
169713
170010
170890
171066
171225
171669

In [78]:
G.nodes['77108'].keys()

dict_keys(['title', 'journal'])

In [16]:
TextBlob(G.nodes[G_sorted[3][0]]['abstract']).noun_phrases

WordList(['preface', 'front matter', 'preface', 'advances', 'computer hardware', 'direct contact', 'similar advances', 'computer software', 'smalltalk-80', 'computer software', 'interactive contact', 'personal computer systems', 'smalltalk-80', 'major parts', 'part', 'part', "system 's functionality", 'part', 'moderate-size application', 'part', 'smalltalk-80', 'virtual machine'])

In [20]:
#wordlist is the wordlist of the central node and comparison is wordlist of one of it's neighbor
def shared_noun_phrases(wordlist, comparison):
    count=0
    for word in wordlist:
        if word in comparison:
            count+=1
    return count/len(wordlist)
            

In [17]:
def find_noun_phrases(graph, starting_node):
    noun_phrases= TextBlob(G.nodes[starting_node]['abstract']).noun_phrases
    data_set=[noun_phrases]
    for neighbor in graph.predecessors(starting_node):
        if 'abstract' in G.nodes[neighbor].keys():
            data_set.append(shared_noun_phrases(noun_phrases, TextBlob(G.nodes[neighbor]['abstract']).noun_phrases))
    return data_set

In [21]:
top_ten_citations=[]
i=0
k=0
while k<10:
    if 'abstract' in G.nodes[G_sorted[i][0]].keys():
        top_ten_citations.append(find_noun_phrases(G, G_sorted[i][0]))
        k+=1
    i+=1