## Preprocessing

In [1]:
import ast
import re

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import spacy
import torch
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_networkx

In [2]:
# Path is relative to the notebook's working directory.
filename = 'data/arxiv-metadata-oai-snapshot-10000.csv'
# Read `id` as text: the identifiers are labels, not numbers. Parsed as
# floats they lose zeros (the first row displayed as 704.0001, and a value
# ending in "0" would be shortened further), which makes them ambiguous.
df = pd.read_csv(filename, dtype={'id': str})

In [3]:
def extract_pages(s):
    """Extract the page count from a free-text comment string.

    Parameters
    ----------
    s : str
        Comment text such as ``"37 pages, 15 figures"``.

    Returns
    -------
    int or None
        The first number directly followed by the word "page" or "pages"
        (any letter case, optional whitespace in between), or ``None`` when
        there is no such number or ``s`` is not a string.
    """
    if not isinstance(s, str):
        return None
    # `pages?` also accepts the singular ("1 page"); `\b` keeps words that
    # merely start with "page" from matching.
    match = re.search(r"(\d+)\s*pages?\b", s, flags=re.IGNORECASE)
    return int(match.group(1)) if match else None

In [4]:
# NOTE: this cell rewrites `authors_parsed` and drops columns, so it can only
# be run once per load of the CSV.

# `authors_parsed` holds the text of a Python list of name-part lists. Parse it
# as a literal (ast.literal_eval cannot execute arbitrary code, unlike eval)
# and join each author's parts into a single name string.
df['authors_parsed'] = df['authors_parsed'].apply(
    lambda raw: [" ".join(parts).strip() for parts in ast.literal_eval(raw)]
)
# Keep only the "created" date of the first listed version ...
first_version_created = df['versions'].apply(
    lambda raw: ast.literal_eval(raw)[0]["created"]
)
# ... and store it as seconds since the Unix epoch.
df['timestamp'] = pd.to_datetime(
    first_version_created, format="%a, %d %b %Y %H:%M:%S %Z"
).apply(lambda ts: ts.timestamp())
# Categories arrive as one space-separated string per paper.
df["categories"] = df["categories"].apply(lambda cats: cats.split(" "))
df = df.drop(columns=["submitter", "versions", "update_date", "authors"])
# Missing comments become the string "nan", which yields no page count.
df["pages"] = df.comments.apply(lambda comment: extract_pages(str(comment)))
df.head()

Unnamed: 0,id,title,comments,journal-ref,doi,report-no,categories,license,abstract,authors_parsed,timestamp,pages
0,704.0001,Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,[hep-ph],,A fully differential calculation in perturba...,"[Balázs C., Berger E. L., Nadolsky P. M., Yuan...",1175542000.0,37.0
1,704.0002,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,"[math.CO, cs.CG]",http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[Streinu Ileana, Theran Louis]",1175308000.0,
2,704.0003,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,[physics.gen-ph],,The evolution of Earth-Moon system is descri...,[Pan Hongjun],1175460000.0,23.0
3,704.0004,A determinant of Stirling cycle numbers counts...,11 pages,,,,[math.CO],,We show that a determinant of Stirling cycle...,[Callan David],1175311000.0,11.0
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,"[math.CA, math.FA]",,In this paper we show how to compute the $\L...,"[Abu-Shammala Wael, Torchinsky Alberto]",1175537000.0,


In [5]:
# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once instead of on every call.
_NLP_CACHE = {}


def _get_nlp(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def tokenize_and_normalize(text):
    """Split ``text`` into lower-cased tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased token texts, with punctuation and whitespace tokens
        removed.
    """
    doc = _get_nlp()(text)
    return [token.text.lower() for token in doc if not token.is_punct and not token.is_space]


def lemm(text):
    """Split ``text`` into lemmas.

    Parameters
    ----------
    text : str
        Raw text to lemmatize.

    Returns
    -------
    list of str
        The ``lemma_`` of every token, with punctuation and whitespace tokens
        removed. Unlike ``tokenize_and_normalize`` the result is not
        lower-cased.
    """
    doc = _get_nlp()(text)
    return [token.lemma_ for token in doc if not token.is_punct and not token.is_space]

In [6]:
# Work on the first ten papers only while prototyping.
df_short = df.head(10)

In [7]:
# Run both text passes over every abstract of the sample; each result list
# holds one token list per paper, in row order.
tokenized_words_list = []
lemm_list = []

for abstract in df_short['abstract']:
    tokenized_words_list.append(tokenize_and_normalize(abstract))
    lemm_list.append(lemm(abstract))

print(tokenized_words_list)
print(lemm_list)


[['a', 'fully', 'differential', 'calculation', 'in', 'perturbative', 'quantum', 'chromodynamics', 'is', 'presented', 'for', 'the', 'production', 'of', 'massive', 'photon', 'pairs', 'at', 'hadron', 'colliders', 'all', 'next', 'to', 'leading', 'order', 'perturbative', 'contributions', 'from', 'quark', 'antiquark', 'gluon-(anti)quark', 'and', 'gluon', 'gluon', 'subprocesses', 'are', 'included', 'as', 'well', 'as', 'all', 'orders', 'resummation', 'of', 'initial', 'state', 'gluon', 'radiation', 'valid', 'at', 'next', 'to', 'next', 'to', 'leading', 'logarithmic', 'accuracy', 'the', 'region', 'of', 'phase', 'space', 'is', 'specified', 'in', 'which', 'the', 'calculation', 'is', 'most', 'reliable', 'good', 'agreement', 'is', 'demonstrated', 'with', 'data', 'from', 'the', 'fermilab', 'tevatron', 'and', 'predictions', 'are', 'made', 'for', 'more', 'detailed', 'tests', 'with', 'cdf', 'and', 'do', 'data', 'predictions', 'are', 'shown', 'for', 'distributions', 'of', 'diphoton', 'pairs', 'produced', 

In [8]:
# Show the normalized tokens of the first abstract only.
print(tokenized_words_list[0])

['a', 'fully', 'differential', 'calculation', 'in', 'perturbative', 'quantum', 'chromodynamics', 'is', 'presented', 'for', 'the', 'production', 'of', 'massive', 'photon', 'pairs', 'at', 'hadron', 'colliders', 'all', 'next', 'to', 'leading', 'order', 'perturbative', 'contributions', 'from', 'quark', 'antiquark', 'gluon-(anti)quark', 'and', 'gluon', 'gluon', 'subprocesses', 'are', 'included', 'as', 'well', 'as', 'all', 'orders', 'resummation', 'of', 'initial', 'state', 'gluon', 'radiation', 'valid', 'at', 'next', 'to', 'next', 'to', 'leading', 'logarithmic', 'accuracy', 'the', 'region', 'of', 'phase', 'space', 'is', 'specified', 'in', 'which', 'the', 'calculation', 'is', 'most', 'reliable', 'good', 'agreement', 'is', 'demonstrated', 'with', 'data', 'from', 'the', 'fermilab', 'tevatron', 'and', 'predictions', 'are', 'made', 'for', 'more', 'detailed', 'tests', 'with', 'cdf', 'and', 'do', 'data', 'predictions', 'are', 'shown', 'for', 'distributions', 'of', 'diphoton', 'pairs', 'produced', '

In [13]:
# Flatten the per-abstract lemma lists into one list containing every word
# (duplicates kept, original order preserved).
words = []
for lemmas in lemm_list:
    words.extend(lemmas)

print(words)

['a', 'fully', 'differential', 'calculation', 'in', 'perturbative', 'quantum', 'chromodynamic', 'be', 'present', 'for', 'the', 'production', 'of', 'massive', 'photon', 'pair', 'at', 'hadron', 'collider', 'all', 'next', 'to', 'lead', 'order', 'perturbative', 'contribution', 'from', 'quark', 'antiquark', 'gluon-(anti)quark', 'and', 'gluon', 'gluon', 'subprocesse', 'be', 'include', 'as', 'well', 'as', 'all', 'order', 'resummation', 'of', 'initial', 'state', 'gluon', 'radiation', 'valid', 'at', 'next', 'to', 'next', 'to', 'lead', 'logarithmic', 'accuracy', 'the', 'region', 'of', 'phase', 'space', 'be', 'specify', 'in', 'which', 'the', 'calculation', 'be', 'most', 'reliable', 'good', 'agreement', 'be', 'demonstrate', 'with', 'datum', 'from', 'the', 'Fermilab', 'Tevatron', 'and', 'prediction', 'be', 'make', 'for', 'more', 'detailed', 'test', 'with', 'CDF', 'and', 'do', 'datum', 'prediction', 'be', 'show', 'for', 'distribution', 'of', 'diphoton', 'pair', 'produce', 'at', 'the', 'energy', 'of'

In [14]:
# list with all words without duplicates
words_list = []
for i in range(len(words)):
    if words[i] not in words_list:
        words_list.append(words[i])

In [15]:
# Show the de-duplicated vocabulary (first-occurrence order).
print(words_list)

['a', 'fully', 'differential', 'calculation', 'in', 'perturbative', 'quantum', 'chromodynamic', 'be', 'present', 'for', 'the', 'production', 'of', 'massive', 'photon', 'pair', 'at', 'hadron', 'collider', 'all', 'next', 'to', 'lead', 'order', 'contribution', 'from', 'quark', 'antiquark', 'gluon-(anti)quark', 'and', 'gluon', 'subprocesse', 'include', 'as', 'well', 'resummation', 'initial', 'state', 'radiation', 'valid', 'logarithmic', 'accuracy', 'region', 'phase', 'space', 'specify', 'which', 'most', 'reliable', 'good', 'agreement', 'demonstrate', 'with', 'datum', 'Fermilab', 'Tevatron', 'prediction', 'make', 'more', 'detailed', 'test', 'CDF', 'do', 'show', 'distribution', 'diphoton', 'produce', 'energy', 'Large', 'Hadron', 'Collider', 'LHC', 'decay', 'Higgs', 'boson', 'contrast', 'those', 'QCD', 'process', 'that', 'enhance', 'sensitivity', 'signal', 'can', 'obtain', 'judicious', 'selection', 'event', 'we', 'describe', 'new', 'algorithm', '$', 'k,\\ell)$-pebble', 'game', 'color', 'use',

In [17]:
# Print the token count of each abstract, one line per paper. Iterating the
# list itself keeps this in step with the sample size instead of assuming 10.
for tokens in tokenized_words_list:
    print(len(tokens))
# Idea: the number of words per abstract could be used to map words back to
# the paper they belong to, e.g. words 0 to 149 belong to paper 1.
# TODO: decide what to do with identical words that occur in several papers.

150
120
151
37
41
148
157
135
299
107


In [18]:
# Rebuild one space-separated string of lemmas per abstract.
text_list = list(map(' '.join, lemm_list))

print(text_list)

['a fully differential calculation in perturbative quantum chromodynamic be present for the production of massive photon pair at hadron collider all next to lead order perturbative contribution from quark antiquark gluon-(anti)quark and gluon gluon subprocesse be include as well as all order resummation of initial state gluon radiation valid at next to next to lead logarithmic accuracy the region of phase space be specify in which the calculation be most reliable good agreement be demonstrate with datum from the Fermilab Tevatron and prediction be make for more detailed test with CDF and do datum prediction be show for distribution of diphoton pair produce at the energy of the Large Hadron Collider LHC distribution of the diphoton pair from the decay of a Higgs boson be contrast with those produce from QCD process at the LHC show that enhance sensitivity to the signal can be obtain with judicious selection of event', 'we describe a new algorithm the $ k,\\ell)$-pebble game with color a

In [19]:
# list with titles
# title_list=[]
# for i in range(len(df_short)):
#     title_list.append(df_short.title[i])
# print(title_list)

In [20]:
# create lists of attributes

def _fill_from_column(target, df, column):
    """Replace the contents of ``target`` with the values of ``df[column]`` and print it.

    The list is updated in place (same list object), so names already bound to
    it see the new contents. Values are taken in row order regardless of the
    frame's index labels, and calling this again replaces the previous
    contents instead of appending to them, so re-running a cell is safe.
    """
    target[:] = df[column].tolist()
    print(target)


licenses_list = []
def licenses_in_list(df):
    """Fill ``licenses_list`` with the ``license`` column of ``df`` and print it."""
    _fill_from_column(licenses_list, df, 'license')

doi_list = []
def doi_in_list(df):
    """Fill ``doi_list`` with the ``doi`` column of ``df`` and print it."""
    _fill_from_column(doi_list, df, 'doi')

title_list = []
def titles_in_list(df):
    """Fill ``title_list`` with the ``title`` column of ``df`` and print it."""
    _fill_from_column(title_list, df, 'title')

comment_list = []
def comments_in_list(df):
    """Fill ``comment_list`` with the ``comments`` column of ``df`` and print it."""
    _fill_from_column(comment_list, df, 'comments')


author_list = []
def authors_in_list(df):
    """Fill ``author_list`` with the ``authors_parsed`` column of ``df`` and print it."""
    _fill_from_column(author_list, df, 'authors_parsed')


categories_list = []
def categories_in_list(df):
    """Fill ``categories_list`` with the ``categories`` column of ``df`` and print it."""
    _fill_from_column(categories_list, df, 'categories')


journal_list = []
def journals_in_list(df):
    """Fill ``journal_list`` with the ``journal-ref`` column of ``df`` and print it."""
    _fill_from_column(journal_list, df, 'journal-ref')

In [21]:
# Populate every attribute list from the sample frame; each call also prints
# the list it filled.
for fill_list in (
    licenses_in_list,
    doi_in_list,
    titles_in_list,
    comments_in_list,
    authors_in_list,
    categories_in_list,
    journals_in_list,
):
    fill_list(df_short)

[nan, 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', nan, nan, nan, nan, nan, 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', nan, nan]
['10.1103/PhysRevD.76.013009', nan, nan, nan, nan, '10.1103/PhysRevA.75.043613', '10.1103/PhysRevD.76.044016', '10.1063/1.2975338', '10.1086/518646', nan]
['Calculation of prompt diphoton production cross sections at Tevatron and\r\n  LHC energies', 'Sparsity-certifying Graph Decompositions', 'The evolution of the Earth-Moon system based on the dark matter field\r\n  fluid model', 'A determinant of Stirling cycle numbers counts unlabeled acyclic\r\n  single-source automata', 'From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\alpha}$', 'Bosonic characters of atomic Cooper pairs across resonance', 'Polymer Quantum Mechanics and its Continuum Limit', 'Numerical solution of shock and ramp compression for general material\r\n  properties', 'The Spitzer c2d Survey of Large, Nearby, Insterstellar Clouds. IX. The\r\n  Serpens YSO Population As Ob

In [22]:
# Node types and their attributes, applied to the graph in the listed order.
# NOTE(review): 'author', 'category' and 'journal' are sized by the number of
# papers, and their `name` lists hold one entry per paper (e.g. the list of
# authors of that paper) rather than one entry per distinct author, category
# or journal. Confirm this is intended before building edges.
paper_count = len(df_short)
node_attributes = {
    'paper': {
        'num_nodes': paper_count,
        'license': licenses_list,
        'doi': doi_list,
        'title': title_list,
        'comment': comment_list,
    },
    'author': {'num_nodes': paper_count, 'name': author_list},
    'category': {'num_nodes': paper_count, 'name': categories_list},
    'journal': {'num_nodes': paper_count, 'name': journal_list},
    'word': {'num_nodes': len(words_list), 'name': words_list},
}

data = HeteroData()
for node_type, attributes in node_attributes.items():
    for attribute, value in attributes.items():
        setattr(data[node_type], attribute, value)

data

HeteroData(
  paper={
    num_nodes=10,
    license=[10],
    doi=[10],
    title=[10],
    comment=[10],
  },
  author={
    num_nodes=10,
    name=[10],
  },
  category={
    num_nodes=10,
    name=[10],
  },
  journal={
    num_nodes=10,
    name=[10],
  },
  word={
    num_nodes=542,
    name=[542],
  }
)