## Preprocessing

In [118]:
import pandas as pd
import re

In [None]:
filename = 'data/arxiv-metadata-oai-snapshot-10000.csv'
df = pd.read_csv(filename)

In [None]:
def extract_pages(s):
    match = re.search(r"(\d+)\s*pages", s)
    if match:
        return int(match.group(1))
    else:
        return None

In [121]:
df['authors_parsed'] = df['authors_parsed'].apply(lambda x: [" ".join(i).strip() for i in eval(x)])
df['versions'] = df['versions'].apply(lambda x: eval(x)[0]["created"])
df['timestamp'] = pd.to_datetime(df['versions'], format="%a, %d %b %Y %H:%M:%S %Z")
df['timestamp'] = df['timestamp'].apply(lambda x: x.timestamp())
df["categories"] = df["categories"].apply(lambda x: x.split(" "))
df.drop(columns=["submitter", "versions", "update_date", "authors"], inplace=True)
df["pages"] = df.comments.apply(lambda x: extract_pages(str(x)))
df.head()

Unnamed: 0,id,title,comments,journal-ref,doi,report-no,categories,license,abstract,authors_parsed,timestamp,pages
0,704.0001,Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,[hep-ph],,A fully differential calculation in perturba...,"[Balázs C., Berger E. L., Nadolsky P. M., Yuan...",1175542000.0,37.0
1,704.0002,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,"[math.CO, cs.CG]",http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[Streinu Ileana, Theran Louis]",1175308000.0,
2,704.0003,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,[physics.gen-ph],,The evolution of Earth-Moon system is descri...,[Pan Hongjun],1175460000.0,23.0
3,704.0004,A determinant of Stirling cycle numbers counts...,11 pages,,,,[math.CO],,We show that a determinant of Stirling cycle...,[Callan David],1175311000.0,11.0
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,"[math.CA, math.FA]",,In this paper we show how to compute the $\L...,"[Abu-Shammala Wael, Torchinsky Alberto]",1175537000.0,


In [1]:
import torch
from torch_geometric.data import HeteroData
data = HeteroData()

data['paper'].num_nodes = 300
data['author'].num_nodes = 200
data['paper', 'written_by', 'author'].edge_index = torch.tensor([
    [1,2,2],
    [100,102,105]
]) 

1,100
2,102
data['author'].x = torch.tensor([
    [3,4,1],
    [1,1,1],
    [0,0,0]
])


data['paper', 'written_by', 'author'].edge_attr = torch.tensor([
    [3,4,1],
    [1,1,1],
    [0,0,0]
])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data

HeteroData(
  paper={ num_nodes=300 },
  author={
    num_nodes=200,
    x=[3, 3],
  },
  (paper, written_by, author)={
    edge_index=[2, 3],
    edge_attr=[3, 3],
  }
)

In [3]:
import torch_geometric.transforms as T

transform = T.Compose([
       T.RemoveIsolatedNodes(),
       T.ToUndirected(merge=False), # don't merge reversed edges into the original edge type
       T.RemoveDuplicatedEdges(),
])
data_before = data
data = transform(data)

In [4]:
data

HeteroData(
  paper={ num_nodes=2 },
  author={
    num_nodes=3,
    x=[3, 3],
  },
  (paper, written_by, author)={
    edge_index=[2, 3],
    edge_attr=[3, 3],
  },
  (author, rev_written_by, paper)={
    edge_index=[2, 3],
    edge_attr=[3, 3],
  }
)

In [50]:
data['paper', 'written_by', 'author'].edge_index

tensor([[0, 1, 1],
        [0, 1, 2]])

In [1]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE
