In [2]:
import time
import urllib2
import datetime
from itertools import ifilter
from collections import Counter, defaultdict
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
#import bibtexparser

# Ask pandas to emit a warning (instead of raising or staying silent) when it
# detects chained assignment on a DataFrame.
pd.set_option('mode.chained_assignment','warn')

In [3]:
%matplotlib inline

In [4]:
# XML namespace prefixes in ElementTree's "{uri}tag" notation. harvest()
# prepends them to tag names when looking up elements of the OAI envelope
# (ListRecords, record, metadata, ...) and of the arXiv metadata payload
# (title, abstract, doi, ...).
OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"

def harvest(arxiv="physics:hep-ex"):
    """Harvest arXiv metadata for one OAI set into a DataFrame.

    Sends a ``ListRecords`` request to export.arxiv.org with the fixed
    window ``from=2010-01-01`` .. ``until=2014-12-31`` and
    ``metadataPrefix=arXiv``, then keeps following ``resumptionToken``
    pages until the server no longer returns a token.  An HTTP 503 is
    answered by sleeping for the number of seconds given in the
    ``Retry-After`` header (30 if the header is absent) and retrying the
    same URL; any other HTTP error is re-raised.

    Parameters
    ----------
    arxiv : str
        Value of the OAI ``set`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per harvested record with the columns ``title``,
        ``abstract`` (stripped), ``categories`` (list of str), ``created``
        (datetime), ``id`` and ``doi`` (first DOI listed, or None when the
        record has none).  Empty if the server reports ``noRecordsMatch``.

    Raises
    ------
    urllib2.HTTPError
        For any HTTP error status other than 503.
    ValueError
        If a response has no ``ListRecords`` element for a reason other
        than ``noRecordsMatch``.
    """
    columns = ["title", "abstract", "categories", "created", "id", "doi"]
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from=2010-01-01&until=2014-12-31&" +
           "metadataPrefix=arXiv&set=%s"%arxiv)

    # Rows are gathered in a plain list and converted to a DataFrame once at
    # the end: appending to a DataFrame row by row copies the whole frame on
    # every call, which is quadratic in the number of records.
    records = []

    while True:
        print("fetching %s" % url)
        try:
            response = urllib2.urlopen(url)

        except urllib2.HTTPError as e:
            if e.code == 503:
                # the server asks us to back off; honour its Retry-After
                to = int(e.hdrs.get("retry-after", 30))
                print("Got 503. Retrying after {0:d} seconds.".format(to))

                time.sleep(to)
                continue

            else:
                raise

        try:
            xml = response.read()
        finally:
            # release the connection even if reading the body fails
            response.close()

        root = ET.fromstring(xml)

        list_records = root.find(OAI+'ListRecords')
        if list_records is None:
            # Without <ListRecords> the reply should carry an OAI <error>.
            # "noRecordsMatch" simply means an empty result; anything else
            # is reported instead of failing with an opaque AttributeError.
            error = root.find(OAI+'error')
            if error is not None and error.get('code') == 'noRecordsMatch':
                break
            if error is None:
                raise ValueError("OAI response contains no ListRecords element")
            raise ValueError("OAI error %s: %s" % (error.get('code'), error.text))

        for record in list_records.findall(OAI+"record"):
            meta = record.find(OAI+'metadata')
            if meta is None:
                # nothing to extract from a record that carries no metadata
                # (presumably a deleted record -- TODO confirm)
                continue

            info = meta.find(ARXIV+"arXiv")
            created = info.find(ARXIV+"created").text
            created = datetime.datetime.strptime(created, "%Y-%m-%d")
            categories = info.find(ARXIV+"categories").text

            # if there is more than one DOI use the first one
            # often the second one (if it exists at all) refers
            # to an erratum or similar
            doi = info.find(ARXIV+"doi")
            if doi is not None:
                doi = doi.text.split()[0]

            records.append({'title': info.find(ARXIV+"title").text,
                            'id': info.find(ARXIV+"id").text,
                            'abstract': info.find(ARXIV+"abstract").text.strip(),
                            'created': created,
                            'categories': categories.split(),
                            'doi': doi,
                            })

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = list_records.find(OAI+"resumptionToken")
        if token is None or token.text is None:
            break

        else:
            url = base_url + "resumptionToken=%s"%(token.text)

    return pd.DataFrame(records, columns=columns)
    

In [5]:
# Download the metadata for the default set ("physics:hep-ex"). harvest()
# prints every URL it requests, one per page of results.
df = harvest()

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:hep-ex
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|1001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|2001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|3001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|4001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|5001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|6001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|7001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|8001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|9001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2398479|10001
fetching http://export.arxiv.or

In [6]:
# Preview the first rows.
# NOTE(review): the `created` dates shown start in 2007 although harvest()
# requests from=2010-01-01 -- presumably the OAI from/until window is matched
# against a record datestamp other than `created`; confirm before relying on
# the date range.
df.head()

Unnamed: 0,title,abstract,categories,created,id,doi
0,Measurement of B(D_S^+ --> ell^+ nu) and the D...,We examine e+e- --> Ds- Ds*+ and Ds*- Ds+ inte...,"[hep-ex, hep-lat, hep-ph]",2007-04-03,704.0437,10.1103/PhysRevD.76.072002
1,A unified analysis of the reactor neutrino pro...,We present in this article a detailed quantita...,[hep-ex],2007-04-04,704.0498,10.1088/1742-6596/110/8/082013
2,Measurement of Decay Amplitudes of B -->(c cba...,We perform the first three-dimensional measure...,[hep-ex],2007-04-04,704.0522,10.1103/PhysRevD.76.031102
3,Measurement of the Decay Constant $f_D{_S^+}$ ...,We measure the decay constant fDs using the Ds...,"[hep-ex, hep-lat, hep-ph]",2007-04-04,704.0629,10.1103/PhysRevLett.99.071802
4,"The $e^+ e^-\to K^+ K^- \pi^+\pi^-$, $K^+ K^- ...",We study the processes $e^+ e^-\to K^+ K^- \pi...,[hep-ex],2007-04-04,704.063,10.1103/PhysRevD.76.012008


In [9]:
# Column dtypes and non-null counts; the `doi` count shows how many records
# come without a DOI.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12399 entries, 0 to 12398
Data columns (total 6 columns):
title         12399 non-null object
abstract      12399 non-null object
categories    12399 non-null object
created       12399 non-null datetime64[ns]
id            12399 non-null object
doi           8263 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 581.3+ KB


## LDA examples
https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [10]:
# Five toy documents for the LDA walkthrough, kept in order a..e.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# individual names for the sample documents (doc_a is used on its own below)
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [11]:
from nltk.tokenize import RegexpTokenizer
# r'\w+' keeps runs of word characters as tokens, so punctuation and
# whitespace are dropped and act as separators.
tokenizer = RegexpTokenizer(r'\w+')

In [13]:
# Lower-case the first sample document and split it into word tokens.
raw = doc_a.lower()
tokens = tokenizer.tokenize(raw)

print(tokens)

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']


In [15]:
from stop_words import get_stop_words

# English stop-word list; the cells below use it to filter token lists
en_stop = get_stop_words('en')

In [16]:
# keep only the tokens that are not English stop words
stopped_tokens = [i for i in tokens if i not in en_stop]

# show what is left of the first document
print(stopped_tokens)

['brocolli', 'good', 'eat', 'brother', 'likes', 'eat', 'good', 'brocolli', 'mother']


In [17]:
from nltk.stem.porter import PorterStemmer

# One PorterStemmer instance, reused for every token stemmed below
p_stemmer = PorterStemmer()

In [20]:
# reduce each remaining token to its Porter stem (e.g. 'likes' -> 'like')
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
# NOTE(review): here texts is a flat list of tokens for doc_a only; the loop
# over doc_set further down rebinds it to one token list per document.
texts = stemmed_tokens
print(stemmed_tokens)

['brocolli', 'good', 'eat', 'brother', u'like', 'eat', 'good', 'brocolli', 'mother']


In [22]:
# Pre-process every sample document the same way as doc_a above:
# lower-case -> tokenize -> drop stop words -> stem.
# The stop words go into a set so that each membership test is a hash lookup
# rather than a scan through the en_stop list.
stop_set = set(en_stop)

texts = []
for d in doc_set:
    raw = d.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if i not in stop_set]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # one list of stemmed tokens per document
    texts.append(stemmed_tokens)

In [27]:
from gensim import corpora, models

# Build the token <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)

In [28]:
# Bag-of-words form: each document becomes a list of (token id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]

In [29]:
# Inspect the bag-of-words vector of the first document.
print(corpus[0])

[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)]


In [31]:
# Fit a 3-topic LDA model on the toy corpus, making 20 passes over it.
# LDA training is randomised; random_state pins the seed so that re-running
# the notebook reproduces the same topics.
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary,
                                    passes=20, random_state=0)

In [32]:
# Show the 3 highest-weighted terms of each of the 3 topics.
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, u'0.125*"health" + 0.050*"increas" + 0.050*"blood"'), (1, u'0.059*"drive" + 0.059*"pressur" + 0.059*"seem"'), (2, u'0.082*"good" + 0.082*"brocolli" + 0.081*"brother"')]


In [33]:
# Refit on the same toy corpus with only 2 topics (replaces the 3-topic model).
# random_state pins the seed of the otherwise randomised training so the
# topics are reproducible across runs.
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary,
                                    passes=20, random_state=0)

In [34]:
# Show the 4 highest-weighted terms of each of the 2 topics.
print(ldamodel.print_topics(num_topics=2, num_words=4))

[(0, u'0.087*"good" + 0.087*"brocolli" + 0.063*"mother" + 0.063*"brother"'), (1, u'0.067*"drive" + 0.066*"pressur" + 0.039*"school" + 0.039*"seem"')]


## Back to the arXiv DataFrame

In [36]:
# Pull the abstract column out of the harvested frame as a plain Python list.
abstracts = df['abstract'].tolist()

In [37]:
# Pre-process every abstract: lower-case -> tokenize -> drop stop words -> stem.
# The stop-word test runs once per token over the whole collection, so the
# stop words go into a set (hash lookup) instead of scanning the en_stop list
# each time.
stop_set = set(en_stop)

texts = []
for d in abstracts:
    raw = d.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if i not in stop_set]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # one list of stemmed tokens per abstract
    texts.append(stemmed_tokens)

In [38]:
# Rebuild the token <-> id mapping, this time from the tokenized abstracts
# (replaces the dictionary of the toy example).
dictionary = corpora.Dictionary(texts)

In [39]:
# Bag-of-words vectors for the abstracts (replaces the toy corpus).
corpus = [dictionary.doc2bow(text) for text in texts]

In [42]:
# Fit a 20-topic LDA model on the abstracts, making 20 passes over the corpus.
# Training is randomised; random_state pins the seed so the topics can be
# reproduced on a fresh run.
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary,
                                    passes=20, random_state=0)

In [41]:
# Show the 5 highest-weighted terms of each topic.
# NOTE(review): this cell's execution count (41) is lower than that of the
# cell fitting the 20-topic model (42), so the output below may stem from an
# earlier fit -- re-run the notebook top to bottom to confirm.
print(ldamodel.print_topics(num_topics=20, num_words=5))

[(0, u'0.018*"model" + 0.014*"mass" + 0.012*"can" + 0.010*"neutrino" + 0.008*"effect"'), (1, u'0.024*"neutrino" + 0.016*"detector" + 0.016*"experi" + 0.012*"energi" + 0.010*"measur"'), (2, u'0.031*"decay" + 0.030*"b" + 0.015*"d" + 0.014*"pi" + 0.013*"cp"'), (3, u'0.026*"2" + 0.014*"q" + 0.014*"gev" + 0.013*"p" + 0.013*"data"'), (4, u'0.024*"mass" + 0.021*"1" + 0.021*"boson" + 0.017*"model" + 0.016*"gev"'), (5, u'0.024*"measur" + 0.018*"jet" + 0.017*"product" + 0.017*"collis" + 0.015*"cross"'), (6, u'0.085*"0" + 0.027*"pi" + 0.024*"1" + 0.023*"2" + 0.023*"b"')]
