In [235]:
import spacy
from spacy import displacy
import json
from collections import defaultdict
from whoosh.index import create_in
from whoosh.analysis import StandardAnalyzer
from whoosh.writing import AsyncWriter
from whoosh.fields import *
from whoosh.qparser import QueryParser
import os.path
import re

### Load Model and Dataset

In [12]:
# Path of the arXiv metadata snapshot; the loading cells below open it and
# parse it one JSON document per line.
ARXIV = "arxiv-metadata-oai-snapshot.json"
# NOTE(review): `TEXT` is also the name of the Whoosh field type pulled in by
# `from whoosh.fields import *`; assigning it here rebinds that name for every
# cell that runs afterwards. This path is not read anywhere in the visible
# notebook -- consider renaming it (e.g. TEXT_PATH) or removing it.
TEXT = 'text.json'

In [50]:
# spaCy English pipeline; the cells below rely on its sentence boundaries,
# dependency labels and noun chunks.
nlp = spacy.load("en_core_web_md")
# Each line of the snapshot is parsed with json.loads later on. Open it as
# UTF-8 explicitly so decoding does not depend on the platform's locale
# default (JSON text is UTF-8 by specification).
arxiv = open(ARXIV, 'r', encoding='utf-8')
# Record count used only as the denominator of the loading progress read-out.
# NOTE(review): the recorded run ended at 105.70%, so the snapshot on disk
# holds more records than this figure -- update it if an exact bar matters.
ARCSIZE = 1796911

In [17]:
# Parse the snapshot into `arxivList`, one dict per non-empty line.
PROGRESS_EVERY = 10000  # refresh the progress read-out once per this many records

arxivList = []
cnt = 0
# `with` closes the handle opened in the previous cell as soon as it has been
# consumed, even if a line fails to parse. Re-running this cell on its own then
# raises ValueError (closed file) instead of silently producing an empty list
# from an already-exhausted handle.
with arxiv:
    for articles in arxiv:
        if not articles.strip():
            continue  # tolerate blank lines, e.g. a trailing newline
        cnt += 1
        arxivDict = json.loads(articles)
        arxivList.append(arxivDict)
        # Printing on every one of ~1.8M lines dominates the run time in a
        # notebook, so only refresh the read-out periodically.
        if cnt % PROGRESS_EVERY == 0:
            print("\r {0:2.2f}%".format(cnt/ARCSIZE*100), end='')
# Final read-out so the last partial batch is reflected as well.
print("\r {0:2.2f}%".format(cnt/ARCSIZE*100), end='')

 105.70%

In [290]:
# Build (subject phrase, highlighted sentence) pairs from the first
# CORPUS_SIZE abstracts. Every subject noun chunk yields its own pair, with
# only that chunk wrapped in <mark>...</mark> inside the sentence text.
CORPUS_SIZE = 10000

# Detokenising patterns, compiled once instead of on every noun chunk.
# Removes the space in front of any character that is not a word character,
# '|', '(' or '<' (inside a character class '|' is a literal, not alternation).
SPACE_BEFORE_PUNCT = re.compile(r' (?=[^\w|\(|<])')
# Removes spaces that directly follow '[', '(' or '-'.
SPACE_AFTER_OPENER = re.compile(r'(?<=\[|\(|-) +')

# Slice up front so the progress denominator is right even when fewer than
# CORPUS_SIZE articles were loaded.
corpus = arxivList[:CORPUS_SIZE]
cnt = 0
subjPair = []
for article in corpus:
    cnt += 1
    # The original `.replace(r'\\'and'$', '')` evaluated `r'\\' and '$'` to
    # just '$', so the two-backslash sequence was never stripped. Remove both
    # explicitly.
    abstract = (
        article['abstract']
        .replace("\n", ' ')
        .replace(r'\\', '')
        .replace('$', '')
        .strip()
    )
    doc = nlp(abstract)
    for sents in doc.sents:
        # Token indices are document-wide; shift them to sentence-local ones.
        offset = sents[0].i
        sentsTextListBK = [token.text for token in sents]
        for chunk in sents.noun_chunks:
            if "subj" not in chunk.root.dep_:
                continue
            # Work on a copy so each pair highlights exactly one chunk.
            sentsTextList = sentsTextListBK.copy()
            sth = chunk[0].i - offset
            eth = chunk[-1].i - offset
            sentsTextList[sth] = "<mark>" + sentsTextList[sth]
            sentsTextList[eth] = sentsTextList[eth] + "</mark>"
            subjHLText = " ".join(sentsTextList)
            subjHLText = SPACE_BEFORE_PUNCT.sub('', subjHLText)
            subjHLText = SPACE_AFTER_OPENER.sub('', subjHLText)
            subjPair.append((chunk.text, subjHLText))

    print("\r {0:2.2f}%".format(cnt/len(corpus)*100), end='')

 100.00%

In [291]:
# Build the Whoosh index: one document per (subject phrase, highlighted
# sentence) pair, with both fields stored so hits can be printed directly.

# Import the field type under an alias: an earlier cell assigns a plain string
# to the name `TEXT`, which hides the star-imported whoosh.fields.TEXT when the
# notebook is run top to bottom and makes `TEXT(stored=True)` raise TypeError.
from whoosh.fields import TEXT as TextField

schema = Schema(subj=TextField(stored=True), content=TextField(stored=True))
os.makedirs("myindex", exist_ok=True)
ix = create_in("myindex", schema)
cnt = 0
pairNum = len(subjPair)
# As a context manager the writer commits on a clean exit and cancels on an
# exception, so a failure mid-way does not leave the index write-locked.
with ix.writer() as writer:
    for (subjText, sentText) in subjPair:
        cnt += 1
        writer.add_document(subj=subjText, content=sentText)
        print("\r {0:2.2f}%".format(cnt/pairNum*100), end='')

 100.00%

### Local Test

In [54]:
def searchArxiv(key_words, ix=ix, limit=20):
    """Search the `subj` field of an index and print the stored sentences.

    Args:
        key_words: Query string, parsed with a QueryParser bound to the
            "subj" field of ``ix.schema``.
        ix: Index to search. The default is the module-level ``ix`` as it was
            when this cell was executed; re-run this cell after rebuilding the
            index, or pass the new index explicitly.
        limit: Maximum number of hits to retrieve and print (default 20, the
            value that used to be hard-coded).

    Returns:
        ``len(results)`` as reported by the search results object.
    """
    with ix.searcher() as searcher:
        query = QueryParser("subj", ix.schema).parse(key_words)
        results = searcher.search(query, limit=limit)
        # Iterating an empty result set is a no-op, so no emptiness check is needed.
        for result in results:
            print(result['content'])
        # Must be read while the searcher is still open.
        return len(results)

In [264]:
# Sample abstract for the local highlighting check below. The pieces are raw
# strings because the text contains LaTeX commands (\Xi, \Omega): in a normal
# string literal those are invalid escape sequences that trigger a warning.
# The resulting value is unchanged.
para = (
    r"In this work, we evaluate the lifetimes of the doubly (charmed) baryons \Xi_{cc}^{+}, \Xi_{cc}^{++} and \Omega_{cc}^{+}. "
    r"We carefully calculate the non-spectator contributions at the quark level where the Cabibbo-suppressed diagrams are also included. "
    r"The hadronic matrix elements are evaluated in the simple non-relativistic harmonic oscillator model. "
    r"Our numerical results are generally consistent with that obtained by other authors who used the diquark model. "
    r"However, all the theoretical predictions on the lifetimes are one order larger than the upper limit set by the recent SELEX measurement. "
    r"This discrepancy would be clarified by the future experiment, if more accurate experiment still confirms the value of the SELEX collaboration, there must be some unknown mechanism to be explored."
)

In [289]:
# Local check: print every sentence of `para` with all of its subject noun
# chunks wrapped in <mark>...</mark>.

# The original `.replace(r'\\'and'$', '')` evaluated `r'\\' and '$'` to just
# '$', so the two-backslash sequence was never stripped. Remove both
# explicitly.
para = para.replace(r'\\', '').replace('$', '')
doc = nlp(para)
for sents in doc.sents:
    # Token indices are document-wide; shift them to sentence-local ones.
    off = sents[0].i
    sentsTextList = [token.text for token in sents]
    for chunk in sents.noun_chunks:
        if "subj" not in chunk.root.dep_:
            continue
        sth = chunk[0].i - off
        eth = chunk[-1].i - off
        # Marks accumulate in the same list, so one printed sentence can carry
        # several highlighted subjects.
        sentsTextList[sth] = "<mark>" + sentsTextList[sth]
        sentsTextList[eth] = sentsTextList[eth] + "</mark>"
    subjHLText = " ".join(sentsTextList)
    # Detokenise: drop the space before characters other than word
    # characters, '|', '(' and '<', then spaces following '[', '(' or '-'.
    subjHLText = re.sub(r' (?=[^\w|\(|<])', '', subjHLText)
    subjHLText = re.sub(r'(?<=\[|\(|-) +', '', subjHLText)
    print(subjHLText)

In this work, <mark>we</mark> evaluate the lifetimes of the doubly (charmed) baryons\Xi_{cc}^{+},\Xi_{cc}^{++} and\Omega_{cc}^{+}.
<mark>We</mark> carefully calculate the non-spectator contributions at the quark level where <mark>the Cabibbo-suppressed diagrams</mark> are also included.
<mark>The hadronic matrix elements</mark> are evaluated in the simple non-relativistic harmonic oscillator model.
<mark>Our numerical results</mark> are generally consistent with that obtained by other authors <mark>who</mark> used the diquark model.
However, <mark>all the theoretical predictions</mark> on the lifetimes are one order larger than the upper limit set by the recent SELEX measurement.
<mark>This discrepancy</mark> would be clarified by the future experiment, if <mark>more accurate experiment</mark> still confirms the value of the SELEX collaboration, there must be some unknown mechanism to be explored.
