# arXiv_scraper_v1

by _Filippo M. Gambetta_

In [7]:
from bs4 import BeautifulSoup
import requests
import re

## Class Paper and search functions

In [8]:
# Class Paper

class Paper(object):
    """
    Class for arxiv papers.

    Attributes
    ----------
    identifier : arXiv identifier, used to build the abstract url.
    title : title of the paper.
    author_list : list of author names (strings).
    abstract : abstract text.
    replacement : True if the paper is a replacement of an earlier version.
    print_abstract : if True, str(paper) also contains the abstract.
    abstract_max_lenght : maximum number of abstract characters in str(paper).
    max_authors : maximum number of authors listed in str(paper) before
        " et al." is appended.
    """
    def __init__(self, identifier, title, author_list, abstract, replacement = False, print_abstract = False, abstract_max_lenght = 10000, max_authors = 3):
        self.identifier = identifier
        self.title = title
        self.author_list = author_list
        self.abstract = abstract
        self.replacement = replacement
        self.print_abstract = print_abstract
        self.abstract_max_lenght = abstract_max_lenght
        self.max_authors = max_authors

    def GetIdentifier(self):
        """Return the arXiv identifier."""
        return self.identifier

    def GetTitle(self):
        """Return the title."""
        return self.title

    def GetAuthors(self):
        """Return the list of authors."""
        return self.author_list

    def GetAbstract(self):
        """Return the full abstract."""
        return self.abstract

    def GetAbstractMaxLenght(self):
        """Return the maximum number of abstract characters that are printed."""
        return self.abstract_max_lenght

    def GetUrl(self):
        """Return the url of the abstract page on arxiv.org."""
        return "https://arxiv.org/abs/"+str(self.identifier)

    def IsReplacement(self):
        """Return True if the paper is a replacement."""
        return self.replacement

    def __str__(self):
        """
        Return title, authors (at most max_authors, followed by " et al." if
        some were left out) and url on separate lines, optionally followed by
        a blank line and the truncated abstract.
        """
        # Clamp at zero so that a negative max_authors shows no author at all
        shown = max(0, min(len(self.author_list), self.max_authors))
        authors_line = ", ".join(self.author_list[:shown])
        if len(self.author_list) > self.max_authors:
            authors_line += " et al."

        paper_str = str(self.title) + "\n" + authors_line + "\n" + self.GetUrl()
        if self.print_abstract:
            # Limit the length of the printed abstract
            paper_str += "\n\n" + self.abstract[:self.abstract_max_lenght]
        return paper_str

In [9]:
# Search functions

def SearchNewPapers(urls, Keywords_vect, print_abstract = False, print_replacement = False, repetition = False):
    """
    Search the list of keywords contained in Keywords_vect in titles,
    authors' lists, and abstracts of all papers from a list urls, and
    print the matching papers.

    Parameters
    ----------
    urls : list of listing-page urls; each one is downloaded and parsed.
    Keywords_vect : list of keywords. They are compared with lower-cased
        text, so they have to be lower case to match.
    print_abstract : if True, the abstract is printed with each paper.
    print_replacement : if True, matching replacements are printed as well;
        otherwise only new papers are printed.
    repetition : if False, a paper already reported for a previous url is
        not printed again for the following urls.

    Returns
    -------
    None. All results are written to standard output.
    """

    all_interesting_papers_ids = set()

    for url in urls:
        # The timeout keeps an unresponsive server from blocking the run forever
        html = requests.get(url, timeout=30).text
        soup = BeautifulSoup(html, 'html5lib')

        # Get the date; fall back to a placeholder if the dateline is missing
        # or does not have the expected "... announced <date>" form
        dateline = soup.find("div", {"class": "list-dateline"})
        dateline_parts = dateline.text.split("announced ") if dateline is not None else []
        announcement_date = dateline_parts[1] if len(dateline_parts) > 1 else "unknown date"

        # Building the lists of paper identifiers
        all_dt = soup('dt') # numbers and identifiers are contained in dt sections

        all_identifiers = [a for dt in all_dt for a in dt('a') if a.has_attr('href')]

        all_identifiers_list = []
        for link in all_identifiers:
            tokens = link.text.split()
            # Links without text are skipped; only "arXiv:..." links are identifiers
            if tokens and tokens[0].startswith("arXiv"):
                all_identifiers_list.append(tokens[0].replace("arXiv:",""))

        # Building the list of titles, authors, and abstracts
        all_dd = soup('dd') # titles, authors, and abstracts are contained in dd sections

        all_titles = [div for dd in all_dd for div in dd('div','list-title mathjax')]
        all_authors = [div for dd in all_dd for div in dd('div','list-authors')]
        all_abstracts = [p for dd in all_dd for p in dd('p','mathjax')]

        all_titles_list = [title.text.replace("\n","").replace("Title: ","").replace("  "," ") for title in all_titles]
        all_authors_list = [authors.text.replace("\n","").replace("Authors: ","").replace("  "," ").split(", ") for authors in all_authors]
        all_abstracts_list = [abstract.text.replace("\n"," ").replace("  "," ") for abstract in all_abstracts]

        # Lower-case copies, used only for the keyword search
        all_titles_list_lower = [title.text.lower().replace("\n","").replace("title: ","").replace("  "," ") for title in all_titles]
        all_authors_list_lower = [authors.text.lower().replace("\n","").replace("authors: ","").replace("  "," ").split(", ") for authors in all_authors]
        all_abstracts_list_lower = [abstract.text.lower().replace("\n"," ").replace("  "," ") for abstract in all_abstracts]

        # Search for keywords in titles, authors' lists, and abstracts

        interesting_papers = set()

        for keyword in Keywords_vect:
            interesting_papers.update(index for index, title in enumerate(all_titles_list_lower) if keyword in title)
            # Each entry is a list of names, so a keyword matches an author only
            # if it is equal to the full (lower-case) name.
            # NOTE(review): titles and abstracts use substring matching - confirm
            # that exact matching of author names is intended.
            interesting_papers.update(index for index, authors in enumerate(all_authors_list_lower) if keyword in authors)
            interesting_papers.update(index for index, abstract in enumerate(all_abstracts_list_lower) if keyword in abstract)

        interesting_papers_list = sorted(interesting_papers)

        # Building the output with link to the papers

        # Taking care of replacements: entries without an abstract are counted
        # as replacements
        new_papers = len(all_abstracts_list)
        total_papers = len(all_identifiers_list)

        # Pad the abstracts so that every identifier has an entry
        all_abstracts_list.extend(["This paper is a replacement."] * (total_papers - new_papers))

        # Printing results

        print("Today ({})".format(announcement_date),"in", url,"there are", new_papers, "new papers and", total_papers-new_papers, "replacements.\n\n")

        todays_papers = [
            Paper(all_identifiers_list[item], all_titles_list[item], all_authors_list[item], all_abstracts_list[item], item >= new_papers, print_abstract)
            for item in interesting_papers_list
        ]

        new_interesting_papers = 0
        already_announced_papers = 0
        for paper in todays_papers:
            if paper.IsReplacement():
                continue
            if paper.GetIdentifier() not in all_interesting_papers_ids:
                new_interesting_papers += 1
            else:
                already_announced_papers += 1

        print("There are",new_interesting_papers,"new interesting papers and", len(todays_papers)-new_interesting_papers-already_announced_papers,"interesting replacements. \n")

        ind = 0
        for paper in todays_papers:
            if paper.IsReplacement() and not print_replacement:
                continue
            if paper.GetIdentifier() in all_interesting_papers_ids:
                continue # already announced for a previous url
            ind += 1
            print(str(ind)+")")
            print(paper,"\n")

        # Creating a set with all ids of already announced papers to avoid repetition
        if not repetition:
            for paper in todays_papers:
                all_interesting_papers_ids.add(paper.GetIdentifier())

        print("\n************************************\n")

## Main program

In [10]:
#Import keywords from txt file
#Format: Insert keywords as new lines. 
#Lines beginning with # will be ignored. 
#White space at the beginning and at the end of each entry is stripped when the file is read.

Keywords_vect = []

# Keep every non-blank line that does not begin with '#', stripped of
# surrounding white space and converted to lower case
with open('Keywords.txt') as keywords_file:
    Keywords_vect.extend(
        raw_line.strip().lower()
        for raw_line in keywords_file
        if raw_line.strip() != '' and not raw_line.startswith('#')
    )

#print(Keywords_vect)

In [11]:
# Listing pages that are searched, in this order
urls = [
    "https://arxiv.org/list/cond-mat/new",
    "https://arxiv.org/list/quant-ph/new",
    "https://arxiv.org/list/physics.atom-ph/new",
]

print_abstract = True       # if True, the abstract is printed with each paper
print_replacement = False   # if True, interesting replacements are printed as well
repetition = False          # if True, papers already announced for a previous url will be announced again

SearchNewPapers(
    urls,
    Keywords_vect,
    print_abstract=print_abstract,
    print_replacement=print_replacement,
    repetition=repetition,
)

Today (Tue, 16 Jun 20) in https://arxiv.org/list/cond-mat/new there are 115 new papers and 74 replacements.


There are 29 new interesting papers and 17 interesting replacements. 

1)
Thermodynamic classification of three-dimensional Kitaev spin liquids
Tim Eschmann, Petr A. Mishchenko, Kevin O'Brien et al.
https://arxiv.org/abs/2006.07386

In the field of frustrated magnetism, Kitaev models provide a unique framework to study the phenomena of spin fractionalization and emergent lattice gauge theories in two and three spatial dimensions. Their ground states are quantum spin liquids, which can typically be described in terms of a Majorana band structure and an ordering of the underlying $\mathbb{Z}_2$ gauge structure. Here we provide a comprehensive classification of the "gauge physics" of a family of elementary three-dimensional Kitaev models, discussing how their thermodynamics and ground state order depends on the underlying lattice geometry. Using large-scale, sign-free quantum Mont

Today (Tue, 16 Jun 20) in https://arxiv.org/list/quant-ph/new there are 51 new papers and 35 replacements.


There are 32 new interesting papers and 23 interesting replacements. 

1)
Dirac's Classical-Quantum Analogy for the Harmonic Oscillator: Classical Aspects in Thermal Radiation Including Zero-Point Radiation
Timothy H. Boyer
https://arxiv.org/abs/2006.07468

Dirac's Poisson-bracket-to-commutator analogy for the transition from classical to quantum mechanics assures that for many systems, the classical and quantum systems share the same algebraic structure. The quantum side of the analogy (involving operators on Hilbert space with commutators scaled by Planck's constant $\hbar$) not only gives the algebraic structure but also dictates the average values of physical quantities in the quantum ground state. On the other hand, the Poisson brackets of nonrelativistic mechanics, which give only the classical canonical transformations, do not give any values for physical quantities. Rath

Today (Tue, 16 Jun 20) in https://arxiv.org/list/physics.atom-ph/new there are 10 new papers and 1 replacements.


There are 0 new interesting papers and 1 interesting replacements. 


************************************

