In [1]:
from bs4 import BeautifulSoup
import requests
import re

# Classes and functions

In [2]:
# Class paper

class Paper(object):
    """
    Class for arxiv papers. 
    """
    def __init__(self, identifier, title, author_list, abstract, replacement = False, print_abstract = False, max_authors = 3):
        self.identifier = identifier
        self.title = title
        self.author_list = author_list
        self.abstract = abstract
        self.replacement = replacement
        self.print_abstract = print_abstract
        self.max_authors = max_authors
        
    def GetTitle(self):
        return self.title
    
    def GetAuthors(self):
        return selft.author_list
    
    def GetAbstract(self):
        return self.abstract
    
    def GetUrl(self):
        return "https://arxiv.org/abs/"+str(self.identifier)
    
    def IsReplacement(self):
        return self.replacement
    
    def __str__(self):
        paper_str = str(self.title) + "\n"
        ind_max = min(len(self.author_list), self.max_authors)
        for ind in range(ind_max):
            paper_str += self.author_list[ind]
            if ind < (ind_max - 1):
                paper_str += ", "
        if len(self.author_list)> self.max_authors:
            paper_str += " et al."
        paper_str += "\n"+self.GetUrl()
        if self.print_abstract: 
            paper_str += "\n" + self.abstract[:500]
        return paper_str

In [5]:
# Functions

def SearchNewPapers(urls, Keywords_vect, print_abstract = False, print_replacement = False):
    for url in urls:
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html5lib')

        # Building the lists of numbers and paper identifiers
        all_dt = soup('dt') # numbers and identifiers are contained in dt sections

        all_numbers = [a for dt in all_dt for a in dt('a') if a.has_attr('name')]
        all_numbers_list = [a.text.split()[0] for a in all_numbers]

        all_identifiers = [a for dt in all_dt for a in dt('a') if a.has_attr('href')]
        all_identifiers_list_tmp = [identifier.text.split()[0] for identifier in all_identifiers]

        regex=r"^arXiv"
        all_identifiers_list = [identifier.replace("arXiv:","") for identifier in all_identifiers_list_tmp if re.match(regex,identifier)]

        # Building the list of titles, authors, and abstracts
        all_dd = soup('dd') # titles, authors, and abstracts are contained in dd sections

        all_titles = [div for dd in all_dd for div in dd('div','list-title mathjax')]
        all_authors = [div for dd in all_dd for div in dd('div','list-authors')]
        all_abstracts = [p for dd in all_dd for p in dd('p','mathjax')]

        all_titles_list = [title.text.lower().replace("\n","").replace("title: ","").replace("  "," ") for title in all_titles]
        all_authors_list = [authors.text.lower().replace("\n","").replace("authors: ","").replace("  "," ").split(", ") for authors in all_authors]
        all_abstracts_list = [abstract.text.lower().replace("\n","").replace("  "," ") for abstract in all_abstracts]

        # Search for keywords in titles, authors, and abstract

        interesting_papers = set()

        for keyword in Keywords_vect:
            #print("I'm looking for papers containing: ", keyword)

            interesting_titles = set([index for index, title in enumerate(all_titles_list) if keyword in title])
            interesting_authors = set([index for index, author in enumerate(all_authors_list) if keyword in author])
            interesting_abstract = set([index for index, abstract in enumerate(all_abstracts_list) if keyword in abstract])

            interesting_papers = interesting_papers.union(interesting_titles, interesting_authors,interesting_abstract)

        interesting_papers_list = sorted(interesting_papers)

        # Build output with link to the paper

        # Take care of replacements
        new_papers = len(all_abstracts_list)
        total_papers = len(all_identifiers_list)

        all_abstracts_list_full = all_abstracts_list
        for item in range(new_papers,total_papers):
            all_abstracts_list_full.append("This paper is a replacement.")

        # Print result

        print("Today in", url,"there are", new_papers, "new papers and", total_papers-new_papers+1, "replacements.\n\n")

        todays_papers = []
        for item in interesting_papers_list:
            if item <= new_papers:
                replacement = False
            else: 
                replacement = True
            paper = Paper(all_identifiers_list[item], all_titles_list[item], all_authors_list[item], all_abstracts_list[item], replacement, print_abstract)
            todays_papers.append(paper)

        new_interesting_papers = 0
        for paper in todays_papers:
            if not paper.IsReplacement():
                new_interesting_papers += 1

        print("There are",new_interesting_papers,"new interesting papers and", len(todays_papers)-new_interesting_papers,"replacements. \n")

        for ind, paper in enumerate(todays_papers):
            if not paper.IsReplacement():
                print(str(ind+1)+")")
                print(paper,"\n")


        print("\n************************************\n")

# Main program

In [6]:
#Import keywords from txt file
#Format: Insert keywords as new lines. Lines beginning with # will be ignored. Check if there are accidental white space at the end of each entry!

Keywords_vect=[]

with open('Keywords.txt') as file_for_reading:
    for line in file_for_reading:
        if not re.match('^#', line) and line.strip()!='':
            Keywords_vect.append(line.strip().lower())

print(Keywords_vect)

['superconductivity', 'machine learning', 'time crystal', 'nonequilibrium', 'non-equilibrium', 'non equilibrium', 'rydberg', 'gambetta', 'garrahan', 'foot', 'oreg', 'lesanovsky', 'weibin li']


In [7]:
urls  = ["https://arxiv.org/list/cond-mat/new","https://arxiv.org/list/quant-ph/new","https://arxiv.org/list/physics/new"]

print_abstract = False
print_replacement = False
SearchNewPapers(urls, Keywords_vect, print_abstract, print_replacement)

Today in https://arxiv.org/list/cond-mat/new there are 85 new papers and 33 replacements.


There are 12 new interesting papers and 2 replacements. 

1)
out-of-equilibrium simulations of a classical gas with bose-einstein statistics
marisel di pietro martínez, miguel hoyuelos
https://arxiv.org/abs/2006.06133 

2)
on-the-fly closed-loop autonomous materials discovery via bayesian active learning
a. gilad kusne, heshan yu, changming wu et al.
https://arxiv.org/abs/2006.06141 

3)
collective excitations and nonequilibrium phase transition in dissipative fermionic superfluids
kazuki yamamoto, masaya nakagawa, naoto tsuji et al.
https://arxiv.org/abs/2006.06169 

4)
observation of a strongly ferromagnetic spinor bose-einstein condensate
seungjung huh, kyungtae kim, kiryang kwon et al.
https://arxiv.org/abs/2006.06228 

5)
leading theories of the cuprate superconductivity: a critique
navinder singh
https://arxiv.org/abs/2006.06335 

6)
magnetic monopoles and superinsulation in josephson junc