## Function

In [9]:
import logging
import requests 
from urllib.parse import urlunsplit, urlsplit
from bs4 import BeautifulSoup

# Configure the root logger with the default handler so records are printed.
logging.basicConfig()
# Logger shared by the helper classes defined below.
logger = logging.getLogger('PDFs')
logger.setLevel(logging.DEBUG)
# Browser-like User-Agent used for the requests sessions created below.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}


class pdfDownload(object):
    """Download PDFs from a direct URL or through a Sci-Hub mirror.

    Every request goes through one ``requests.Session`` so the User-Agent
    and any proxy configured with ``set_proxy`` apply to all calls.
    """

    def __init__(self):
        self.sess = requests.Session()
        # Merge into the session's own header mapping instead of replacing
        # it: assigning HEADERS directly would share one mutable dict
        # between all instances and drop the session's default headers.
        self.sess.headers.update(HEADERS)

    def set_proxy(self, proxy=None):
        """Route the session's HTTP and HTTPS traffic through a proxy.

        Args:
            proxy (str): The proxy address, e.g. 127.0.0.1:1123. A falsy
                value leaves the current proxy settings untouched.

        Returns:
            None
        """
        if proxy:
            self.sess.proxies = {
                "http": proxy,
                "https": proxy,
            }

    def _get_available_scihub_urls(self):
        """Collect Sci-Hub mirror links from https://lovescihub.wordpress.com/.

        Returns:
            list[str]: ``href`` values containing ``sci-hub.`` in page
            order; an empty list when the page has no ``entry-content`` div.
        """
        res = self.sess.get('https://lovescihub.wordpress.com/')
        soup = BeautifulSoup(res.content, 'html.parser')
        entry = soup.find('div', class_="entry-content")
        if entry is None:
            # Page layout changed or an error page was served.
            logger.info("No sci-hub mirror list found")
            return []
        return [
            a['href']
            for a in entry.find_all('a', href=True)
            if 'sci-hub.' in a['href']
        ]

    def fetch(self, url, auth=None):
        """Fetch a PDF.

        Args:
            url (str): Address expected to serve a PDF document.
            auth (tuple): Optional ("user", "passwd") pair passed to requests.

        Returns:
            A dict ``{'pdf': <bytes>, 'url': <url>}`` OR None when the
            request fails or the response is not a PDF.
        """
        try:
            r = self.sess.get(url, auth=auth)
        except requests.RequestException:
            logger.error("Failed to open url: {}".format(url))
            return None

        # Compare only the media type: servers may append parameters such as
        # "; charset=binary", and the header may be missing altogether.
        content_type = r.headers.get("Content-Type", "")
        if content_type.split(";")[0].strip().lower() != "application/pdf":
            logger.info("Failed to fetch pdf with url: {}".format(url))
            return None

        return {
            'pdf': r.content,
            'url': url,
        }

    def get_pdf_from_direct_url(self, url, auth=None):
        """Fetch a PDF from a URL that points straight at the document.

        Returns:
            A dict OR None (see ``fetch``).
        """
        return self.fetch(url, auth=auth)

    @staticmethod
    def _find_pdf_src(soup):
        """Return the ``src`` of the first iframe (else embed) tag, or None."""
        for tag_name in ('iframe', 'embed'):
            tag = soup.find(tag_name, src=True)
            if tag is not None:
                return tag['src']
        return None

    @staticmethod
    def _resolve_pdf_url(base_url, src):
        """Turn a viewer ``src`` into an absolute URL without query/fragment.

        A ``src`` with its own host keeps that host (and its scheme, with
        https as the default for scheme-relative links); a host-less ``src``
        is served from the mirror's host over https.
        """
        parts = urlsplit(src)
        if parts[1]:
            scheme, netloc = parts[0] or 'https', parts[1]
        else:
            scheme, netloc = 'https', urlsplit(base_url)[1]
        return urlunsplit((scheme, netloc, parts[2], '', ''))

    def get_pdf_from_sci_hub(self, identifier, auth=None):
        """Fetch a PDF from Sci-Hub based on a DOI or URL.

        Each known mirror is tried in turn until one of them yields a PDF.

        Args:
            identifier (str): DOI or url
            auth (tuple): ("user", "passwd")

        Returns:
            A dict OR None (see ``fetch``).
        """
        for base_url in self._get_available_scihub_urls():
            page_url = base_url.rstrip('/') + '/' + identifier
            try:
                r = self.sess.get(page_url, auth=auth)
            except requests.RequestException:
                # An unreachable mirror must not stop the search.
                logger.info("Failed to open url: {}".format(page_url))
                continue

            src = self._find_pdf_src(BeautifulSoup(r.content, 'html.parser'))
            if not src:
                continue

            result = self.fetch(self._resolve_pdf_url(base_url, src), auth)
            # Only stop on success; a mirror that served something other
            # than a PDF should not hide the remaining mirrors.
            if result is not None:
                return result

        logger.info("Failed to fetch pdf with all sci-hub urls")
        return None

    def _save(self, content, path):
        """Write the raw bytes ``content`` to the file at ``path``."""
        with open(path, "wb") as f:
            f.write(content)

            
class crossrefInfo(object):
    """Look up paper metadata through the Crossref REST API."""

    def __init__(self):
        self.sess = requests.Session()
        # Merge into the session's own header mapping rather than replacing
        # it with the shared module-level dict.
        self.sess.headers.update(HEADERS)
        self.base_url = "http://api.crossref.org/"

    def set_proxy(self, proxy=None):
        """Route the session's HTTP and HTTPS traffic through a proxy.

        Args:
            proxy (str): The proxy address, e.g. 127.0.0.1:1123. A falsy
                value leaves the current proxy settings untouched.

        Returns:
            None
        """
        if proxy:
            self.sess.proxies = {
                "http": proxy,
                "https": proxy,
            }

    @staticmethod
    def _publication_date(bib):
        """Return the first available date as 'Y-M-D' text, or 'No date'."""
        for key in ('published', 'published-print', 'published-online', 'issued'):
            date_parts = (bib.get(key) or {}).get('date-parts') or [[]]
            parts = [str(p) for p in (date_parts[0] or []) if p is not None]
            if parts:
                return '-'.join(parts)
        return "No date"

    def extract_json_info(self, bib):
        """Extract bib json information from requests.get().json()

        Args:
            bib (dict): one Crossref work record, i.e. the ``message`` of a
                single-work response or one entry of ``message['items']``.

        Returns:
            A dict with the keys ``title``, ``author``, ``journal``,
            ``year``, ``url``, ``pdf_link`` and ``cited_count``. Optional
            fields that are absent become "No title" / "No author" /
            "No journal" / "No date", and ``pdf_link`` becomes None.

        Raises:
            KeyError: if ``URL`` or ``is-referenced-by-count`` is missing.
        """
        # Only authors that carry BOTH name parts can be formatted; records
        # may also list organisations (a bare "name") or partial names.
        authors = ' and '.join(
            person["family"] + " " + person["given"]
            for person in bib.get('author') or []
            if "family" in person and "given" in person
        ) or "No author"

        # Prefer the abbreviated journal title, but fall back to the full
        # one when the abbreviated list is missing or empty.
        journal = "No journal"
        for key in ('short-container-title', 'container-title'):
            names = bib.get(key)
            if names:
                journal = names[0]
                break

        titles = bib.get('title')
        links = bib.get("link")

        return {
            "title": titles[0] if titles else "No title",
            "author": authors,
            "journal": journal,
            "year": self._publication_date(bib),
            "url": bib["URL"],
            "pdf_link": links[0]["URL"] if links else None,
            "cited_count": bib["is-referenced-by-count"],
        }

    def get_info_by_doi(self, doi):
        """Get the meta information by the given paper DOI number.

        Args:
            doi (str): The paper DOI number

        Returns:
            A dict containing the paper information.
            {
                "title": xxx,
                "author": xxx,
                "journal": xxx,
                etc
            }
            OR
            None when the request fails or the answer is not a work record.
        """
        url = "{}works/{}".format(self.base_url, doi)
        try:
            r = self.sess.get(url)
            # Unknown DOIs answer with an HTTP error and a non-JSON body.
            r.raise_for_status()
            bib = r.json()['message']
            return self.extract_json_info(bib)
        except (requests.RequestException, ValueError, LookupError, TypeError):
            logger.error("DOI: {} is error.".format(doi))
            return None

    def get_info_by_title(self, title):
        """Get the meta information by the given paper title.

        Args:
            title (str): The paper title

        Returns:
            A dict containing the paper information when a returned record
            matches the title exactly (case-insensitively).
            {
                "title": xxx,
                "author": xxx,
                "journal": xxx,
                etc
            }
            OR
            None when the lookup fails
            OR
            A list [{}, {}, {}] of all candidates when none matches exactly
        """
        url = self.base_url + "works"
        params = {"query.bibliographic": title, "rows": 20}
        try:
            r = self.sess.get(url, params=params)
            r.raise_for_status()
            items = r.json()["message"]["items"]

            wanted = title.lower()
            for item in items:
                titles = item.get('title')
                if titles and titles[0].lower() == wanted:
                    return self.extract_json_info(item)

            return [self.extract_json_info(it) for it in items]
        except (requests.RequestException, ValueError, LookupError,
                TypeError, AttributeError):
            logger.error("Title: {} is error.".format(title))
            return None


In [10]:
import re
# Markdown file whose DOI placeholders are extracted below
file_path = 'SST.md'


# Read the whole document into memory as UTF-8 text
with open(file_path, 'r', encoding='utf-8') as file:
    markdown_content = file.read()

# Helper that pulls the DOI numbers written as {{10.xxxx/...}} out of markdown text
def extract_doi(markdown_text):
    """Return every DOI wrapped in double curly braces, in order of appearance.

    Args:
        markdown_text (str): Text that may contain ``{{<doi>}}`` placeholders.

    Returns:
        list[str]: The DOIs without the surrounding braces; empty when the
        text contains no placeholder.
    """
    matcher = re.compile(r"\{\{(10\.\d{4,5}/[^\}]+)\}\}")
    return [hit.group(1) for hit in matcher.finditer(markdown_text)]

# Extract the DOI numbers from the markdown content
doi_numbers = extract_doi(markdown_content)

# Display the extracted DOIs as the notebook cell output
doi_numbers

['10.1038/s41586-019-1559-7',
 '10.1126/sciadv.adf2827',
 '10.1016/j.scib.2021.03.009',
 '10.1038/ncomms9657',
 '10.1038/s41558-021-01276-3']

1. 生成题录

In [5]:
def process_line(doi, proxy="127.0.0.1:7890", crossref_info=None):
    """Turn one markdown line holding a ``{{DOI}}`` placeholder into a citation.

    Args:
        doi (str): One line of the markdown file. Despite its name this is
            the raw line; the DOI is extracted from its ``{{...}}`` placeholder.
        proxy (str): Proxy address used when a Crossref client has to be
            created here.
        crossref_info (crossrefInfo): Optional client to reuse across calls;
            it is used as configured and ``proxy`` is then ignored.

    Returns:
        str: The formatted literature entry, or the input line without its
        trailing line break when it holds no DOI or the lookup gives no record.
    """
    found = extract_doi(doi)
    if not found:
        # Blank lines, headings and plain text pass through unchanged
        # instead of failing on an empty match list.
        return doi.rstrip('\r\n')

    if crossref_info is None:
        crossref_info = crossrefInfo()
        crossref_info.set_proxy(proxy=proxy)

    bib = crossref_info.get_info_by_doi(found[0])
    if not bib:
        return doi.rstrip('\r\n')

    # The local PDF is named after the last path segment of the record URL.
    pdf_path = 'pdfs/' + bib['url'].split('/')[-1] + '.pdf'
    first_author = bib["author"].split(" and ")[0]
    return "- **{}**. {} et.al. **{}**, **{}**, ([pdf]({}))([link]({})).".format(
        bib['title'], first_author, bib['journal'],
        bib['year'], pdf_path,
        bib['url'])

def extract_doi(markdown_text):
    """Collect the DOI numbers enclosed in ``{{...}}`` within the given text.

    Args:
        markdown_text (str): Text to scan.

    Returns:
        list[str]: Matched DOIs in order of appearance, braces removed.
    """
    # A DOI placeholder looks like {{10.<4-5 digits>/<suffix>}}
    placeholder = re.compile(r"\{\{(10\.\d{4,5}/[^\}]+)\}\}")
    return placeholder.findall(markdown_text)

In [7]:
# Rewrite every line of the markdown file into a formatted literature entry
import re
file_path = 'SST.md'
with open(file_path, 'r', encoding='utf-8') as file:
    # process_line is called once per line of the file
    updated_lines = [process_line(line) for line in file]
# Save the rewritten entries, one per line, to a new file
with open('SST1.md', 'w', encoding='utf-8') as file:
    for line in updated_lines:
        file.write(line + '\n')

2. 下载pdf

In [12]:
# No proxy call belongs here: pdfDownload.set_proxy("127.0.0.1:7890") invoked on
# the class binds the address to ``self`` and configures nothing. The proxy is
# set on the downloader instance where that instance is created.
doi_pattern = r"org\/([\w.\/-]+)"
title_pattern = r"\*\*(.*?)\*\*"


def _first_groups(pattern, entries):
    """Return group 1 of the first match of *pattern* for each matching entry."""
    # Search each entry once instead of once to filter and once to extract.
    matches = (re.search(pattern, entry) for entry in entries)
    return [m.group(1) for m in matches if m]


# Extract DOI numbers (from the doi.org link) and titles (first bold span)
doi_numbers = _first_groups(doi_pattern, updated_lines)
titles = _first_groups(title_pattern, updated_lines)

titles

['Deep learning for multi-year ENSO forecasts',
 'A self-attention–based neural network for three-dimensional multivariate modeling and its skillful ENSO predictions',
 'Unified deep learning model for El Niño/Southern Oscillation forecasts by incorporating seasonality in climate data',
 'Increasing water cycle extremes in California and in relation to ENSO cycle under global warming',
 'Enhanced risk of concurrent regional droughts with increased ENSO variability and warming']

In [13]:
# Display the DOIs recovered from the rewritten entries
doi_numbers

['10.1038/s41586-019-1559-7',
 '10.1126/sciadv.adf2827',
 '10.1016/j.scib.2021.03.009',
 '10.1038/ncomms9657',
 '10.1038/s41558-021-01276-3']

In [14]:
# One downloader (one HTTP session, one proxy setup) serves every DOI.
pdf_download = pdfDownload()
pdf_download.set_proxy("127.0.0.1:7890")

# Iterate over the DOIs themselves: doi_numbers is a filtered list and can be
# shorter than updated_lines, so indexing it by len(updated_lines) may overrun.
for i, doi in enumerate(doi_numbers):
    pdf = doi.split('/')[-1] + '.pdf'

    try:
        pdf_dict = pdf_download.get_pdf_from_sci_hub(doi)
    except Exception as exc:
        # Best effort: one failing DOI must not abort the remaining downloads,
        # but the reason is logged instead of being silently dropped.
        logger.error("Download failed for %s: %s", doi, exc)
        pdf_dict = None

    if not pdf_dict:
        print('error download' + str(i) + ': ' + pdf)
        continue

    print(pdf_dict['url'])
    try:
        pdf_download._save(pdf_dict['pdf'], 'pdfs/' + pdf)
    except OSError as exc:
        logger.error("Could not save %s: %s", pdf, exc)
        print('error download' + str(i) + ': ' + pdf)

https://sci.bban.top/pdf/10.1038/s41586-019-1559-7.pdf
error download1: sciadv.adf2827.pdf
https://sci.bban.top/pdf/10.1016/j.scib.2021.03.009.pdf
https://sci.bban.top/pdf/10.1038/ncomms9657.pdf
error download4: s41558-021-01276-3.pdf
