In [32]:
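# Download the PDFs of the given papers from Sci-hub. Supports batch downloads and records the titles of papers that failed to download in a log file.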
import requests
import re

# Base URL of the Sci-hub mirror; it is passed to main() below, which sends
# the paper search requests to it.
url = 'https://sci-hub.se/'

# Titles of the papers to download; each title is submitted to Sci-hub as a
# search request and is also used as the name of the saved PDF file.
papers = [
        "Aberrant Frontoparietal Function during Recognition Memory in Schizophrenia: A Multimodal Neuroimaging Investigation",
        "Eluding the illusion? Schizophrenia, dopamine and the McGurk effect",
        "Reduced multisensory integration in patients with schizophrenia on a target detection task",
        "Event-related potentials to auditory and visual selective attention in schizophrenia",
        "Response selection impairment in schizophrenia transcends sensory and motor modalities",
        "Disappearance of the unmasking effect of temporally pre-presented lipreading cues on speech recognition in people with chronic schizophrenia",
        "Activation and Functional Connectivity of the Left Inferior Temporal Gyrus during Visual Speech Priming in Healthy Listeners and Listeners with Schizophrenia",
        "Structural and functional brain abnormalities in drug-naive, first-episode, and chronic patients with schizophrenia: a multimodal MRI study",
        "Multisensory integration in schizophrenia: a behavioural and eventrelated potential study",
        "Multidimensional analysis of the abnormal neural oscillations associated with lexical processing in schizophrenia",
        "Audiovisual Temporal Processing in Children and Adolescents With Schizophrenia and Children and Adolescents With Autism: Evidence From Simultaneity-Judgment Tasks and Eye-Tracking Data",
        "Audiovisual temporal processing in adult patients with first-episode schizophrenia and high-functioning autism",
        "Neural Correlates of Audiovisual Temporal Binding Window in Individuals With Schizotypal and Autistic Traits: Evidence From Resting-State Functional Connectivity"
    
    ]


# HTTP request headers that imitate a browser visit so that Sci-hub does not
# reject the requests.
headers = {
    'referer': 'https://sci-hub.se/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
}


# Fetch the Sci-hub result page(s) for the requested paper(s); raises an exception if url is empty.
def get_html(url, papers = None, headers = None, timeout = 30):
    """Submit paper search requests to Sci-hub and return the response HTML.

    Args:
        url: Address the search form is POSTed to. Must be non-empty.
        papers: Either a single search string (title, DOI, ...) or a list of
            them. A list triggers batch mode.
        headers: Optional HTTP headers sent with every request.
        timeout: Seconds to wait for the server before giving up on a
            request. Without a timeout a stalled connection would block the
            whole run indefinitely.

    Returns:
        Batch mode: a list with one HTML string per entry of ``papers``, in
        the same order. An entry whose request failed is the empty string,
        so the remaining papers are still processed.
        Single mode: the HTML string of the one response.

    Raises:
        ValueError: If ``url`` is empty or None.
        requests.RequestException: In single mode, if the request fails.
    """
    if not url:
        raise ValueError("url is None, please check...")

    def _search(query):
        # Form fields expected by the Sci-hub search endpoint.
        form = {'sci-hub-plugin-check': '', 'request': query}
        response = requests.post(url, data = form, headers = headers, timeout = timeout)
        return response.text

    if not isinstance(papers, list):
        return _search(papers)

    pages = []
    for query in papers:
        try:
            pages.append(_search(query))
        except requests.RequestException:
            # Best effort in batch mode: one unreachable paper must not
            # discard the results of all the others. An empty page contains
            # no download link, so later steps treat it as a failed paper.
            pages.append("")
    return pages

# Find the download link(s) of the PDF file in the returned HTML.
def get_pdf_path(html, pattern = "location.href='(.*?pdf)"):
    """Extract PDF link candidates from Sci-hub HTML.

    Args:
        html: One HTML string, or a list of HTML strings.
        pattern: Regular expression applied with ``findall``; its capture
            group is the link text that gets returned.

    Returns:
        For a single string: the list of all matches found in it.
        For a list: a list holding one such match list per input string,
        in the same order.
    """
    find_links = re.compile(pattern).findall

    # Guard clause for the single-document case.
    if not isinstance(html, list):
        return find_links(html)

    return list(map(find_links, html))

def get_pdf(path, timeout = 60):
    """Download the PDF behind each link.

    Args:
        path: A single URL or a list of URLs. Entries that are None or empty
            (no link was found for that paper) are skipped without issuing a
            request.
        timeout: Seconds to wait for the server before a download counts as
            failed.

    Returns:
        A list with one entry per input link, in the same order: the raw PDF
        bytes on success, or None when there was no link, the request
        failed, the server answered with an HTTP error status, or the
        response body is not a PDF. A single (non-list) ``path`` yields a
        one-element list.
    """
    def _fetch(link):
        if not link:
            return None
        try:
            response = requests.get(link, timeout = timeout)
            response.raise_for_status()
        except Exception:
            # Deliberate best effort: any failure for one link is reported
            # as None so that the remaining downloads still run.
            return None
        content = response.content
        # A PDF carries the "%PDF" marker near its start; an HTML error or
        # captcha page delivered with status 200 does not, and must not be
        # reported as a successful download.
        if b"%PDF" not in content[:1024]:
            return None
        return content

    if isinstance(path, list):
        return [_fetch(link) for link in path]
    return [_fetch(path)]


def _absolute_pdf_url(base_url, link):
    """Turn a link scraped from a Sci-hub page into an absolute URL."""
    from urllib.parse import urljoin

    if link.startswith('//'):
        # Scheme-relative link: use https.
        return 'https:' + link
    # Already-absolute links are returned unchanged; root-relative links
    # are resolved against the site URL.
    return urljoin(base_url, link)


def _safe_filename(title):
    """Remove the characters that are not allowed in file names."""
    return re.sub(r'[\\/:?*"<>|]', '', title)


def main(url, papers = None, headers = None, pattern = "location.href='(.*?pdf)"):
    """Download the PDFs of ``papers`` from Sci-hub into the working directory.

    Every successfully downloaded paper is written to ``<title>.pdf`` (with
    characters that are invalid in file names removed). The titles of papers
    that could not be downloaded are written, one per line, to ``log.txt``,
    which is overwritten on every run.

    Args:
        url: Sci-hub address the search requests are sent to.
        papers: A single title or a list of titles. None means nothing to
            download.
        headers: Optional HTTP headers for the search requests.
        pattern: Regular expression whose capture group extracts the PDF
            link from the result page.
    """
    # Normalize to a list up front so that a single title goes through the
    # same batch pipeline. Previously a single title produced a flat list of
    # matches, and indexing each match with [0] yielded only its first
    # character instead of the link.
    if papers is None:
        papers = []
    elif not isinstance(papers, list):
        papers = [papers]

    print("===== Get html =====")
    html = get_html(url, papers, headers)

    print("===== Get pdf path =====")
    matches = get_pdf_path(html, pattern = pattern)
    # Only the first link found on each page is used; None marks "no link".
    pdf_path = [_absolute_pdf_url(url, found[0]) if found else None for found in matches]

    print("===== Get pdf content =====")
    pdf_content = get_pdf(pdf_path)

    # Explicit encoding so that titles with non-ASCII characters can be
    # logged regardless of the platform's default encoding.
    with open("log.txt", "w", encoding = "utf-8") as f_log:
        for idx, (title, content) in enumerate(zip(papers, pdf_content)):
            print(f"{idx+1}: {title} \n   downloading...")
            if content is None:
                print("   failed and log into log_file...")
                f_log.write(f"{title}\n")
                continue

            print("   success")
            with open(f"{_safe_filename(title)}.pdf", "wb") as f_pdf:
                f_pdf.write(content)

# Run the download only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main(url, papers = papers, headers = headers)

===== Get html =====
===== Get pdf path =====
===== Get pdf content =====
1: Aberrant Frontoparietal Function during Recognition Memory in Schizophrenia: A Multimodal Neuroimaging Investigation 
   downloading...
   success
2: Eluding the illusion? Schizophrenia, dopamine and the McGurk effect 
   downloading...
   success
3: Reduced multisensory integration in patients with schizophrenia on a target detection task 
   downloading...
   success
4: Event-related potentials to auditory and visual selective attention in schizophrenia 
   downloading...
   success
5: Response selection impairment in schizophrenia transcends sensory and motor modalities 
   downloading...
   success
6: Disappearance of the unmasking effect of temporally pre-presented lipreading cues on speech recognition in people with chronic schizophrenia 
   downloading...
   success
7: Activation and Functional Connectivity of the Left Inferior Temporal Gyrus during Visual Speech Priming in Healthy Listeners and Listene