## Scraper w/o BioPython
This is a Readme file.

In [1]:
def GetURL():
    #stuff

def WebScraper():
    import sys
    import gc
    from datetime import datetime
    import requests as rq
    from requests_html import AsyncHTMLSession
    from bs4 import BeautifulSoup as bs
    from regex import regex as re
    
    #Clear memory
    gc.collect()
    #Create the URL Terms
    base_url = "https://pubmed.ncbi.nlm.nih.gov/"
    size = "200"
    term = 'schizophrenia'
    date1 = '2000%2F1%2F1'
    date2 = '2020%2F11%2F15'
    pmformat = 'abstract'
    #Pulls from the first 50 pages
    pages = list(range(1,3))
    #Create CSV File
    #Timestamp
    now = datetime.now()
    date_time = now.strftime("%Y%m%d%H%M%S")
    filename = f"{term}_{date_time}.csv"
    #Creates a new CSV file w/ keyword+timestamp as the filename
    file = open(filename,'w')
    file.write('Paper Title,Publication Time,Abstract\n')
    
    #Iterate for page numbers
    for page in pages:
        
        #Create URL
        full_url = f"{base_url}?term={term}&filter=dates.{date1}-{date2}&format={pmformat}&tsize={size}&page={page}"
        print(full_url)
        #Wait for page response
        asession = AsyncHTMLSession()
        response = await asession.get(full_url)
        await response.html.arender()
        #Construct the BS object
        page = rq.get(full_url)
        soup = bs(page.content,"html.parser")
        
        #For loop for results on each page
        for result in soup.find_all("div", class_="results-article"):
            #~~~~Article name
            #Find article name in HTML tree
            article_name= result.find("h1",class_="heading-title")
            #Scrape text
            article_string = article_name.a.get_text()
            article_string = article_string.replace(',','')
            #Write using f-string
            file.write(f"{(article_string.strip())},")

            #~~~~Author List, loop for multiple authors
            author_concat = ''
            for author in result.find_all("span",class_="authors-list-item"):
                print(author.a.get_text())
                #author_name = author_name.replace(',',';')
                #author_concat = f"{author_concat}{author_name} ;"
            print(f"{author_concat},")
            #~~~~Publication Time
            #Find publication time in HTML tree
            pub_time = result.find("span",class_='cit')
            #Scrape text
            pub_string = pub_time.get_text()
            #Clean up date after ;
            pub_string = pub_string.split(';',1)[0]
            pub_string = pub_string.replace(',','')
            #Write using f-string
            file.write(f"{pub_string},")

            #~~~~Abstract
            #Find abstract in HTML tree
            abstract = result.find("div",class_='abstract-content selected')
            #Scrape text
            try:
                abstract_string = abstract.p.get_text()
            except AttributeError:
                abstract_string = 'No abstract found?'
            #Remove commas because CSV goes weird (maybe a better way to do this)
            abstract_string = abstract_string.replace(",","")
            #Remove colons
            abstract_string = abstract_string.replace(":","")
            #Remove line feed
            abstract_string = abstract_string.replace("\n","")
            #Remove padded white space
            abstract_string = abstract_string.strip()
            #Remove additional white space with RegEx
            abstract_string = re.sub("\s\s+"," ",abstract_string)
            #Write using f-string
            try:
                file.write(f"{abstract_string}\n")
            except:
                abstract_string = abstract_string.replace(",","")
                abstract_string = abstract_string.replace(":","")
                abstract_string = abstract_string.replace("\n","")
                abstract_string = abstract_string.strip()
                abstract_string = abstract_string.encode(sys.stdout.encoding, errors='replace')
                abstract_string = re.sub("\s\s+"," ",abstract_string)
                file.write(f"{abstract_string}\n")
    #Close file
    file.close()
    #Complete status
    print('done')
WebScraper()

IndentationError: expected an indented block after function definition on line 1 (1477821058.py, line 4)

## Scraper w/ BioPython

In [1]:
def BioWebScraper():
    import sys
    import gc
    from datetime import datetime
    import requests as rq
    from requests_html import AsyncHTMLSession
    from bs4 import BeautifulSoup as bs
    from regex import regex as re
    import Bio
    from Bio import Entrez
    from tqdm import tqdm
    
    #Clear memory
    gc.collect()
    
    #Create Field Terms
    dbase = 'pubmed'
    mesh_term = 'schizophrenia'
    num_articles = '500' #Number of articles to pull
    begin = '2000/01/01'
    end = '2022/11/15'
    
    #TimeStamp for file creation
    now = datetime.now()
    date_time = now.strftime("%Y%m%d")
    
    #Creates a new CSV file w/ keyword+timestamp as the filename
    filename = f"{mesh_term}_{date_time}.csv"
    file = open(filename,'w')
    file.write('Paper Title,Authors,Publication Date,Abstract\n')
    
    #Connect to PubMed
    Entrez.email = "acking1187@gmail.com"
    handle = Entrez.esearch(db=dbase, term=mesh_term, retmax=num_articles,mindate=begin,maxdate=end)
    records = Entrez.read(handle)
    
    #Parse XML into the CSV
    i=0
    for record in tqdm(records['IdList']):
        entry = Entrez.efetch(db='pubmed', id=record,retmode='xml')
        soup = bs(entry,"xml")
        #print(soup.prettify())
        
        #Get Paper Title
        article_title = soup.find("ArticleTitle").get_text().replace(",","")
        article_title = article_title.encode(sys.stdout.encoding, errors='replace')
        #print(article_title)
        
        #Get Authors
        namelist = ''
        authors = soup.find("AuthorList")
        for author in authors.find_all("Author"):
            try:
                lastname = author.find('LastName').get_text()
            except:
                lastname = ''
            try:
                firstname = author.find('ForeName').get_text()
            except: 
                firstname = ''
            if firstname=="" and lastname=="":
                continue
            name = f"{lastname} {firstname}"
            namelist = f"{namelist}{name}; "
        namelist = namelist.encode(sys.stdout.encoding, errors='replace')
        #print(namelist)
        
        #Get Publication Date
        pub_date = soup.find("PubDate").get_text()
        pub_date = re.sub(r'(\d{4})(.{3})(\d{2})',r'\1 \2 \3',pub_date)
        pub_date = pub_date.encode(sys.stdout.encoding, errors='replace')
        #print(pub_date)
        
        #Get Abstract
        try:
            abstract = soup.find("AbstractText").get_text().replace(",","")
        except:
            abstract = 'No abstract.'
        abstract = abstract.encode(sys.stdout.encoding, errors='replace')
        #print(abstract)
        
        #Create CSV Entry
        entry = f"{article_title},{namelist},{pub_date},{abstract}\n"
        file.write(entry)
        
BioWebScraper()

  6%|▌         | 30/500 [00:14<03:39,  2.14it/s]


AttributeError: 'NoneType' object has no attribute 'find_all'