In [1]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import NavigableString

In [8]:
# Base URL of the jneurosci.org site; get_soup prepends it to site-relative links
JN_url = "https://www.jneurosci.org"

In [2]:
def find_href(soup, clas):
    """Collect the href of every <a> tag in ``soup`` that has the given class.

    Args:
        soup: Parsed document (or tag) that provides ``find_all``.
        clas: Value passed as ``class_`` to select the anchors.

    Returns:
        A list with the ``href`` value of each matching anchor, in the order
        ``find_all`` returned them; anchors without an ``href`` are skipped.
    """
    hrefs = []
    for anchor in soup.find_all('a', class_ = clas):
        if anchor.has_attr('href'):
            hrefs.append(anchor['href'])
    return hrefs

def get_soup (short_url, timeout = 30):
    """Download a page of the journal site and parse its HTML.

    Args:
        short_url: Site-relative path (e.g. "/content/43/50"); it is appended
            to JN_url to form the full address.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block the run
            indefinitely.

    Returns:
        The response body parsed into a BeautifulSoup object with the
        "html.parser" backend.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    cur_url = '%s%s' % (JN_url, short_url)
    cur_r = requests.get(cur_url, timeout = timeout)
    cur_soup = BeautifulSoup(cur_r.content, "html.parser")
    return cur_soup

def get_inBetween (string, start, end):
    """Return the text located between two marker substrings.

    The character directly after ``start`` is skipped as well: get_name passes
    markers such as "to" or "," that are followed by a space, so the result
    begins at the first character of the name.

    Args:
        string: Text to search.
        start: Marker that precedes the wanted text (first occurrence is used).
        end: Marker that follows the wanted text.

    Returns:
        The substring between ``start`` (plus one separator character) and the
        first occurrence of ``end`` that comes after ``start``.

    Raises:
        ValueError: if ``start`` is missing, or ``end`` does not occur after it.
    """
    start_idx = string.index(start)
    after_start = start_idx + len(start)
    # Search for the end marker only behind the start marker; looking from the
    # beginning of the string would pick up an earlier occurrence of ``end``
    # and produce an empty or wrong result.
    end_idx = string.index(end, after_start)
    return string[after_start + 1: end_idx]

def get_name(string):
    """Pull the author name out of one fragment of a correspondence sentence.

    A fragment is only parsed when it ends with "at " and begins with one of
    the recognised lead-ins ("Correspo...", "or" / " or", or ","); the name is
    then whatever get_inBetween finds between that lead-in and " at".

    Args:
        string: Text fragment to inspect.

    Returns:
        The extracted name. When the fragment does not match any recognised
        pattern, "get name failed" is printed and the fragment itself is
        returned unchanged.
    """
    if string.endswith("at "):
        if string.startswith("Correspo"):
            return get_inBetween(string, "to", " at")
        if string.startswith(("or", " or")):
            return get_inBetween(string, "or", " at")
        if string.startswith(","):
            return get_inBetween(string, ",", " at")
    # No recognised pattern: report it and hand the input back as-is
    print("get name failed")
    return string

    

In [3]:
def get_journal_info (j_info, j_full):
    """Extract the title, abstract and corresponding-author e-mails of an article.

    Args:
        j_info: BeautifulSoup object of the article's "tab-article-info" page;
            the title and the ``li.corresp`` entry are read from it.
        j_full: BeautifulSoup object of the article page itself; the abstract
            is read from it.

    Returns:
        A dict with the keys "Title", "Abstract" and "Correspondence " (the
        trailing space is part of the key and is what to_df looks up). The
        last value is a nested dict mapping each author name to an e-mail.
        A missing title yields "", a missing abstract "No abstract" and a
        missing correspondence entry an empty nested dict.
    """
    # Get the article title
    a_title = j_info.find('h1', class_ = "highwire-cite-title", id = "page-title")
    title_text = "".join(a_title.strings) if a_title is not None else ""
    print("processing:", title_text)

    # Get the abstract of the article
    abstract = j_full.find('div', class_ = "section abstract")
    if abstract is not None:
        # Drop the first 8 characters of the section text -- presumably the
        # "Abstract" heading; TODO confirm against the page markup.
        abstract_text = "".join(abstract.strings)[8:]
    else:
        abstract_text = "No abstract"

    # Get the correspondence author list: every text node below the entry
    cor_class = j_info.find('li', class_ = "corresp")
    corrsp_list = []
    if cor_class is not None:
        corrsp_list = [child for child in cor_class.descendants
                       if isinstance(child, NavigableString)]
    # Remove the last item if it is a "." (guarded so an empty list is fine)
    if corrsp_list and corrsp_list[-1] == ".\n":
        corrsp_list = corrsp_list[:-1]

    # Build the dictionary of author information. The text nodes are treated
    # as alternating (sentence fragment with the name, e-mail) pairs, so walk
    # the even positions directly; looking each item up with list.index()
    # would return the first equal item and mis-pair repeated text.
    corrsp_dict = {}
    for cur_index in range(0, len(corrsp_list), 2):
        try:
            cur_name = get_name(corrsp_list[cur_index])
            cur_email = corrsp_list[cur_index + 1]
            corrsp_dict[cur_name] = cur_email.replace("{at}", "@")
        except (ValueError, IndexError):
            # ValueError: get_inBetween did not find its markers;
            # IndexError: a name fragment without a following e-mail node.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            print("Email problem")

    j_data = {"Title": title_text,
              "Abstract" : abstract_text,
              "Correspondence " : corrsp_dict}

    return j_data

def to_df (dict):
    """Turn one article's information into a pandas DataFrame.

    Args:
        dict: Article information with the keys 'Title', 'Abstract' and
            'Correspondence ' (a mapping of author name to e-mail), as
            produced by get_journal_info.

    Returns:
        A DataFrame with one row per author. 'Title' and 'Abstract' are only
        filled in on the first row; 'Author' and 'Email' on every row. With
        no authors the DataFrame is returned empty.
    """
    frame = pd.DataFrame(data=None)
    authors = dict['Correspondence ']
    for row, (author, email) in enumerate(authors.items()):
        # Title and abstract are written once, next to the first author
        if row == 0:
            frame.at[row, 'Title'] = dict['Title']
            frame.at[row, 'Abstract'] = dict['Abstract']
        frame.at[row, 'Author'] = author
        frame.at[row, 'Email'] = email
    return frame

In [4]:
# Archive index of the journal; a year is appended to it ("<URL>/<year>") to
# reach the page from which that year's issue links are collected
URL = "https://www.jneurosci.org/content/by/year"

In [5]:
#Create the output directory ("output" inside the current working directory);
#exist_ok=True makes re-running the cell harmless when the folder already exists
work_dir = os.getcwd()
output_dir = os.path.join(work_dir, "output")
os.makedirs(output_dir, exist_ok= True)

In [6]:
#Create a list of archive links, one per year from starting_year through
#end_year (inclusive).
#Ione asked for the past 10 years, so starting_year should be 2014 for the
#full run (it is currently 2023 -- presumably to keep trial runs short).
starting_year = 2023
end_year = 2024
#The comprehension leaves starting_year untouched, so the setting still reads
#correctly after the cell has run (a counting while loop would leave it at
#end_year + 1).
year_list_url = [f"{URL}/{year}" for year in range(starting_year, end_year + 1)]

In [9]:
#Make the csv files for each year

#Number of issues scraped per year; 2 keeps trial runs short, set it to None
#to go through every issue of the year
issues_per_year = 2
#Seconds to wait for the archive page before giving up instead of hanging
request_timeout = 30

for url in year_list_url :
    #Get the current year link and html object
    cur_r = requests.get(url, timeout= request_timeout)
    cur_year = url[-4:]
    issue_soup = BeautifulSoup(cur_r.content, 'html.parser')
    issue_href = find_href(issue_soup, 'hw-issue-meta-data')
    #One dataframe per article of this year
    output = []

    #Go through the issues in cur_year
    for link in issue_href[:issues_per_year]:
        print ("Working on", link)

        #Find link to articles: keep only links below this issue's own path,
        #leaving out those that start with "<issue>/etwij"
        article_html = get_soup(link)
        article_href = find_href(article_html, 'highwire-cite-linked-title')
        issue_prefix = '%s%s' % (link, "/")
        skip_prefix = '%s%s' % (link, "/etwij")
        good_href = [href for href in article_href
                     if href.startswith(issue_prefix) and not href.startswith(skip_prefix)]

        #Collect the information of every article in the current issue.
        #The loop variable has its own name so it does not overwrite the
        #issue link of the enclosing loop.
        for article_link in good_href:
            j_info = get_soup('%s%s' % (article_link, "/tab-article-info"))

            #Only articles with a correspondence entry are kept; check that
            #before downloading the full article page to avoid a wasted request
            if j_info.find('li', class_ = "corresp") is None:
                continue
            j_full = get_soup(article_link)
            output.append(to_df(get_journal_info(j_info, j_full)))

    #Print the result to csv; pd.concat raises on an empty list, so a year
    #without any usable article is reported and skipped instead
    if not output:
        print("No article information found for", cur_year)
        continue
    results = pd.concat(output)
    file_name = f"{cur_year}.csv"
    results.to_csv(os.path.join(output_dir, file_name), mode= 'w', index= False, encoding= 'utf-8-sig')

Working on /content/43/50
processing: Evidence for a Primary Prior Deficit as a Mechanism of Auditory Hallucinations
processing: Short- and Long-Term High-Fat Diet Exposure Differentially Alters Phasic and Tonic GABAergic Signaling onto Lateral Orbitofrontal Pyramidal Neurons


KeyboardInterrupt: 