## Scraping Titles and Authors

Scraped titles and authors for 1998 to 2024 (used existing scraped files for 1996 and 1997).

In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pdfplumber
import re
import pandas as pd
from tqdm import tqdm
import csv

In [2]:
def csv_to_dict(csv_file, year):
    """Load a pre-scraped proceedings CSV into a ``{title: (authors, year)}`` dict.

    The two legacy files name their author column differently: the 1996 file
    uses ``Author`` and the 1997 file uses ``Authors``.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        year: Proceedings year; must be 1996 or 1997.

    Returns:
        Dict mapping each ``Title`` value to ``(author_string, year)``. When a
        title occurs more than once, the last row wins.

    Raises:
        ValueError: If ``year`` is not 1996/1997, or if a required column is
            missing from the CSV.
    """
    author_columns = {1996: 'Author', 1997: 'Authors'}
    if year not in author_columns:
        raise ValueError("year must be 1996 or 1997")
    author_column = author_columns[year]

    df = pd.read_csv(csv_file)
    missing = [col for col in ('Title', author_column) if col not in df.columns]
    if missing:
        # Name the columns that are actually required (case-sensitive).
        raise ValueError(
            f"CSV file must contain 'Title' and '{author_column}' columns; "
            f"missing: {missing}"
        )

    return {
        title: (author, year)
        for title, author in zip(df['Title'], df[author_column])
    }

# Seed the metadata table from the two pre-scraped years. 1996 is loaded
# first, so a 1997 row replaces a 1996 row that has the same title.
paper_meta = {
    **csv_to_dict('1996.csv', 1996),
    **csv_to_dict('1997.csv', 1997),
}

In [3]:
def fetch_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        The decoded response text, or ``None`` if the request failed (network
        error, timeout, or an HTTP error status). Failures are printed rather
        than raised so one bad page does not abort a multi-year scrape.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
    except requests.RequestException as e:
        print(f"Error fetching {url}: {str(e)}")
        return None
    # NOTE(review): the text is decoded with whatever charset requests infers
    # from the response headers; confirm that matches the pages' real encoding
    # if scraped names come out garbled.
    return response.text

def clean_author(author_text):
    """Normalise whitespace in a scraped author string.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    and leading/trailing whitespace is removed so the result is trimmed the
    same way ``clean_title`` trims titles.

    Args:
        author_text: Raw author text taken from the page.

    Returns:
        The cleaned author string.
    """
    # r'\s+' already covers newlines, so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', author_text).strip()

def clean_title(title):
    """Return *title* with whitespace runs collapsed to one space and both ends trimmed."""
    # The pattern matches newlines and tabs as well as spaces, so line breaks
    # inside a title are folded into single spaces here.
    collapsed = re.sub(r'\s+', ' ', title)
    return collapsed.strip()

In [4]:
def extract_data(html, year, entries):
    """Add ``title -> (authors, year)`` records scraped from one proceedings page.

    Scans ``<p>``, ``<dt>`` and ``<h3>`` tags in document order. A ``<p>`` or
    ``<dt>`` counts as a paper when it contains a link whose href ends in
    ``.pdf``; the link text is the title. Authors come from the first ``<i>``
    (or, failing that, ``<b>``) inside a ``<p>``, or from the next sibling
    ``<dd>`` of a ``<dt>``. Only the text before the first ``;`` is kept.

    Once an ``<h3>`` containing "workshops" is seen, every later tag on the
    page is ignored (the flag is never cleared).

    Args:
        html: Page source, or a falsy value if the fetch failed.
        year: Proceedings year stored with each record.
        entries: Dict to add records to; mutated in place. Titles already
            present are left untouched.

    Returns:
        The same ``entries`` dict, on every path.
    """
    if not html:
        # Nothing fetched: hand back the accumulator unchanged instead of a
        # fresh dict, so the return value is consistent.
        return entries

    soup = BeautifulSoup(html, 'html.parser')
    skipped_titles = ("Preface", "Session Introduction", "Introduction")
    skip_section = False

    for tag in soup.find_all(['p', 'dt', 'h3']):
        if tag.name == 'h3':
            if 'workshops' in tag.text.lower():
                skip_section = True
            continue  # Headings never carry a paper entry themselves

        if skip_section:
            continue  # Skip all tags after the Workshops heading

        a_tag = tag.find('a', href=lambda href: href and href.lower().endswith('.pdf'))
        if not a_tag:
            continue

        title = clean_title(a_tag.text.strip())
        if title in skipped_titles or title.endswith("introduction"):
            continue

        # Branch explicitly on the tag type. Written as a single expression,
        # ``a or b if cond else c`` parses as ``a or (b if cond else c)``, which
        # made a <dt> with italics in its title use that <i> as the author.
        if tag.name == 'p':
            author_tag = tag.find('i') or tag.find('b')
        else:
            author_tag = tag.find_next_sibling('dd')

        if author_tag and title not in entries:
            author_text = author_tag.text.strip().split(';')[0]
            entries[title] = (clean_author(author_text), year)

    return entries

def extract_data_2002(html, year, entries):
    """Add ``title -> (authors, year)`` records from the 2002-style page layout.

    Titles are the text of the first ``<a>`` inside each ``<dt>``; authors are
    the text of the first ``<i>`` inside the paired ``<dd>``, truncated at the
    first ``;``. Entries titled "Session Introduction" are skipped. Unlike
    ``extract_data``, an existing entry with the same title is overwritten.

    Args:
        html: Page source, or a falsy value if the fetch failed.
        year: Proceedings year stored with each record.
        entries: Dict to add records to; mutated in place.

    Returns:
        The same ``entries`` dict, on every path.
    """
    if not html:
        # Return the accumulator unchanged rather than a fresh dict.
        return entries

    soup = BeautifulSoup(html, 'html.parser')

    # <dt>/<dd> are paired by position in the document.
    # NOTE(review): assumes every <dt> has exactly one matching <dd>; an
    # unpaired tag would shift all later pairs - confirm against the page.
    for dt, dd in zip(soup.find_all('dt'), soup.find_all('dd')):
        title_tag = dt.find('a')
        if title_tag is None:
            continue  # A <dt> without a link has no title to record

        title = clean_title(title_tag.text.strip())
        if title == "Session Introduction":
            continue

        authors_tag = dd.find('i')
        if authors_tag:
            author_text = authors_tag.text.strip().split(';')[0]
            entries[title] = (clean_author(author_text), year)

    return entries

In [5]:
def collect_titles_authors_years(start_year, end_year):
    """Scrape every proceedings page from *start_year* to *end_year* inclusive.

    Each year's page lives at ``<base>NN/`` where ``NN`` is the last two digits
    of the year. The 2002 page goes through ``extract_data_2002``; every other
    year goes through ``extract_data``. Both extractors fill the shared dict
    in place.

    Returns:
        Dict mapping title to ``(authors, year)`` across all requested years.
    """
    base_url = "http://psb.stanford.edu/psb-online/proceedings/psb"
    collected = {}

    for year in range(start_year, end_year + 1):
        page_html = fetch_html(f"{base_url}{str(year)[-2:]}/")
        extractor = extract_data_2002 if year == 2002 else extract_data
        extractor(page_html, year, collected)

    return collected

In [6]:
# Scrape 1998-2024 and merge the result into the table seeded from the CSVs.
paper_meta.update(collect_titles_authors_years(1998, 2024))

# Spot-check: print every entry recorded for 2020.
for title, info in paper_meta.items():
    first_author, year = info[:2]
    if year != 2020:
        continue
    print(f"Title: {title}, First Author: {first_author}, Year: {year}")

Title: Predicting Longitudinal Outcomes of Alzheimer's Disease via a Tensor-Based Joint Classification and Regression Model, First Author: Lodewijk Brand, Kai Nichols, Hua Wang, Heng Huang, Li Shen, for the ADNI, Year: 2020
Title: Robustly Extracting Medical Knowledge from EHRs: A Case Study of Learning a Health Knowledge Graph, First Author: Irene Y. Chen, Monica Agrawal, Steven Horng, David Sontag, Year: 2020
Title: Increasing Clinical Trial Accrual via Automated Matching of Biomarker Criteria, First Author: Jessica W. Chen, Christian A. Kunder, Nam Bui, James L. Zehnder, Helio A. Costa, Henning Stehr, Year: 2020
Title: Addressing the Credit Assignment Problem in Treatment Outcome Prediction Using Temporal Difference Learning, First Author: Sahar Harati, Andrea Crowell, Helen Mayberg, Shamim Nema, Year: 2020
Title: Multiclass Disease Classification from Microbial Whole-Community Metagenomes, First Author: Saad Khan, Libusha Kelly, Year: 2020
Title: LitGen: Genetic Literature Recommen

## Get DOIs for as many papers as possible

In [7]:
def get_doi(title, author, year, timeout=30):
    """Search Crossref for works matching a paper's title, author and year.

    Despite the name, this returns the raw search response; ``extract_doi``
    picks the DOI out of it.

    Args:
        title: Paper title.
        author: Author string for the paper.
        year: Publication year.
        timeout: Seconds to wait for Crossref before giving up.

    Returns:
        The decoded JSON response (a dict) for up to three candidate works.

    Raises:
        Exception: If Crossref answers with a status other than 200.
    """
    url = "https://api.crossref.org/works"
    params = {
        # Join with plain spaces: requests URL-encodes parameter values itself,
        # so pre-joining with '+' would send literal plus signs (%2B).
        "query.bibliographic": f"{title} {author} {year}",
        "rows": 3,
        "mailto": "sameeksha.garg@dartmouth.edu"
    }
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    raise Exception(f"API request failed with status code {response.status_code}")

def extract_doi(data, year):
    """Pick the proceedings DOI out of a Crossref search response.

    A candidate is accepted when its DOI starts with ``10.1142`` and its
    ``container-title`` is exactly ``["Biocomputing <year>"]``.

    Args:
        data: Decoded Crossref response; candidates are read from
            ``data['message']['items']``.
        year: Proceedings year (anything ``int()`` accepts).

    Returns:
        The first matching DOI string, or ``None`` when no candidate matches.
    """
    expected_container = [f"Biocomputing {int(year)}"]

    for item in data['message']['items']:
        # Use .get(): not every record carries a container title, and such a
        # record must not abort the whole lookup with a KeyError.
        doi = item.get('DOI', '')
        if doi.startswith('10.1142') and item.get("container-title") == expected_container:
            return doi
    return None

# Look up a DOI for every paper and store it as a third tuple element.
# Only existing values are reassigned (no keys are added or removed), so
# updating the dict while iterating over it is safe.
for title, info in tqdm(paper_meta.items(), desc="Getting DOIs"):
    authors, year = info[0], info[1]
    crossref_response = get_doi(title, authors, year)
    paper_meta[title] = (authors, year, extract_doi(crossref_response, year))

Getting DOIs:  36%|██████████████████▏                                | 454/1273 [29:24<53:03,  3.89s/it]


KeyboardInterrupt: 

In [None]:
# Flatten paper_meta into one row per paper.
# Values are (authors, year) before the DOI lookup has reached an entry and
# (authors, year, doi) afterwards, so accept both shapes. The DOI column is
# always created because the PubMed cell reads row['DOI'].
data = []
for title, details in paper_meta.items():
    authors, year, *rest = details
    data.append({
        'Title': title,
        'Authors': authors,
        'Year': year,
        'DOI': rest[0] if rest else None,
    })

df = pd.DataFrame(data)

## Get PubMed IDs for all papers

Not all papers have DOIs, so a PubMed ID was also looked up for each paper; having PubMed IDs makes querying Scopus much easier. Used MetaPub (https://github.com/metapub/metapub) (https://pypi.org/project/metapub/).

In [None]:
import nltk
import re
from nltk.corpus import stopwords
import metapub
from time import sleep

In [None]:
# The NCBI API key is a credential and must not be stored in the notebook.
# Export NCBI_API_KEY in the shell (or set it via a local, untracked config)
# before starting Jupyter. Any key that was ever committed in this file should
# be revoked and replaced.
if not os.environ.get('NCBI_API_KEY'):
    # NOTE(review): presumably the PubMed lookups below still work without a
    # key, only at a lower rate limit - confirm before a long run.
    print("NCBI_API_KEY is not set; continuing without an NCBI API key.")

In [None]:
# Download the stop words list (required before stopwords.words() can be read)
nltk.download('stopwords')

# Built once rather than on every clean_text() call. 'abstract' is filtered
# out together with the English stop words.
_STOP_WORDS = frozenset(stopwords.words('english')) | {'abstract'}
_PUNCTUATION = re.compile(r'[-:;()"\',]')


def clean_text(text):
    """Reduce a title to lowercase content words for citation matching.

    Lowercases *text*, replaces the punctuation characters ``- : ; ( ) " ' ,``
    with spaces, then drops English stop words and the word "abstract".

    Args:
        text: Title string to clean.

    Returns:
        The remaining words joined by single spaces.
    """
    without_punctuation = _PUNCTUATION.sub(' ', text.lower())
    return ' '.join(
        word for word in without_punctuation.split() if word not in _STOP_WORDS
    )

In [None]:
def query_pubmed(title, authors, year, detailed=False, timeout=30):
    """Ask the PubMed citation matcher for IDs matching a paper.

    Args:
        title: Title text to match (callers pass the cleaned title).
        authors: Author string; used only when ``detailed`` is true.
        year: Publication year; used only when ``detailed`` is true.
        detailed: When true, send title, authors and year together as the
            citation text; otherwise send the title alone.
        timeout: Seconds to wait for PubMed before giving up.

    Returns:
        The ``result.uids`` list from the JSON response, or ``[]`` when the
        response has no such field.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov/api/citmatch/"

    # Join with plain spaces: requests URL-encodes parameter values itself, so
    # pre-joining with '+' would send literal plus signs (%2B).
    raw_text = f"{title} {authors} {year}" if detailed else title
    params = {
        "method": "auto",
        "raw-text": raw_text,
        "journal": "Pac Symp Biocomput",
        "retmode": "json"
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    data = response.json()
    # Extract PubMed ID entries from the response
    return data.get('result', {}).get('uids', [])

# Clean the titles, query the PubMed API, and store the candidate PubMed IDs.
# Each paper is first matched on its cleaned title alone; if that finds
# nothing, a second query adds the authors and year.
pubmed_results = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    title = row['Title']
    authors = row['Authors']
    year = row['Year']
    cleaned_title = clean_text(title)
    pubmed_ids = query_pubmed(cleaned_title, authors, year)
    if not pubmed_ids:
        pubmed_ids = query_pubmed(cleaned_title, authors, year, True)
    pubmed_results.append({
        'Title': title,
        "Authors": authors,
        "Year": year,
        # .get(): df may have been built without a DOI column; record None
        # instead of failing with a KeyError.
        "DOI": row.get('DOI'),
        'PubMed IDs': pubmed_ids,
    })

# Create a DataFrame with the results
new_results_df = pd.DataFrame(pubmed_results)

# Print the results
display(new_results_df)

In [None]:
# pubmed_data is the same object as new_results_df (no copy is made), so the
# new column shows up under both names.
pubmed_data = new_results_df
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if 'Original Title' not in pubmed_data.columns:
    pubmed_data.insert(0, 'Original Title', '')

def get_journal_name_and_title(fetch, pubmed_id):
    """Return ``(journal, title)`` of the article *fetch* resolves for *pubmed_id*.

    Args:
        fetch: Object exposing ``article_by_pmid(pubmed_id)``.
        pubmed_id: PubMed identifier to look up.
    """
    record = fetch.article_by_pmid(pubmed_id)
    journal_name, article_title = record.journal, record.title
    return journal_name, article_title

fetch = metapub.PubMedFetcher()
# For each paper, keep the first candidate PubMed ID whose journal is
# 'Pac Symp Biocomput', and record PubMed's version of the title.
for index, row in tqdm(pubmed_data.iterrows(), total=len(pubmed_data)):
    orig_title = row['Title']
    # Reset per row. Without this, a paper with no valid candidate would
    # reuse the ID and title matched for a previous paper (or raise NameError
    # if it is the very first row).
    correct_pubmed_id = None
    matched_title = None
    for item in row['PubMed IDs']:
        pubmed_id = item['pubmed']
        journal_name, title = get_journal_name_and_title(fetch, pubmed_id)
        if journal_name == 'Pac Symp Biocomput':
            correct_pubmed_id = pubmed_id
            matched_title = title
            break
        # Be polite and avoid hitting the server too hard
        sleep(0.5)
    if correct_pubmed_id:
        pubmed_data.at[index, 'PubMed IDs'] = correct_pubmed_id
        pubmed_data.at[index, 'Title'] = matched_title
        pubmed_data.at[index, 'Original Title'] = orig_title
    else:
        # The row keeps its unverified candidate list; report it so it can be
        # resolved by hand.
        print(f"no correct id found for: {orig_title}")

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(pubmed_data)

## Get Scopus IDs and Citation Data for each author

In scraping author names from the proceedings webpages, there were many Unicode errors and formatting differences (such as using initials vs. full names from year to year). When creating a co-authorship network, this would lead to ambiguity when determining which authors contributed to multiple papers. To disambiguate the data, I queried the Scopus API using pybliometrics, an API wrapper for Scopus (https://pybliometrics.readthedocs.io/en/stable/index.html). For each paper, I mapped the authors to their respective Scopus IDs, a unique identifier assigned to each author by Scopus. I used Scopus IDs instead of ORCID iDs because not every author necessarily has an ORCID iD, whereas Scopus automatically assigns an ID to each author in its database.

For citation network purposes, also got Scopus's citation counts for each paper (regarded as accurate by the academic community)

In [None]:
import pybliometrics
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus import AuthorRetrieval
from pybliometrics.scopus.exception import Scopus404Error

In [None]:
# Initialise pybliometrics for the Scopus queries in the cells below.
# NOTE(review): presumably this must run before any AbstractRetrieval or
# AuthorRetrieval call, and reads the API key from the local pybliometrics
# configuration - confirm against the installed version.
pybliometrics.scopus.init() #includes Elsevier API Key

In [None]:
# Rebind df to the PubMed-annotated frame. This is an alias, not a copy, so
# the columns added to df below also appear on pubmed_data.
df = pubmed_data

In [None]:
def query_scopus_authors_cited(pmid):
    """Fetch a paper's Scopus authors and its citation count.

    Args:
        pmid: Identifier handed to ``AbstractRetrieval``. Callers pass either
            a PubMed ID or a DOI.

    Returns:
        ``(full_auth, citedby_count)`` where ``full_auth`` maps each author's
        Scopus ID to ``"<given name> <surname>"``, and ``citedby_count`` is the
        value reported on the abstract record.
    """
    abstract = AbstractRetrieval(pmid)
    full_auth = {}
    # `authors` may be None when the record lists no authors; treat that as
    # an empty author list instead of failing on iteration.
    for author in abstract.authors or []:
        author_id = author.auid
        # One extra lookup per author to get the full name from the profile.
        profile = AuthorRetrieval(author_id)
        full_auth[author_id] = f"{profile.given_name} {profile.surname}"
    return (full_auth, abstract.citedby_count)

In [None]:
df['Full Authors'] = None
df['Cited By Count'] = None

# Query Scopus for every paper: try the PubMed ID first and fall back to the
# DOI for that paper only. A single failed PMID must not switch every later
# row to DOI lookups, because many rows have no DOI at all.
for index, row in tqdm(df.iterrows(), total=len(df)):
    pmid = row['PubMed IDs']
    doi = row['DOI']
    try:
        full_authors, cited_by_count = query_scopus_authors_cited(pmid)
    except Scopus404Error:
        print(f"Scopus404Error for PMID: {pmid}. Trying alternative method...")
        full_authors, cited_by_count = None, None
        if pd.notna(doi):
            try:
                full_authors, cited_by_count = query_scopus_authors_cited(doi)
            except Scopus404Error:
                # Neither identifier is known to Scopus; leave the row empty
                # and carry on with the remaining papers.
                print(f"Scopus404Error for DOI: {doi}. Leaving row {index} empty.")
    df.at[index, 'Full Authors'] = full_authors
    df.at[index, 'Cited By Count'] = cited_by_count

display(df)

In [None]:
# df.to_csv('full_author_results.csv', index=False)

## Mapping Topic Results from LDA

In [8]:
def concatenate_csv_files(folder_path, output_file):
    """Combine the per-year paper CSVs into one table with a ``filename`` key.

    Every ``*.csv`` file in *folder_path* is read; the part of its name before
    the first ``.`` is taken as the year. Each row gets a ``filename`` value of
    the form ``<year>\\<n>_main_body.txt``, where ``n`` is the row's
    ``Unnamed: 0`` value as an integer.

    Args:
        folder_path: Directory containing the per-year CSV files.
        output_file: Path the combined CSV is written to (without the index).

    Returns:
        The combined DataFrame, with files taken in sorted name order.

    Raises:
        ValueError: If *folder_path* contains no CSV files.
    """
    all_dataframes = []

    # sorted(): os.listdir returns names in arbitrary order, which would make
    # the row order of the combined table vary between machines and runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(folder_path, filename))

        # Extract the year from the filename
        year = filename.split('.')[0]

        # The separator is a backslash, not '/'. These values are later merged
        # against the 'document' column of the LDA output, so the exact text
        # must stay as it is.
        # NOTE(review): presumably the LDA documents were named on Windows -
        # confirm before changing the separator.
        df['filename'] = [
            f"{year}\\{int(n)}_main_body.txt" for n in df['Unnamed: 0']
        ]

        all_dataframes.append(df)

    if not all_dataframes:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    # Concatenate all dataframes
    final_df = pd.concat(all_dataframes, ignore_index=True)

    # Save the combined dataframe to a new CSV file
    final_df.to_csv(output_file, index=False)

    return final_df

# Define the folder path and output file path
folder_path = 'Paper_CSV'
output_file = 'Titles_and_Filenames.csv'

# Call the function
combined_df = concatenate_csv_files(folder_path, output_file)

# Display the final dataframe
display(combined_df)

Unnamed: 0.1,Unnamed: 0,pdf,authors,titles,number,available,filename
0,0,https://psb.stanford.edu/psb-online/proceeding...,R.A. Goldstein and R.B. Altman,The Evolution of Biomolecular Structures and t...,0.0,True,1996\0_main_body.txt
1,1,https://psb.stanford.edu/psb-online/proceeding...,A. Keith Dunker and Richard H. Lathrop,"Discovering, Learning, Analyzing and Predictin...",1.0,True,1996\1_main_body.txt
2,2,https://psb.stanford.edu/psb-online/proceeding...,"Kiyoshi Asai, Tom Head, Katsumi Nitta and Taka...","Stochastic Models, Formal Systems and Algorith...",2.0,True,1996\2_main_body.txt
3,3,https://psb.stanford.edu/psb-online/proceeding...,Chris Henn and Michael Teschner,Interactive Molecular Visualization.,3.0,True,1996\3_main_body.txt
4,4,https://psb.stanford.edu/psb-online/proceeding...,"Steven M. Thompson, Susan J. Johns and A. Keit...",Educational Issues in Biocomputing.,4.0,True,1996\4_main_body.txt
...,...,...,...,...,...,...,...
1368,42,https://psb.stanford.edu/psb-online/proceeding...,"Michelle Holko, Nick Weber, Chris Lunt, Steven...",Biomedical research in the Cloud: Options and ...,42.0,True,2023\42_main_body.txt
1369,43,https://psb.stanford.edu/psb-online/proceeding...,"Anurag Verma, Jennifer Huffman, Ali Torkmani, ...",High-Performance Computing Meets High-Performa...,43.0,True,2023\43_main_body.txt
1370,44,https://psb.stanford.edu/psb-online/proceeding...,"Ruowang Li, Rui Duan, Lifang He, Jason H. Moore","Risk prediction: Methods, Challenges, and Oppo...",44.0,True,2023\44_main_body.txt
1371,45,https://psb.stanford.edu/psb-online/proceeding...,"Andrew Gentles, Ajit Nirmal, Laura Heiser, Emm...",Single Cell Spatial Biology for Precision Canc...,45.0,True,2023\45_main_body.txt


In [20]:
# Inputs: the LDA topic table, the combined titles/filenames table written
# earlier, and the saved author results.
lda_topics_documents = pd.read_csv('LDA_topics_documents.csv')
combined_topic_data = pd.read_csv('Titles_and_Filenames.csv')
full_author_results = pd.read_csv('full_author_results.csv', encoding='ISO-8859-1')

# Attach title metadata to each LDA document by matching the LDA 'document'
# value to the generated 'filename'; LDA rows without a match are kept.
merged_lda = lda_topics_documents.merge(
    combined_topic_data,
    how='left',
    left_on='document',
    right_on='filename',
)

In [21]:
# Map each paper to its LDA topic distribution:
# author results -> (by title) filename -> (by filename) LDA document row.
combined_topic_data.rename(columns={'titles': 'Original Title'}, inplace=True)
merged_data = pd.merge(full_author_results, combined_topic_data, on='Original Title', how='left')

augmented_data = pd.merge(merged_data, lda_topics_documents, left_on='filename', right_on='document', how='left')

# Keep at most one distribution per title, preferring a row that has one.
# A title occurring more than once here would otherwise multiply the matching
# rows of full_author_results in the merge below, which is meant to preserve
# its row count.
topic_distributions = (
    augmented_data[['Original Title', 'distr']]
    .dropna(subset=['distr'])
    .drop_duplicates(subset='Original Title')
)

# Merge this new dataframe back with full_author_results to preserve all rows
final_full_author_results = pd.merge(full_author_results, topic_distributions, on='Original Title', how='left')

display(final_full_author_results)

# final_full_author_results.to_csv('PSB_Paper_Data.csv', index=False)


Unnamed: 0,Original Title,Title,Authors,Year,DOI,PubMed IDs,Full Authors,distr
0,Protein structure comparison using representat...,Protein structure comparison using representat...,Tatsuya Akutsu and Hiroshi Tashimo,1996,,9390221,"{7102080520: 'Tatsuya Akutsu', 7801453395: 'H....",
1,Quaternion Contact Ribbons: a New Tool for Vis...,Quaternion contact ribbons: a new tool for vis...,"Kurt Albrect, John Hart, Alex Shaw and A. Keit...",1996,,9390222,"{7103391181: 'K. Albrecht', 55243117900: 'John...",[5.2092910e-01 3.2582842e-02 3.0846679e-01 7.9...
2,Fast Protein Fold Recognition via Sequence to ...,Fast protein fold recognition via sequence to ...,"Nickolai N. Alexandrov, Ruth Nussinov and Ralk...",1996,,9390223,"{7004299612: 'Nickolai N. Alexandrov', 3457217...",[0.35456887 0.0006435 0.12934364 0.0006435 0...
3,A Programming Course in Bioinformatics for Com...,A programming course in bioinformatics for com...,Russ B. Altman and John Koza,1996,,9390224,"{7202798518: 'Russ B. Altman', 55167951500: 'J...",[1.6253847e-01 2.6070837e-02 5.2382267e-01 3.9...
4,Massively Parallel Simulated Annealing Algorit...,Massively parallel algorithms for chromosome r...,"Suchendra M. Bhandarkar, Sridhar Chirravuri, J...",1996,,9390225,"{7006828161: 'Suchendra M. Bhandarkar', 650830...",[9.5343450e-03 1.9284661e-01 7.2745472e-02 6.3...
...,...,...,...,...,...,...,...,...
1271,Deconvolution of Nascent Sequencing Data Using...,Deconvolution of Nascent Sequencing Data Using...,"Zachary Maas, Rutendo Sigauke, Robin Dowelli",2024,10.1142/9789811286421_0043,38160307,"{57216789177: 'Zachary L. Maas', 57200620038: ...",[0.00071785 0.09369659 0.04105422 0.04378889 0...
1272,Splitpea: Quantifying Protein Interaction Netw...,Splitpea: quantifying protein interaction netw...,"Ruth Dannenfelser, Vicky Yao",2024,10.1142/9789811286421_0044,38160308,"{36145038300: 'Ruth Dannenfelser', 54381136200...",[4.4734727e-05 1.1675763e-02 2.1070056e-02 4.0...
1273,Lymphocyte Count Derived Polygenic Score and I...,Lymphocyte Count Derived Polygenic Score and I...,"Kathleen M. Cardone, Scott Dudek, Karl Keat, Y...",2024,10.1142/9789811286421_0045,38160309,"{57945000000: 'Kathleen M. Cardone', 700679054...",[5.0705270e-04 2.8118375e-03 4.6095698e-05 2.5...
1274,Polygenic Risk Scores for Cardiometabolic Trai...,Polygenic risk scores for cardiometabolic trai...,"Rachel Kember, Shefali Verma, Anurag Verma, Br...",2024,10.1142/9789811286421_0046,38160310,"{35722432500: 'Rachel L. Kember', 56386609500:...",[4.1469688e-05 3.3590449e-03 1.5385253e-02 5.0...
