In [2]:
import requests
import re
import pandas as pd
import time
from bs4 import BeautifulSoup

In [3]:
# just a placeohlder user-agent

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

In [4]:
class URLFetchError(Exception):
    print("Failed to fetch URL. ", 'Status code:', Exception)
    pass

Failed to fetch URL.  Status code: <class 'Exception'>


In [5]:
# this function for the getting inforamtion of the web page
def get_paperinfo(paper_url):

    # download the page
    response = requests.get(paper_url, headers=headers)

    # check successful response
    if response.status_code != 200:
        raise URLFetchError(response.status_code)

    # parse using beautiful soup
    paper_doc = BeautifulSoup(response.text, 'html.parser')

    return paper_doc

# it will return the title of the paper


def get_papertitle(paper_tag):

    paper_names = []

    for tag in paper_tag:
        paper_names.append(tag.select('h3')[0].get_text())

    return paper_names

# it will return the number of citation of the paper


def get_citecount(cite_tag):
    cite_count = []
    for i in cite_tag:
        cite = i.text
        if i is None or cite is None:  # if paper has no citatation then consider 0
            cite_count.append(0)
        else:
            # its handle the None type object error and re use to remove the string " cited by " and return only integer value
            tmp = re.search(r'\d+', cite)
            if tmp is None:
                cite_count.append(0)
            else:
                cite_count.append(int(tmp.group()))

    return cite_count

# function for the getting link information


def get_link(link_tag):

    links = []

    for i in range(len(link_tag)):
        links.append(link_tag[i].a['href'])

    return links

# function for the getting autho , year and publication information


def get_author_year_publi_info(authors_tag):
    years = []
    publication = []
    authors = []
    for i in range(len(authors_tag)):
        authortag_text = (authors_tag[i].text).split()
        year = int(re.search(r'\d+', authors_tag[i].text).group())
        years.append(year)
        publication.append(authortag_text[-1])
        author = authortag_text[0] + ' ' + re.sub(',', '', authortag_text[1])
        authors.append(author)

    return years, publication, authors


def get_tags(doc):
    paper_tag = doc.select('[data-lid]')
    # cite_tag = doc.select('[title=Cite] + a')
    cite_tag = doc.select('a[href*="cites"]')
    link_tag = doc.find_all('h3', {"class": "gs_rt"})
    author_tag = doc.find_all("div", {"class": "gs_a"})

    return paper_tag, cite_tag, link_tag, author_tag

Read the database and find the citation counts for each of them.


In [7]:
def fetch_citations(names):
    citations = []
    for name in names:
        url = f'https://scholar.google.com/scholar?q={name}'
        # print(url)

        doc = get_paperinfo(url)

        if doc is not None:
            paper_tag, cite_tag, link_tag, author_tag = get_tags(doc)
            papername = get_papertitle(paper_tag)
            year, publication, author = get_author_year_publi_info(author_tag)
            cite = get_citecount(cite_tag)
            link = get_link(link_tag)

            if len(cite) == 1:  # if there is only one paper with the same name
                citations.append(cite[0])
            elif len(cite) > 1:  # if there are multiple papers with the same name
                # |TODO|: find the most relevant paper using other information
                citations.append(cite[0])
            else:
                citations.append(float('nan'))
        else:
            citations.append(float('nan'))

        time.sleep(5)

    return citations


# Read dataset
database = pd.read_csv('journal_rankings.csv')
names = database['title'].tolist()

# Fetch citations
citations = fetch_citations(names)

# Add citations to the dataset
database['citations'] = citations

https://scholar.google.com/scholar?q=The sensitivity of a malignant cell line to hyperthermia (42 degrees C) at low intracellular pH.
https://scholar.google.com/scholar?q=Anti-leukaemia activity as a bystander effect of graft-versus-host reactions.
https://scholar.google.com/scholar?q=The effect of aflatoxins on the incorporation of RNA and protein precursors by isolated hepatocytes.
https://scholar.google.com/scholar?q=Chemical injuries of the upper extremity.
https://scholar.google.com/scholar?q=[Mitosis in regenerating comb row and the double-nucleated cells of Ctenophora].
https://scholar.google.com/scholar?q=Circulating levels of prolactin in human breast cancer.
https://scholar.google.com/scholar?q=[Experience with the management of subtalar dislocation].


## Challanges (to be fixed in the code)

there are multiple problems can occure that sould be handled in the code:

1. The database is too large, we wait 5 seconds between each query, so it takes too long. We might need a proxy and change it every time.
2. Handling errors when the access to url is denied.
3. Handling situations when there are more than one paper for same title. Here we can use text similarity of other fields such as authors, abstract, etc.
4. Handling situation where the title is not found in google scholar. I thought "[", "]", and dots in title should make trouble but they were okay. more cases might happen like this in larger datasets and we should monitor such cases.
5. If it's very important to have the exact number of citations, we might want to double check it somehow. I am not sure that the number of citations is always correct. Also, we need to compare the title with the title in the database to make sure that we are getting the correct number of citations. Here I returned the first relevant query anyway.
6. Fallback mechanism: what if google scholar suddenly decided to change the UI? we need to have a backup plan. Also, monitoring and alerting in necessary if we are using this code in production.
7. Consider API access to other sources like PubMed and use google scholar as a fallback only.
8. Implementing unit tests so that when we deploy the code, we can be sure that it works as expected.


In [9]:
database

Unnamed: 0,rowid,title,abstract,journal,pmcid,pmid,pubdate,license,JournalQualityOverall,JournalQualityLongevity,citations
0,32586,The sensitivity of a malignant cell line to hy...,The postulate that low intracellular pH acts a...,British journal of cancer,2025171,9969,1976,CC BY,4.0,4.0,58
1,33725,Anti-leukaemia activity as a bystander effect ...,The production of graft-versus-host (GVH) reac...,British journal of cancer,2025092,5100,1976,CC BY,4.0,4.0,3
2,33726,The effect of aflatoxins on the incorporation ...,Hepatocytes prepared by a simplified enzymatic...,British journal of cancer,2025075,5101,1976,CC BY,4.0,4.0,9
3,46924,Chemical injuries of the upper extremity.,The prompt recognition and management (Tables ...,Major problems in clinical surgery,10132827,3697,1976,CC BY,,,6
4,52801,[Mitosis in regenerating comb row and the doub...,,Ontogenez,10439467,3754,1975,CC BY,,,1
5,56022,Circulating levels of prolactin in human breas...,Serum prolactin concentrations were measured b...,British journal of cancer,2024836,2274,1975,CC BY,4.0,4.0,61
6,57486,[Experience with the management of subtalar di...,"In 10 and 3 years resp., material of two surgi...","Magyar traumatologia, orthopaedia es helyreall...",10037170,3689,1976,CC BY,,,1
