# Scopus missing citations locator
This notebook helps you find citations that are indexed on Google Scholar but missing on Scopus.

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import pandas as pd
import re
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Start a Chrome session in incognito mode; the matching chromedriver
# binary is downloaded by webdriver_manager.
options = webdriver.ChromeOptions()
options.add_argument("--incognito")
# Selenium 4 receives the browser options through the `options` keyword;
# the older `chrome_options` keyword is no longer accepted by recent releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [None]:
def getScholarArticles(driver, author_id, wait_timeout=10):
    """
    Returns the publications of a specific author on Scholar.

    driver: selenium driver
    author_id: id of the author
    wait_timeout: seconds to sleep after every click on the "more" button,
                  to let the additional articles load

    Returns a list of dicts, one per publication, with the keys:
    'pub_id' (progressive index starting from 0), 'title', 'authors',
    'venue', 'num_cit' (int, 0 when no count can be read) and
    'cit_url' (link to the citing articles, "" when there is none).
    """
    pub_scholar = []

    #Load the page of the author
    driver.get("http://scholar.google.it/citations?user="+author_id)
    #Locate the button "more" to get all the publications
    btnMore = driver.find_element(By.ID, 'gsc_bpf_more')
    #Load all the publications
    while btnMore.is_enabled():
        #Click on the button "more"
        btnMore.click()
        #Wait until the page is loaded
        time.sleep(wait_timeout)

    #Foreach row in the table
    rows = driver.find_elements(By.XPATH, "//tr[@class='gsc_a_tr']")
    for pub_id, tr in enumerate(rows):
        #Extract the publication info
        td_info = tr.find_element(By.XPATH, "./td[@class='gsc_a_t']")
        gray_lines = td_info.find_elements(By.XPATH, "./div[@class='gs_gray']")
        pub = {
            'pub_id': pub_id,
            'title': td_info.find_element(By.XPATH, "./a[@class='gsc_a_at']").text,
            #A row may lack the authors and/or the venue line
            'authors': gray_lines[0].text if len(gray_lines) > 0 else "",
            'venue': gray_lines[1].text if len(gray_lines) > 1 else "",
            #Number of citations
            'num_cit': 0,
            #URL to articles that cite this page
            'cit_url': "",
        }

        #Extract the citations (best effort: the defaults above are kept on failure).
        #Exception instead of a bare except, so that interrupting the kernel still works.
        try:
            td_n_cit = tr.find_element(By.XPATH, "./td[@class='gsc_a_c']")
            a = td_n_cit.find_element(By.TAG_NAME, "a")
            #Always a string: a missing href must not turn into None
            pub['cit_url'] = a.get_attribute('href') or ""
            #Always an int: the link text can be empty or contain extra characters
            count = re.search(r'\d+', a.text)
            pub['num_cit'] = int(count.group()) if count else 0
        except Exception:
            pass

        pub_scholar.append(pub)
    return pub_scholar

In [None]:
#Ids of the publications whose citations have already been extracted
done = []
#Citations extracted so far, across all the calls to extractCitationsScholar
citations = []


def _extractCitationsFromPage(driver, pub_id):
    """
    Returns the citations listed in the result page currently loaded in the driver.
    Every citation is a dict tagged with the given pub_id.
    """
    page_citations = []
    results = driver.find_element(By.XPATH, "//div[@id='gs_res_ccl_mid']")
    #".//" keeps the search inside the results container
    for cit in results.find_elements(By.XPATH, ".//div[@class='gs_r gs_or gs_scl']"):
        citation = dict()
        citation['pub_id'] = pub_id
        try:
            citation['cit_name'] = cit.find_element(By.XPATH, "./div[@class='gs_ri']/h3/a").text
            info = cit.find_element(By.XPATH, "./div[@class='gs_ri']/div[@class='gs_a']").text
            #NOTE(review): splitting on a bare '-' also splits hyphenated names - confirm the separator used by Scholar
            citation['cit_authors'] = info.split('-')[0]
            citation['cit_venue'] = ' - '.join(info.split('-')[1:])
            #Open the "cite" popup of THIS result: the XPath must start with ".//",
            #because "//" searches the whole document and always hits the first result
            cit.find_element(By.XPATH, ".//a[@aria-controls='gs_cit']").click()
            time.sleep(0.2)
            citation['bibtex_url'] = driver.find_element(By.XPATH, '//a[@class="gs_citi"]').get_attribute('href')
            time.sleep(0.2)
            #Close the popup
            driver.find_element(By.XPATH, '//a[@id="gs_cit-x"]').click()
            time.sleep(0.2)
        except Exception:
            #Fallback: keep the raw text of the result
            citation['cit_text'] = cit.text
        page_citations.append(citation)
    return page_citations


def extractCitationsScholar(driver, pub_scholar):
    """
    Given the list of publications generated with getScholarArticles
    generates the list of their citations.

    driver: selenium driver
    pub_scholar: list of dicts, each with the keys 'pub_id' and 'cit_url'

    The progress is stored in the module-level lists `done` and `citations`:
    if the extraction stops with an error, calling the function again skips
    the publications already processed.

    Returns the module-level list `citations`.
    """
    #For each publication
    for pub in pub_scholar:
        if pub['pub_id'] in done:
            continue
        if pub["cit_url"]:
            #Open the main page related to citations
            driver.get(pub["cit_url"])
            #Get the links of the subpages (none if there is a single page)
            try:
                nav = driver.find_element(By.XPATH, "//div[@id='gs_nml']")
                pages = [a.get_property('href') for a in nav.find_elements(By.XPATH, "./a[@class='gs_nma']")]
            except Exception:
                pages = []
            #Citations are buffered and stored only when the publication is complete,
            #otherwise a failure followed by a new run would store them twice
            pub_citations = _extractCitationsFromPage(driver, pub['pub_id'])
            #Navigate to the next pages
            for page_url in pages:
                driver.get(page_url)
                pub_citations.extend(_extractCitationsFromPage(driver, pub['pub_id']))
            citations.extend(pub_citations)
        done.append(pub['pub_id'])
    return citations

In [None]:
def js(s1, s2):
    """
    Compute the Jaccard similarity of two strings.

    Both values are converted to lowercase strings and split into words on
    every run of non-alphanumeric characters; the similarity is the number of
    shared words divided by the number of distinct words.

    Returns a float between 0.0 and 1.0 (0.0 when neither string contains a word).
    """
    # Empty tokens (produced by leading/trailing punctuation) are discarded,
    # otherwise they would count as a word shared by the two strings
    words1 = {w for w in re.split(r'\W+', str(s1).lower()) if w}
    words2 = {w for w in re.split(r'\W+', str(s2).lower()) if w}
    all_words = words1 | words2
    if not all_words:
        return 0.0
    return len(words1 & words2) / len(all_words)

# Step 1 - Extract articles and citations from Scholar

## Download all the articles from your Scholar profile
You can find your author_id in the URL of your Google Scholar page: it is the code after '?user='.

For example, in the URL https://scholar.google.it/citations?user=lACV6IYAAAAJ the author_id is 'lACV6IYAAAAJ'.

In [None]:
# Google Scholar id of the author whose publications are downloaded
author_id = "lACV6IYAAAAJ"
# Scrape the profile of the author: one dict per publication
pub_scholar = getScholarArticles(driver=driver, author_id=author_id)

Store the articles in a CSV file

In [None]:
# One row per publication, without the dataframe index
pub_scholar_df = pd.DataFrame(pub_scholar)
pub_scholar_df.to_csv('pub_scholar.csv', index=False)

## Download the citations
Download all the citations of the previously extracted articles.

If Scholar asks you to prove that you are not a robot, complete the verification in the browser window and then run the cell again; repeat until all the citations are extracted.
This can happen multiple times.

In [None]:
# Visit the "cited by" pages of every publication and collect the citing works
citations_scholar = extractCitationsScholar(driver=driver, pub_scholar=pub_scholar)

Store the citations in a CSV file.

In [None]:
# One row per citation, without the dataframe index
cit_scholar_df = pd.DataFrame(citations_scholar)
cit_scholar_df.to_csv('cit_scholar.csv', index=False)

# Step 2 - Download your citations from Scopus

Go to your page on Scopus, open the 'Cited by XX documents' tab, and click on the arrow to the right of 'Export all to CSV File'.

![image](images/scopus.png)

A new window will open, select 'CSV' as export method, check 'include references', finally click on 'Export'.

![image](images/scopus2.png)

Scopus will generate a CSV file called 'scopus.csv' that must be placed in the same folder as this notebook.

# Step 3 - Looking for missing citations

This step performs Entity Resolution on the citations, looking for those from Scholar that have no corresponding entry on Scopus.
The process is fairly crude and could be improved, but it works quite well.

## Simplest approach
Matching only on the title of the citations. In short, it performs a similarity join between Scholar and Scopus citations, keeping only those from Scholar that do not have a correspondence in Scopus.

Every row of the output dataframe contains:
* **cit_name**: title of the missing citation;
* **cit_authors**: authors of the missing citation;
* **cit_venue**: venue of the missing citation;
* **title**: title of your article for which the citation is missing on Scopus.

In [None]:
# Scholar articles
pub_scholar = pd.read_csv('pub_scholar.csv')
# Scholar citations
cit_scholar = pd.read_csv('cit_scholar.csv')
# Scopus citations
cit_scopus = pd.read_csv('scopus.csv', sep=",")

# Add a unique identifier to each citation
cit_scholar = cit_scholar.reset_index().rename({'index':'scholar_id'}, axis=1)
cit_scopus = cit_scopus.reset_index().rename({'index':'scopus_id'}, axis=1)

# Create a pair "citation id - Title".
# Rows without a title are left out: js() would compare them as the string "nan",
# and Scholar citations without a title are discarded from the result anyway
cit_scholar_1 = cit_scholar[['scholar_id', 'cit_name']].dropna(subset=['cit_name'])
cit_scopus_1 = cit_scopus[['scopus_id', 'Title']].dropna(subset=['Title'])

# Perform the cartesian product
cross = cit_scopus_1.merge(cit_scholar_1, how='cross')

# Compute the similarity between the titles
# (a plain list also works when the cartesian product is empty)
cross['sim'] = [js(t, c) for t, c in zip(cross['Title'], cross['cit_name'])]

# Mark as matches those with a similarity greater than 0.7
found = cross[cross['sim']>0.7]
matched_scholar = set(found['scholar_id'].values)

# Keep only those that were not found
miss = cit_scholar[~cit_scholar['scholar_id'].isin(matched_scholar)]\
       .dropna(subset=["cit_name"])[['pub_id', 'cit_name', 'cit_authors', 'cit_venue']]

# Join the citations with the articles to find which article each one cites
missing_citations = miss.merge(pub_scholar, how='inner', on='pub_id')[['pub_id', 'cit_name', 'cit_authors', 'cit_venue', 'title']]

# Store the missing citations in a CSV file
missing_citations.to_csv('missing_simple.csv', sep=";")

missing_citations

## More accurate approach
Matching the title of the citation and the title of the cited article.

It is a similarity join on two conditions: JS(cited article title) > 0.5 AND JS(citation title) > 0.7


Every row of the output dataframe contains:
* **cit_name**: title of the missing citation;
* **cit_authors**: authors of the missing citation;
* **cit_venue**: venue of the missing citation;
* **title**: title of your article for which the citation is missing on Scopus.

In [None]:
# Insert here your surname
author_surname = 'Gagliardelli'

# Scopus citations
df = pd.read_csv('scopus.csv', sep=",")

# Takes the list of the references (this field contains all the bibliography entries separated by a semicolon)
df['References'] = df['References'].str.split(';')

# Create a row for every citation
df1 = df.explode('References')

# Keep only those in which the specified surname appears.
# na=False: documents without references produce NaN, which cannot be used in a boolean mask.
# regex=False: the surname is a plain text, not a regular expression.
df2 = df1[df1['References'].str.contains(author_surname, regex=False, na=False)]\
      [['Authors', 'Title', 'Year', 'Source title', 'DOI', 'References']]

# Add an identifier to each citation
df2 = df2.reset_index().rename({'index':'scopus_id'}, axis=1)

# Keep only: citation id, title, data of the cited article
cit_scopus = df2[['scopus_id', 'Title', 'References']]\
             .rename({'Title':'scopus_cit_title', 'References':'scopus_art_title'}, axis=1)

# Scholar citations
cit_scholar = pd.read_csv('cit_scholar.csv')

# Add a unique identifier
cit_scholar = cit_scholar.reset_index().rename({'index':'scholar_id'}, axis=1)
cit_scholar2 = cit_scholar.rename({'cit_name': 'scholar_cit_title'}, axis=1)

# Scholar publications
pub_scholar = pd.read_csv('pub_scholar.csv')

# For every publication generates a string similar to those of Scopus, with authors, title and venue.
# Empty fields are read from the CSV as NaN: they are replaced with '' because
# concatenating a NaN would turn the whole string into NaN
pub_scholar['scholar_art_title'] = pub_scholar['authors'].fillna('').astype(str)+" "\
                                   +pub_scholar['title'].fillna('').astype(str)+" "\
                                   +pub_scholar['venue'].fillna('').astype(str)

# Add the data of the cited article
# As for scopus keeps only: citation id, title, data of the cited article
# Citations without a title are left out: they are discarded from the result anyway
cit_scholar2 = cit_scholar2.merge(pub_scholar, how='inner', on='pub_id')[['scholar_id', 'scholar_art_title', 'scholar_cit_title']]\
               .dropna(subset=['scholar_cit_title'])

# Perform the cartesian product
cross = cit_scopus.merge(cit_scholar2, how='cross')

# Computes the Jaccard Similarity between the titles of the cited works and between the titles of the citations
# (plain lists also work when the cartesian product is empty)
cross['sim_art'] = [js(a, b) for a, b in zip(cross['scopus_art_title'], cross['scholar_art_title'])]
cross['sim_cit'] = [js(a, b) for a, b in zip(cross['scopus_cit_title'], cross['scholar_cit_title'])]

# Keep as matches the pairs in which the cited articles have a JS > 0.5 and
# the titles of the citations have a JS > 0.7
found = cross[(cross['sim_art']>0.5) & (cross['sim_cit']>0.7)]

# Extract the ids of solved citations
matched_scholar = set(found['scholar_id'].values)

# Removes solved citations (i.e. keep the missing on scopus)
miss = cit_scholar[~cit_scholar['scholar_id'].isin(matched_scholar)]\
       .dropna(subset=["cit_name"])[['pub_id', 'cit_name', 'cit_authors', 'cit_venue']]

# Add the title of cited article
missing_citations = miss.merge(pub_scholar, how='inner', on='pub_id')[['pub_id', 'cit_name', 'cit_authors', 'cit_venue', 'title']]

# Store the result in a csv file
missing_citations.to_csv('missing_accurate.csv', sep=";")
missing_citations