# Get article citation information

In [1]:
from bs4 import BeautifulSoup
import requests
import math
import json
import re

In [2]:
# Identifier passed as the `cites` query parameter of the Scholar URL.
article_id = '15486505894554216965'
# Page size used to turn a page index into a `start` result offset.
NUM_ARTICLES_BY_PAGE = 10
# The `{}` placeholder is filled with the `start` offset via str.format.
url_generic = 'https://scholar.google.com/scholar?start={}&hl=en&cites=' + article_id

# Cookies sent with every request made by get_soup; values are left blank here.
cookie = dict(NID='', GSP='')

# URL of the first results page (offset 0).
url = url_generic.format(0)
print(url)

def get_soup(url):
    """Fetch ``url`` and return its body parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``cookie`` dict.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within 30 seconds.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, cookies=cookie, timeout=30)
    # Fail here rather than parsing an error page as if it held results.
    response.raise_for_status()
    # Name the parser explicitly so the tree does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(response.text, 'html.parser')

# Download and parse the first results page; `soup` is module-level state
# that get_num_citations() reads.
soup = get_soup(url)

https://scholar.google.com/scholar?start=0&hl=en&cites=15486505894554216965


In [3]:
def get_num_citations():
    """Return the result count shown in the header of the global ``soup`` page.

    Reads the text of the first ``div`` inside the element with id
    ``gs_ab_md`` and returns the first number that appears in it. The number
    may be preceded by a word (e.g. "About 93 results") or not
    (e.g. "7 results"), and may contain thousands separators ("1,234").

    Returns:
        The count as an ``int``.

    Raises:
        ValueError: If the header text contains no number.
    """
    summary = soup.find(id='gs_ab_md').div.text
    # Do not assume the count is the second whitespace-separated token.
    match = re.search(r'\d[\d,.]*', summary)
    if match is None:
        raise ValueError(f'no result count found in {summary!r}')
    # Drop separators such as "," before converting.
    return int(re.sub(r'\D', '', match.group()))

def get_num_pages(num_citations):
    """Return how many result pages are needed to list ``num_citations`` entries.

    A page holds ``NUM_ARTICLES_BY_PAGE`` entries; a partially filled last
    page still counts as one page.
    """
    pages_exact = num_citations / NUM_ARTICLES_BY_PAGE
    return math.ceil(pages_exact)

# Count read from the page currently held in `soup`, and the number of
# result pages needed to list that many entries.
num_citations = get_num_citations()
num_pages = get_num_pages(num_citations)

In [4]:
# Accumulates the result entries collected from every page.
raw_articles_data = []

# NOTE: this loop rebinds the module-level `url` and `soup` on every iteration.
for page_num in range(num_pages):
    # The `start` URL parameter is a result offset, so scale the page index
    # by the page size.
    url = url_generic.format(page_num * NUM_ARTICLES_BY_PAGE)
    soup = get_soup(url)
    # Every div with this class string is kept as one raw entry.
    gs_rs = soup.find_all('div', 'gs_r gs_or gs_scl')

    raw_articles_data.extend(gs_rs)

## Validate Data collection

In [5]:
# Sanity check: the number of collected entries must equal the count read
# from the results header. The message reports both numbers on failure.
assert num_citations == len(raw_articles_data), (
    f'expected {num_citations} entries but collected {len(raw_articles_data)}'
)

## Parse Articles info

In [6]:
def get_title(article):
    """Return the text of the entry's ``h3`` element with class ``gs_rt``."""
    heading = article.find('h3', 'gs_rt')
    return heading.text

def get_article_url(article):
    """Return the ``href`` of the link inside the ``h3.gs_rt`` heading.

    Returns ``None`` when the heading contains no link.
    """
    link = article.find('h3', 'gs_rt').a
    if not link:
        return None
    return link['href']

def get_authors(article):
    """Return the linked names from the ``div.gs_a`` element, joined by ``', '``.

    Only the text of ``a`` tags is collected; any plain text in that element
    is ignored. The result is an empty string when there are no links.
    """
    byline = article.find('div', 'gs_a')
    names = [link.text for link in byline.find_all('a')]
    return ', '.join(names)

def get_year(article):
    """Return the year found in the entry's ``div.gs_a`` text, or ``None``.

    The year is taken to be the last standalone four-digit number in that
    text.

    Returns:
        The year as an ``int``, or ``None`` when no standalone four-digit
        number is present.
    """
    byline = article.find('div', 'gs_a').text
    # Raw string: "\d" in a normal string literal is an invalid escape.
    # The word boundaries stop four-digit fragments of longer numbers
    # (identifiers, page ranges without separators) from being read as a year.
    candidates = re.findall(r'\b\d{4}\b', byline)
    return int(candidates[-1]) if candidates else None

def get_article_description(article):
    """Return the text of the entry's ``div.gs_rs`` element.

    Returns:
        The element's text, or ``None`` when the entry has no such element
        (mirrors how ``get_article_url`` reports a missing link).
    """
    snippet = article.find('div', 'gs_rs')
    # find() yields None when nothing matches; do not dereference it.
    if snippet is None:
        return None
    return snippet.text

def get_num_citations(article):
    """Return the number from the entry's "Cited ..." footer link.

    Scans every link in the ``div`` with class string ``gs_fl gs_flb`` and
    converts the last word of the first link whose text starts with
    ``Cited`` to an ``int``.

    NOTE: this definition shares its name with the zero-argument
    ``get_num_citations`` defined earlier in this file and replaces it once
    this definition has been executed.

    Returns:
        The citation count, or ``0`` when the footer or the "Cited" link is
        absent.
    """
    footer = article.find('div', 'gs_fl gs_flb')
    if footer is None:
        return 0
    # Search all links instead of assuming the count sits at a fixed index.
    for link in footer.find_all('a'):
        label = link.text
        if label.startswith('Cited'):
            return int(label.split()[-1])
    return 0
    
def get_file_info(article):
    """Return details of the file link attached to an entry.

    Looks at the first link inside the entry's ``div.gs_or_ggsm`` element.
    The link text is split on whitespace: the first token, with its first
    and last characters removed and lower-cased, becomes the file type
    (e.g. ``"[PDF]"`` -> ``"pdf"``); the last token becomes the publisher.

    Returns:
        A dict with keys ``file_url``, ``file_type`` and ``publisher``.
        All three are ``None`` when the entry has no file link;
        ``publisher`` is ``None`` when the link text has a single token.
    """
    file_url = file_type = publisher = None
    container = article.find('div', 'gs_or_ggsm')
    link = container.a if container is not None else None

    if link is not None:
        file_url = link['href']
        tokens = link.text.split()
        # Do not assume exactly two tokens: unpacking into two names would
        # raise ValueError for labels with extra or missing words.
        if tokens:
            file_type = tokens[0][1:-1].lower()
        if len(tokens) > 1:
            publisher = tokens[-1]

    return {
        'file_url': file_url,
        'file_type': file_type,
        'publisher': publisher,
    }

def get_article_data(article):
    """Collect every parsed field of one result entry into a flat dict.

    The keys are ``title``, ``article_url``, ``authors``, ``year``,
    ``article_description``, ``num_citations``, ``file_url``, ``file_type``
    and ``publisher``, in that order.
    """
    file_info = get_file_info(article)

    data = dict(
        title=get_title(article),
        article_url=get_article_url(article),
        authors=get_authors(article),
        year=get_year(article),
        article_description=get_article_description(article),
        num_citations=get_num_citations(article),
    )
    # Copy only the three file fields, keeping them last in the dict.
    for key in ('file_url', 'file_type', 'publisher'):
        data[key] = file_info[key]
    return data

In [9]:
# Parse every raw result entry into a flat dict of fields.
articles_data = [get_article_data(article) for article in raw_articles_data]

## Save data

In [10]:
# Serialize the parsed entries as indented JSON and write them out in one go.
with open("../data_files/article_citations.json", "w") as out_file:
    out_file.write(json.dumps(articles_data, indent=4))