In [214]:
import scrapy
import selenium
import pandas as pd
import numpy as np
import requests
import re
from itertools import combinations
from selenium import webdriver
from selenium.webdriver.common.by import By
from fuzzywuzzy import process

# Helper Functions

These functions extract and clean the data from the website. They focus on extracting the authors' names and affiliations and on establishing the relationship between the two. Unlike the PDF parsing functions, these allow a clean extraction of the affiliations and of their relationship with every author.

***

This function takes a scrapy selector object and returns a list containing two lists, one for the poster authors and one for the poster affiliations.

In [208]:
def extraccion_autores(scrapy_sel):
    """Extract author names and affiliation superscripts from a poster page.

    The authors are read from the second ``<ul>`` whose class attribute is
    exactly ``elementor-icon-list-items elementor-inline-items``; every
    ``span`` text node found below it is treated as one author entry.

    Parameters
    ----------
    scrapy_sel : object exposing ``xpath()`` (e.g. ``scrapy.Selector``)
        Selector built from the source of a poster page.

    Returns
    -------
    list
        ``[names, markers]`` where ``names`` is the list of cleaned author
        names and ``markers`` holds, for each author, the list of superscript
        characters found in that author's entry.

    Raises
    ------
    IndexError
        If the page contains fewer than two matching ``<ul>`` elements.

    Notes
    -----
    Parsing stops at the first entry that does not yield exactly one name:
    the problem is printed and the authors collected so far are returned.
    """
    nueva_lista_autores = []
    lista_afiliaciones = []
    autores_sel = scrapy_sel.xpath('//ul[@class="elementor-icon-list-items elementor-inline-items"]')[1]
    autores_lst = autores_sel.xpath('*//span/text()').extract()
    # Raw strings: a plain '\D' is an invalid string escape and triggers a
    # warning on recent Python versions.
    nombre_autor = re.compile(r'\D*[^¹²³⁴⁵⁶⁷⁸⁹𝄒1-9]')
    num_afiliacion = re.compile(r'[¹²³⁴⁵⁶⁷⁸⁹]')
    for texto_autor in autores_lst:
        match_nombre = nombre_autor.findall(texto_autor)
        match_afil = num_afiliacion.findall(texto_autor)
        if len(match_nombre) > 1:
            # More than one name-like chunk in a single entry: cannot tell
            # which one is the author, so stop here.
            print(f'Error en {match_nombre}')
            break
        if not match_nombre:
            # No name at all (e.g. an entry made only of superscripts).
            # Indexing [0] would raise IndexError, so treat it like the
            # error case above and report the raw text instead.
            print(f'Error en {texto_autor!r}')
            break
        # Trim, drop the separating commas, then trim again in case removing
        # a comma exposed more whitespace.
        nombre_limpio = match_nombre[0].strip().replace(',', '').strip()
        nueva_lista_autores.append(nombre_limpio)
        lista_afiliaciones.append(match_afil)
    return [nueva_lista_autores, lista_afiliaciones]

This function takes a scrapy Selector object, a list of authors and a list of affiliations and returns a pandas data frame object containing the poster authors and affiliations. 

In [220]:
def extraccion_afiliaciones(scrapy_sel, aut, afil):
    """Build the author/affiliation table for one poster.

    The affiliation entries are read from every ``span`` text node below a
    ``<ul>`` whose class attribute is exactly ``elementor-icon-list-items``.
    For each entry, the first digit is used as the affiliation number and the
    first run of non-digit characters as the affiliation name.

    Parameters
    ----------
    scrapy_sel : object exposing ``xpath()`` (e.g. ``scrapy.Selector``)
        Selector built from the source of a poster page.
    aut : list
        Author names, one per author.
    afil : list
        For each author, the list of superscript markers of that author.

    Returns
    -------
    pandas.DataFrame
        Columns ``autor`` and ``afiliacion``; the latter holds the value
        returned by ``afil_fun`` for each author's markers.
    """
    afil_lst = scrapy_sel.xpath('//ul[@class="elementor-icon-list-items"]//span/text()').extract()
    # Raw strings: plain '\d' / '\D' are invalid string escapes.
    afil_num = re.compile(r'\d')
    afil_name = re.compile(r'\D+')
    afil_dic = {}
    for texto_afil in afil_lst:
        numero = afil_num.search(texto_afil)
        nombre = afil_name.search(texto_afil)
        if numero is None or nombre is None:
            # The XPath can also pick up list items that are not numbered
            # affiliations; calling .group() on a failed search would raise
            # AttributeError, so such items are skipped.
            continue
        afil_dic[int(numero.group())] = nombre.group()
    df_autor_y_afil = pd.DataFrame(
        data={'autor': aut,
              'afiliacion': afil}
    )
    # Replace each author's superscript markers with the affiliation names.
    df_autor_y_afil['afiliacion'] = df_autor_y_afil['afiliacion'].apply(lambda x: afil_fun(x, afil_dic))
    return df_autor_y_afil

This function is used by the 'extraccion_afiliaciones' function to translate each author's superscript affiliation markers into the corresponding affiliation names.

In [210]:
def afil_fun(row, afil_dic):
    """Translate superscript affiliation markers into affiliation names.

    Parameters
    ----------
    row : sequence of str
        Superscript characters (``¹`` to ``⁹``) attached to one author.
    afil_dic : dict
        Maps the affiliation number (``int``) to the affiliation name.

    Returns
    -------
    list, object or None
        A list of names when ``row`` has several markers, the single name
        itself when it has exactly one, and ``None`` when it is empty.

    Raises
    ------
    KeyError
        If a marker is not a superscript 1-9 or its number is missing from
        ``afil_dic``.
    """
    # Superscript one to nine mapped to the integers 1..9.
    superscript_dic = dict(zip(
        '\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079',
        range(1, 10),
    ))

    afiliaciones = [afil_dic[superscript_dic[marca]] for marca in row]
    if not afiliaciones:
        # An author without any marker used to crash on row[0]; report
        # "no affiliation" instead.
        return None
    if len(afiliaciones) == 1:
        # A single affiliation is returned bare, not wrapped in a list.
        return afiliaciones[0]
    return afiliaciones

# Web Scraping

The following code executes the scraping itself and it is encapsulated within a function in the extraction.py file. 

First, we generate an empty pandas data frame to hold the results from the scraping process.  
Second, we instantiate the driver with selenium using Firefox (which requires geckodriver). We had to use selenium to simulate a browser environment so that some of the required code from the website could run and thus show the appropriate output to scrape.  
Third, we open the SAN 2020 website in the driver and extract the source code from the website to construct a scrapy Selector object, from which we can extract the URLs that contain all the posters.  

Then we loop over all the poster urls while adding a conditional statement to check it is indeed a poster url before opening the url in the driver. After this conditional statement has been evaluated to True, the driver is redirected to the poster url for that iteration after which the source code for the new website is extracted and used to create a scrapy Selector object.  
Still within the loop, from this object we can extract the poster title, topic, authors and affiliations. We deliver the extracted authors and affiliations to one of the helper functions that generates a data frame with both elements and then we add a column for the poster title and topic to said data frame as well as an indicator variable for the first author and the resulting data frame is appended to the final dataframe.

Once the loop has finished, the results are stored in a csv file. Then we extract the ids of the poster topics and the matching name for every poster topic. After cleaning the resulting strings, we generate a dictionary with the poster topic id as key and the corresponding name as value.

In [None]:
# Empty result table; it stays empty only if no poster page is found.
df_autores = pd.DataFrame(columns = ['autor', 'afiliacion', 'titulo','tema', 'primer_autor'])

san2020 = r'https://san2020.saneurociencias.org.ar/epostersbytopics/'
# NOTE(review): the Firefox driver is never closed in this cell; call
# driver.quit() once no later cell needs it.
driver = webdriver.Firefox()
driver.get(san2020)
san_sel = scrapy.Selector(text=driver.page_source)
todos_url = san_sel.xpath('//div[@class="elementor-row"]//a/@href').extract()

# One data frame per poster. They are concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table
# on every iteration.
tablas_posters = []

for u in todos_url:
    # Only links that point to an individual poster page are visited.
    if 'san2020.saneurociencias.org.ar/posters/' not in u:
        continue
    driver.get(u)
    poster_sel = scrapy.Selector(text = driver.page_source)
    titulo_poster = poster_sel.xpath('//title/text()').extract()[0]
    titulo_poster = titulo_poster.replace('– SAN2020','').strip()
    tema_poster = poster_sel.xpath('//div[@class="elementor-element elementor-element-d3567ab elementor-column elementor-col-50 elementor-top-column"]//h2[@class="elementor-heading-title elementor-size-default"]/a[@rel="tag"]/text()').extract()[0]
    # Sanity check: the topic found through the specific column must equal
    # the second tag link found through the generic heading query.
    if tema_poster == poster_sel.xpath('//h2[@class="elementor-heading-title elementor-size-default"]/a[@rel="tag"]/text()').extract()[1]:
        print('Extraccion de tema OK.')
    else:
        print(f'Inconsistencias en la extraccion del tema para poster:\n{titulo_poster}')
    autores, afiliaciones = extraccion_autores(poster_sel)
    aut_afil = extraccion_afiliaciones(poster_sel, autores, afiliaciones)
    aut_afil['titulo'] = titulo_poster
    aut_afil['tema'] = tema_poster
    # True for the first row only. list.insert() returns None, so the old
    # `primer_autor = primer_autor.insert(0, True)` left this column empty.
    # Sized from the frame so it also works when no author was extracted.
    primer_autor = [posicion == 0 for posicion in range(len(aut_afil))]
    aut_afil['primer_autor'] = primer_autor
    tablas_posters.append(aut_afil)

if tablas_posters:
    # Per-poster row indices are kept, as the previous append-based code did.
    df_autores = pd.concat(tablas_posters)

df_autores.to_csv('../SAN_csv/SAN_2020_autores.csv')

# Visible names of the topic links on the index page, in page order.
topics_name = san_sel.xpath('//div[@class="elementor-text-editor elementor-clearfix"]/p/a/text()').extract()

# Targets of the same links, with every '#' removed so only the id remains.
topics_id = [
    enlace.replace('#','')
    for enlace in san_sel.xpath('//div[@class="elementor-text-editor elementor-clearfix"]/p/a/@href').extract()
]

# Pair each topic id with the name found at the same position.
topic_dic = {}
for posicion, id_tema in enumerate(topics_id):
    topic_dic[id_tema] = topics_name[posicion]