In [214]:
import scrapy
import selenium
import pandas as pd
import numpy as np
import requests
import re
from itertools import combinations
from selenium import webdriver
from selenium.webdriver.common.by import By
from fuzzywuzzy import process

# Helper Functions

These functions are used to extract and clean the data from the website and are mainly focused on extracting the authors names and affiliations as well as establishing the relationship between the two. Unlike the pdf parsing functions, this one allow the clean extraction of affiliations and its relationship with every author. 

***

This function takes a scrapy selector object and returns a list containing two lists, one for the poster authors and one for the poster affiliations.

In [208]:
def extraccion_autores(scrapy_sel):
    #Lista vacia para contener los nombres de los autores depurados
    nueva_lista_autores = []
    #Lista vacia para contener los numeros de las afiliaciones de cada autor
    lista_afiliaciones = []
    #Selector de scrapy que contiene los nombres de los autores
    autores_sel = scrapy_sel.xpath('//ul[@class="elementor-icon-list-items elementor-inline-items"]')[1]
    #Extraccion de los nombres de los autores como una lista
    autores_lst = autores_sel.xpath('*//span/text()').extract()
    #Creacion de los regex para extraer los nombres y las afiliaciones
    nombre_autor = re.compile('\D*[^¹²³⁴⁵⁶⁷⁸⁹𝄒1-9]')
    num_afiliacion = re.compile('[¹²³⁴⁵⁶⁷⁸⁹]')
    #Iteracion por cada uno de los nombres de los autores tal como figuran en la pagina
    for a_i in range(len(autores_lst)):
        #Match del nombre del autor
        match_nombre = nombre_autor.findall(autores_lst[a_i])
        #Match de la afiliacion
        match_afil = num_afiliacion.findall(autores_lst[a_i])
        #Chequeo que el regex no haya encontrado mas que un solo elemento (un solo nombre)
        if len(match_nombre) > 1:
            print(f'Error en {match_nombre}')
            break
        #Remocion de los numeros en formato superscript
        nombre_sin_num = match_nombre[0].strip()
        #Remocion de las comas
        nombre_sin_coma = nombre_sin_num.replace(',','')
        #Remocion de los whitespaces
        nombre_sin_w = nombre_sin_coma.strip()
        #Appendeado del nuevo nombre
        nueva_lista_autores.append(nombre_sin_w)
        #Appendeado de la afiliacion
        lista_afiliaciones.append(match_afil)
    return([nueva_lista_autores, lista_afiliaciones])

This function takes a scrapy Selector object, a list of authors and a list of affiliations and returns a pandas data frame object containing the poster authors and affiliations. 

In [220]:
def extraccion_afiliaciones(scrapy_sel, aut, afil):
    #Extraccion de las instituciones que conforman las afiliaciones
    afil_lst = scrapy_sel.xpath('//ul[@class="elementor-icon-list-items"]//span/text()').extract()
    #Generacion de regex para separar los numeros del nombre de la institucion
    afil_num = re.compile('\d')
    afil_name = re.compile('\D+')
    #Diccionario para guardar las relacion entre el numero y la afiliacion
    afil_dic = {}
    #Iteracion por cada afiliacion extraida
    for a_f in afil_lst:
        #Asigno el numero de la afiliacion como key del diccionario y el nombre como value
        afil_dic[int(afil_num.search(a_f).group())] = afil_name.search(a_f).group()
    #Dataframe, cada fila contiene el autor y su afiliacion
    df_autor_y_afil = pd.DataFrame(
    data = {'autor':aut, 
            'afiliacion': afil}
            )
    #Convierto el superscript de la fila de afiliacion en un numero convirtiendo de lista a str y usando el str
    #como key del diccionario para obtener el numero (value)
    df_autor_y_afil['afiliacion'] = df_autor_y_afil['afiliacion'].apply(lambda x: afil_fun(x, afil_dic))
    return(df_autor_y_afil)

This function is used by the 'extraccion_afiliaciones' function to get rid of superscript numbers in the affiliations.

In [210]:
def afil_fun(row, afil_dic):

    superscript_dic = {
        chr(185): 1,
        chr(178): 2,
        chr(179): 3,
        chr(8308): 4,
        chr(8309): 5,
        chr(8310): 6,
        chr(8311): 7,
        chr(8312): 8,
        chr(8313): 9
    }
    #Chequeo de que la lista de la celda no tenga mas de un valor
    if len(row) > 1:
        res_lng = []
        #Itero por cada valor para reemplazar el numero de la referencia por el nombre de la institucion
        for af in row:
            #Convierto del caracter numerico superscript a un numero
            super_to_num = superscript_dic[af]
            #Obtengo el nombre de la institucion a partir del numero
            afil_ref = afil_dic[super_to_num]
            #Apendeo a la lista vacia
            res_lng.append(afil_ref)
        return(res_lng)
    else:
        #Convierto del caracter numerico superscript a un numero 
        super_to_num = superscript_dic[row[0]]
        #Obtengo el nombre de la institucion a partir del numero
        res_sht = afil_dic[super_to_num]
        return(res_sht)

# Web Scraping

The following code executes the scraping itself and it is encapsulated within a function in the extraction.py file. 

First generate an empty pandas data frame to hold the results from the scraping process.  
Second we instatiate the driver with selenium using firefox (which requires geckodriver). We had to use selenium to simulate a browser environment so that some of the required code from the website could run and thus showed the appropiate output to scrape.  
Third we open the SAN 2020 website in the driver and extract the source code from the website to construct a scrapy Selector object from where we can extract the urls that contain all the posters.  

Then we loop over all the poster urls while adding a conditional statement to check it is indeed a poster url before opening the url in the driver. After this conditional statement has been evaluated to True, the driver is redirected to the poster url for that iteration after which the source code for the new website is extracted and used to create a scrapy Selector object.  
Still within the loop, from this object we can extract the poster title, topic, authors and affiliations. We deliver the extracted authors and affiliations to one of the helper functions that generates a data frame with both elements and then we add a column for the poster title and topic to said data frame as well as an indicator variable for the first author and the resulting data frame is appended to the final dataframe.

Once the loop has finished the results are stored in a csv file. Then we extract the ids for the posters topics and the matching name for every poster topic. After cleaning the resulting strings we generate a dictionary with the poster topic id as key and the corresponding name as value. 

In [221]:
df_autores = pd.DataFrame(columns = ['autor', 'afiliacion', 'titulo','tema', 'primer_autor'])

#2020 Meeting URL which holds all the posters
san2020 = r'https://san2020.saneurociencias.org.ar/epostersbytopics/'
#Driver (browser) startup
driver = webdriver.Firefox()
#Redirect the driver to the 2020 Meeting URL
driver.get(san2020)
#Generate the scrapy Selector object using the source code from the website
san_sel = scrapy.Selector(text=driver.page_source)
#Lista con todas las url que contiene la pagina relativas a los posters
todos_url = san_sel.xpath('//div[@class="elementor-row"]//a/@href').extract()

#Iteracion por cada url de la lista
for u in todos_url:
    #Chequeo de que se trate de una url a un poster
    if 'san2020.saneurociencias.org.ar/posters/' in u:    
        #Redireccion del driver a la pagina del poster
        driver.get(u)
        #Generacion del Selector scrapy con el codigo de la pagina del poster
        poster_sel = scrapy.Selector(text = driver.page_source)
        #Extraccion del titulo del poster
        titulo_poster = poster_sel.xpath('//title/text()').extract()[0]
            #Remocion de elementos superfluos del titulo del poster
        titulo_poster = titulo_poster.replace('– SAN2020','').strip()
        #Extraccion del tema del poster 
        tema_poster = poster_sel.xpath('//div[@class="elementor-element elementor-element-d3567ab elementor-column elementor-col-50 elementor-top-column"]//h2[@class="elementor-heading-title elementor-size-default"]/a[@rel="tag"]/text()').extract()[0]
            #Chequeo de correcta extraccion del tema del poster usando otro metodo alternativo de extraccion
        if tema_poster == poster_sel.xpath('//h2[@class="elementor-heading-title elementor-size-default"]/a[@rel="tag"]/text()').extract()[1]:
            print('Extraccion de tema OK.')
        else:
            print(f'Inconsistencias en la extraccion del tema para poster:\n{titulo_poster}')
        #Extraccion de los autores y las afiliaciones
        autores_afil = extraccion_autores(poster_sel)
        #La primera lista contiene los autores y la segunda las afiliaciones
        autores,afiliaciones = autores_afil[0], autores_afil[1]
        #Dataframe con los nombres de los autores y los nombres de las afiliaciones
        aut_afil = extraccion_afiliaciones(poster_sel, autores, afiliaciones)
            #Agregado de columna con el titulo y el tema del poster
        aut_afil['titulo'] = titulo_poster
        aut_afil['tema'] = tema_poster
            #Agregado de columna para definir si es el primer autor
                #False para todos los autores menos el primero
        primer_autor = [False] * (len(autores) - 1)
                #Agrego True para el primer autor
        primer_autor = primer_autor.insert(0, True)
                #Agrego la lista a la columna de primer autor
        aut_afil['primer_autor'] = primer_autor
        #Agregado de los resultados al dataframe de autores final
        df_autores = df_autores.append(
            aut_afil
        )

df_autores.to_csv('../SAN_csv/SAN_2020_autores.csv')

#Extraccion de los IDs de los topicos de los posters
topics_id = san_sel.xpath('//div[@class="elementor-text-editor elementor-clearfix"]/p/a/@href').extract()

#Extraccion de los nombres correspondientes a cada ID de los topicos de los posters
topics_name = san_sel.xpath('//div[@class="elementor-text-editor elementor-clearfix"]/p/a/text()').extract()

#Remocion del # de cada string de la lista
topics_id = [top.replace('#','') for top in topics_id]

#Creacion de diccionario con los topics id como key y los nombres del topic como value
topic_dic = {}
for i in range(len(topics_id)):
    topic_dic[topics_id[i]] = topics_name[i]

Extraccion de tema OK.
Poster:Assessing the distribution of tanycyte processes and their vascular contacts within the basal hypothalamus of mice
Tema:Cellular and Molecular Neurobiology
Autores:['Maia Uriarte', 'Tomás Crespo', 'Mirta Reynaldo', 'Paula Reggiani', 'Rodolfo Goya', 'Mario Perelló', 'Pablo Nicolás De Francesco']

Extraccion de tema OK.
Poster:Chemical-LTP induce changes in the acetylation state of synaptic protein and PSD95 clustering in cultures of mice hippocampal neurons
Tema:Cellular and Molecular Neurobiology
Autores:['Sebastián Rivas', 'María Florencia Acutain', 'Verónica Báez', 'Ramiro Freudenthal']

Extraccion de tema OK.
Poster:Exogenous ketone bodies ameliorate neurodevelopmental defects associated with daf-18/PTEN mutations
Tema:Cellular and Molecular Neurobiology
Autores:['Sebastián Giunti', 'Pamela Azcona', 'María José De Rosa', 'Diego Rayes']

Extraccion de tema OK.
Poster:Genetic risk factors for Alzheimer’s Disease in adults with Down Syndrome in the Argenti

Extraccion de tema OK.
Error en [', Felipe Court ', ',', ',']
Poster:Mesenchymal stem cell-derived Schwann cell exosomes promote neurite outgrowth and axonal protection in vitro.
Tema:Cellular and Molecular Neurobiology
Autores:['Paula Andrea Soto', 'Rodrigo López Leal', 'Patricia Setton-Avruj']

Extraccion de tema OK.
Poster:Mitochondrial function in mouse brain cortex synaptosomes during aging. Alterations in motor performance
Tema:Cellular and Molecular Neurobiology
Autores:['Paulina Lombardi', 'Analía G Karadayian', 'Juan I. Guerra', 'Rodolfo Cutrera', 'Juanita Bustamante']

Extraccion de tema OK.
Poster:Neuropilin-2 (NRP2) but not neuropilin-1 (NRP1) regulates sympathetic axon outgrowth inhibition by estrogenized myometrial explants in 3D co-cultures
Tema:Cellular and Molecular Neurobiology
Autores:['Analía Richeri', 'Gaby Martínez', 'Gabriela Vierci', 'Ileana Sosa', 'Mónica Brauer', 'Alfonso Taboada']

Extraccion de tema OK.
Poster:Quantitative analysis of the neuroprotective eff

Extraccion de tema OK.
Poster:6-OHDA-induced dopaminergic neuron degeneration exacerbates anxiety-related behaviors in BDNFMet/Met mice
Tema:Cognition, Behavior, and Memory
Autores:['Constanza Milena Jandar Paz', 'Agustín Anastasía']

Extraccion de tema OK.
Poster:α-MSH modulation of the impairment in contextual fear memory induced by High-fat diet consumption; possible glial changes.
Tema:Cognition, Behavior, and Memory
Autores:['Guadalupe Herrera', 'Mercedes Lasaga', 'Teresa Scimonelli']

Extraccion de tema OK.
Poster:Understanding a complex task: An exploratory study using a web‐based Trail Making Test digital version
Tema:Cognition, Behavior, and Memory
Autores:['Gustavo Ezequiel Juantorena', 'Ignacio Linari', 'Agustín Petroni', 'Juan Esteban Kamienkowski']

Extraccion de tema OK.
Poster:Synaptic output of dopaminergic neurons only modulate positively contextual memory in Drosophila
Tema:Cognition, Behavior, and Memory
Autores:['Melina Petrissans', "Carolina D'Odorico", 'Micaela Vi

Extraccion de tema OK.
Poster:Motor-language coupling in bilinguals: Insights from a keyboard writing task in L1 and L2
Tema:Cognition, Behavior, and Memory
Autores:['Boris Kogan', 'Enrique García-Marco', 'Agustina Birba', 'Camila Cortés', 'Margherita Melloni', 'Agustín Ibáñez', 'Adolfo García']

Extraccion de tema OK.
Poster:Learning and memory in Drosophila melanogaster during predation exposure, by the spider Menemerus semilimbatus
Tema:Cognition, Behavior, and Memory
Autores:['Christian Carpio Romero', 'Lia Frenkel', 'Ramiro Freudenthal']

Extraccion de tema OK.
Poster:Functional connectivity of the retrosplenial cortex in object recognition memory formation
Tema:Cognition, Behavior, and Memory
Autores:['Ana Belén de Landeta', 'Magdalena Pereyra', 'Magdalena Miranda', 'Pedro Bekinschtein', 'Jorge H Medina', 'Cynthia Katche']

Extraccion de tema OK.
Poster:Familiarity and Confidence Could Prime Complex Decision Making With Social Implications
Tema:Cognition, Behavior, and Memory
Aut

Extraccion de tema OK.
Poster:Analysis of temporal structures using MRI, VBM and their histological correlation in patients with surgical resistant temporal epilepsy.
Tema:Disorders of the Nervous System
Autores:['Ernesto Gonzalez Stivala', 'Hernán Kulsgaard', 'Micaela Sanzo', 'Mariana Bendersky', 'Juan Pablo Princich', 'Lucía Alba Ferrara', 'Ignacio Larrabide', 'Silvia Kochen', 'Fabián Loidl', "Luciana D'Alessio"]

Extraccion de tema OK.
Poster:Altered neurovisceral responses to social and cognitive stress  in hypertensive disease: a multidimensional approach
Tema:Disorders of the Nervous System
Autores:['Agustina Legaz', 'Adrián Yoris', 'Lucas Sedeño', 'Sofía Abrevaya', 'Miguel Martorell', 'Florencia Alifano', 'Adolfo M. García', 'Agustín Ibañez']

Extraccion de tema OK.
Poster:Action semantics and the motor system: A neuromodulatory study on Parkinson’s disease patients
Tema:Disorders of the Nervous System
Autores:['Mariano N. Díaz-Rivera', 'Diana M.A. Suárez-García', 'Agustina Birb

Extraccion de tema OK.
Poster:New insights into the range of behaviors induced by the dopamine D1 agonist SKF-38393 in normal and hemiparkinsonian mice
Tema:Neurochemistry and Neuropharmacology
Autores:['Liliana T. Tribbia', 'Juan E. Belforte', 'Gustavo Murer', 'Oscar S. Gershanik', 'Irene R.E. Taravini']

Extraccion de tema OK.
Poster:Long term effect of social isolation during adolescence on β-catenin levels as well as on anxiety: role of dopamine neurotransmission
Tema:Neurochemistry and Neuropharmacology
Autores:['Alejandrina Funes', 'Cintia N. Konjuh', 'Silvana B. Rosso', 'Alejandra M. Pacchioni']

Extraccion de tema OK.
Poster:Involvement of cannabinoid CB1 receptor in stress-induced enhancement of extracellular glutamate in nucleus accumbens core after extinction of cocaine-conditioned place preference
Tema:Neurochemistry and Neuropharmacology
Autores:['Andrea Susana Guzman', 'María Paula Avalos', 'Pia Valentina Euliarte', 'Marianela Adela Sanchez', 'Daiana Rigoni', 'Julieta Boe

Extraccion de tema OK.
Poster:Graph Theory tools for characterize Motor/Imaginary Movements in EEG
Tema:Sensory and Motor Systems
Autores:['Román Baravalle', 'Natalí Guisande', 'Monserrat Pallares', 'Federico Miceli', 'Mauro Granado', 'Fernando Montani']

Extraccion de tema OK.
Poster:Motor replays of song during sleep in a suboscine bird
Tema:Sensory and Motor Systems
Autores:['Juan F. Döppler', 'Ana Amador', 'Franz Goller', 'Gabriel B. Mindlin']

Extraccion de tema OK.
Poster:Molecular mechanisms of cell death in a mouse model of progressive hearing loss
Tema:Sensory and Motor Systems
Autores:['Camila Carignano', 'Marcela Vera', 'Leonardo Dionisio', 'Eugenio Aztiria', 'Ezequiel Rías', 'Guillermo Spitzmaul']

Extraccion de tema OK.
Poster:Innate and acquired bases of discrimination and generalization between odors
Tema:Sensory and Motor Systems
Autores:['Agustin Lara', 'Emiliano Marachlian', 'Fernando Locatelli']

Extraccion de tema OK.
Poster:EVIDENCE FOR ACTIVITY-DEPENDENT AXONAL AR

In [290]:
instituciones_afiliacion = poster_sel.xpath('//ul[@class="elementor-icon-list-items"]//span/text()')

In [337]:
spam = pd.DataFrame(
    data = {'autor':extraccion_autores(poster_sel)[0], 
            'afiliacion': extraccion_autores(poster_sel)[1]}
            )
spam['afiliacion'] = spam['afiliacion'].apply(lambda x: superscript_dic[x[0]])

In [365]:
poster_sel.xpath('//div[@class="elementor-element elementor-element-d3567ab elementor-column elementor-col-50 elementor-top-column"]//h2[@class="elementor-heading-title elementor-size-default"]/a[@rel="tag"]/text()').extract()[0]

'Cognition, Behavior, and Memory'

In [388]:
extraccion_afiliaciones(poster_sel, extraccion_autores(poster_sel)[0], extraccion_autores(poster_sel)[1])

Unnamed: 0,Autor1,Autor2,autor,afiliacion


In [393]:
spam = pd.DataFrame(combinations(extraccion_autores(poster_sel)[0], 2), columns = ['Autor1', 'Autor2'])

In [197]:
poster_sel = scrapy.Selector(text = driver.page_source)

In [244]:
superscript_dic.keys()

dict_keys(['¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'])

In [57]:
topic19.xpath('*//h2[@class="elementor-heading-title elementor-size-default"]/text()')

[<Selector xpath='*//h2[@class="elementor-heading-title elementor-size-default"]/text()' data='Chronobiology'>]

In [79]:
topic19.xpath('*//div[@class="elementor-inner"]//p/a/@href').extract()

['https://san2020.saneurociencias.org.ar/posters/mating-alters-the-function-of-circadian-clock/',
 'https://us04web.zoom.us/j/74656538778?pwd=NHVkcnpvcG0yaVU0K0FTT0kvVGgwUT09',
 'https://san2020.saneurociencias.org.ar/posters/exploring-the-roles-of-gaba-in-the-sleep-circuit-of-drosophila/',
 'https://meet.google.com/ocb-hyqb-pey',
 'https://san2020.saneurociencias.org.ar/posters/consequences-of-one-year-of-antarctic-isolation-on-sleep-chronotype-and-social-jetlag/',
 'https://us04web.zoom.us/j/71224331711?pwd=OWxMZ0hjV2t6Wkl6cERyTnQrdC9Mdz09']