In [1]:
import os
from bs4 import BeautifulSoup
import re
import pandas as pd
from iteration_utilities import duplicates
from tqdm import tqdm
import seaborn as sns
from itertools import repeat
import numpy as np
from collections import Counter
pd.options.mode.chained_assignment = None  # default='warn'

# Functions and import data

In [2]:
def removeFile(fileList, file):
    """Remove the first occurrence of ``file`` (a string) from ``fileList`` in place.

    The same list object is returned; it is left untouched when ``file``
    is not one of its entries.
    """
    try:
        fileList.remove(file)
    except ValueError:
        pass  # entry not present: nothing to remove

    return fileList

In [3]:
# Folders that hold the scraped HTML files, one folder per page kind.
speakerPath = "Files/InvitedSpeakers"
presenterPath = "Files/Presenters"
chairPath = "Files/ChairIndex"
pagesPath = "Files/Pages"
sessionPath = "Files/Sessions"

# File listings, in the same order as the folders above, each with the
# '.DS_Store' entry dropped:
#   speakers   - invited speakers for 2005-2023
#   presenters - all presenters for 2005-2023
#   chair      - chair index for 2005-2023
#   pages      - pages showing overview of sessions for 1994-2004
#   sessions   - overview of subsessions for 1994-2004
speakers, presenters, chair, pages, sessions = [
    removeFile(os.listdir(folder), '.DS_Store')
    for folder in (speakerPath, presenterPath, chairPath, pagesPath, sessionPath)
]

In [4]:
def getSoupFromFile(path, file):
    """Parse one scraped HTML file into a BeautifulSoup tree.

    Args:
        path: Folder that contains the file.
        file: File name inside ``path``.

    Returns:
        The ``BeautifulSoup`` object built from the file's contents.
    """
    # The context manager closes the handle as soon as parsing is done; the
    # bare open() used before left one file descriptor open per parsed file.
    with open(path + '/' + file) as handle:
        # NOTE(review): no parser is named, so bs4 chooses one from whatever is
        # installed - consider pinning it so every machine parses the same way.
        soup = BeautifulSoup(handle)

    return soup

In [5]:
def getYear(regex, file):
    """Return the year number that ``regex`` finds in the file name ``file``.

    The first match is converted to ``int``; a file name without any match
    raises ``IndexError``.
    """
    matches = re.findall(regex, file)
    firstMatch = matches[0]

    return int(firstMatch)

In [6]:
#import from the other notebook, instead of repeating it
def getText(soup, regex, find_type, attribute):
    """Collect the text of every ``find_type`` tag that matches ``regex``.

    ``attribute`` is a string. With ``'text'`` the tag's string is matched
    against ``regex``; with any other value that HTML attribute is matched
    instead. Matching ignores case.
    """
    pattern = re.compile(regex, re.IGNORECASE)

    if attribute == 'text':
        tags = soup.find_all(find_type, string = pattern)
    else:
        tags = soup.find_all(find_type, attrs = {attribute: pattern})

    return [tag.text for tag in tags]

In [7]:
def reverseName(name):
    """Turn ``'Last, First '`` into ``'First Last'``.

    The placeholder ``'TBD, '`` is returned as is. Any other input must
    contain ``', '`` and a trailing space, otherwise ``IndexError`` is raised.
    """
    if name == 'TBD, ':
        return name

    lastName, firstName = re.findall(r'(.*), (.*) ', name)[0]

    return firstName + " " + lastName

In [8]:
def speakerInfo(soup, year, dict_, index):
    """Add one record per table row of an invited-speaker page (page type 1).

    Each record stores the year, the speaker's name (first name first), the
    university/department, the presentation title and the session title.
    Records are written to ``dict_`` under consecutive keys starting at
    ``index``; the dict and the next free index are returned.
    """
    for row in soup.table.findAll('tr'):
        name = reverseName(row.strong.text) #invited speaker, first name first
        university = getText(row, r'2', 'font', 'size')[0] #university/department
        presentationInfo = row.findAll('b')[0].text #info about presentation/abstract
        sessionTitle = re.findall(r'Session (.*) ', presentationInfo)[0]
        title = re.findall(r'\n(.*)', presentationInfo)[0] #title of presentation/article

        dict_[index] = {
            'Year': year,
            'Name': name,
            'University': university,
            'Title': title,
            'SessionTitle': sessionTitle,
        }
        index += 1

    return dict_, index

In [9]:
def getSessionTitle(soup):
    """Return the text after ``'Event - '`` in the page's first <title> tag.

    Used for presenter pages of file type 1. ``'Unknown'`` is returned when
    the page has no <title> tag.
    """
    titleTags = soup.findAll('title')
    if not titleTags:
        return 'Unknown'

    return re.findall(r'Event - (.*)', titleTags[0].text)[0]

In [10]:
def getSponsoringUnit(soup):
    """Return the sponsoring unit of a session page (file type 1).

    The first string containing ``'Sponsoring'`` is located; the unit is the
    line that follows the line holding the colon. The string ``'None'`` is
    returned when the page has no such string.
    """
    sponsorText = soup.find(string = re.compile(r'Sponsoring'))

    return re.findall(r':.*\n(.*)', sponsorText)[0] if sponsorText else 'None'

In [11]:
def getSponsoringDict(soup, year, dict_, index):
    """Add one sponsoring-unit record per <a> tag of an overview page.

    The node just before each link is searched for a sponsor (text between
    a newline-plus-space and a colon). When one is found, the session name is
    read from the node before that; otherwise the record gets the sponsoring
    unit ``'None'`` and the session ``'Unknown'``. Records are stored in
    ``dict_`` from key ``index`` on; the dict and next free index are returned.
    """
    for link in soup.find_all('a'):
        sponsorHits = re.findall(r'\n (.*):', link.previous)

        if sponsorHits:
            sponsoringUnit = sponsorHits[0]
            session = re.findall(r'Session (.*).', link.previous.previous)[0]
        else:
            sponsoringUnit, session = 'None', 'Unknown'

        dict_[index] = dict(Year=year, SessionTitle=session, SponsoringUnit=sponsoringUnit)
        index += 1

    return dict_, index

In [12]:
def findText(soup, findType, attribute):
    """Return every ``findType`` tag in ``soup`` whose attributes match the dict ``attribute``."""
    return soup.findAll(findType, attrs = attribute)

In [13]:
def getAbstractInfo(soup):
    """Return the abstract identifier of a presenter page (file type 1).

    Every <h3> tag is inspected; when the node right after the tag equals
    ``'Abstract: '`` the text of the node after that is taken. If several
    headings match, the last one wins.

    Returns:
        The identifier text, or ``"Not found"`` when no <h3> tag matches.
    """
    sessionTitle = "Not found"

    for heading in soup.findAll('h3'):
        # Only a matching heading may change the result. Previously every
        # non-matching <h3> reset the value to "Not found", so a hit was lost
        # whenever another <h3> followed the abstract heading.
        if heading.next == 'Abstract: ':
            sessionTitle = heading.next.next.text

    return sessionTitle

In [14]:
def getPresenterInfo(soup, year, dict_, index, sessionIndex):
    """Add one record per author of a presenter page (page type 1).

    Page-level values are read once: 'Title' comes from getSessionTitle,
    'SessionTitle' from getAbstractInfo and 'SponsoringUnit' from
    getSponsoringUnit. Each <span class="largernormal"> tag supplies an
    author (the line between the first two newlines) and, when present, a
    university in parentheses ('Unknown' otherwise). Authors are numbered
    from 1 in 'AuthorIndex'; tags with an empty author line are skipped.

    Records are stored in ``dict_`` from key ``index`` on; the dict and the
    next free index are returned.
    """
    title = getSessionTitle(soup) #get title of session
    presenterTags = findText(soup, 'span', {'class': 'largernormal'})
    sessionTitle = getAbstractInfo(soup)
    sponsoringUnit = getSponsoringUnit(soup)

    authorPosition = 0
    for tag in presenterTags:
        tagText = tag.text
        author = re.findall(r'^\n(.*)\n', tagText)[0]
        if not author:
            continue

        authorPosition += 1
        affiliation = re.findall(r'\((.*)\)', tagText)

        dict_[index] = {
            'Year': year,
            'Name': author,
            'University': affiliation[0] if affiliation else 'Unknown',
            'AuthorIndex': authorPosition,
            'Title': title,
            'SponsoringUnit': sponsoringUnit,
            'SessionTitle': sessionTitle,
            'SessionIndex': sessionIndex,
        }
        index += 1

    return dict_, index

In [15]:
def splitName(nameInfo, splitBy):
    """Split ``'<name><splitBy><university>'`` into name and university.

    Only the first occurrence of ``splitBy`` is used as the boundary, so an
    institution that itself contains the separator (for example
    'Name, Department, University') is kept whole. Previously such input had
    more than two parts and was reported as 'Unknown'.

    Args:
        nameInfo: Text holding the name, optionally followed by the university.
        splitBy: Separator between name and university.

    Returns:
        ``(name, university)``; the university is ``'Unknown'`` when
        ``splitBy`` does not occur in ``nameInfo``.
    """
    name, separator, university = nameInfo.partition(splitBy)
    if not separator: #no university written
        return name, 'Unknown'

    # strip scraping residue: space + non-breaking space, and space + ')'
    university = re.sub(r' \xa0', r'', university)
    university = re.sub(r' \)', r'', university)

    return name, university

In [16]:
def checkString(string, regex):
    """Return True when ``regex`` matches somewhere in ``string``, else False."""
    return bool(re.findall(regex, string))

In [17]:
def checkNames(nameList):
    """Keep the tags from ``nameList`` whose text looks like a list of names.

    A tag is dropped when its text is empty, 'both' or 'smaller', or when it
    contains 'session', 'Room', a bracketed part or a blank line.
    """
    placeholders = ('', 'both', 'smaller')
    rejectPatterns = (r'session', r'Room', r'\[.*\]', r'\n\n')

    names = []
    for tag in nameList:
        text = tag.get_text()
        if text in placeholders:
            continue
        if any(checkString(text, pattern) for pattern in rejectPatterns):
            continue
        names.append(tag)

    return names

In [18]:
def checkTitle(titleString):
    """Check whether ``titleString`` is a talk title of the form ``'[...] title'``.

    Returns ``(True, title)`` for a real title. For 'Break' entries and for
    strings without the bracket prefix, ``(False, matches)`` is returned,
    where ``matches`` is the (possibly empty) list of regex matches.
    """
    matches = re.findall(r'\[.*\] (.*)', titleString)

    if matches and matches[0] != 'Break':
        return True, matches[0]

    return False, matches

In [19]:
def checkUniversity(string):
    """Return True when ``string`` contains a parenthesised part, i.e. a university.

    The pattern is written as a raw string: in a normal string literal
    ``'\\('`` and ``'\\)'`` are invalid escape sequences, which newer Python
    versions warn about.
    """
    return bool(re.search(r'\(.*\)', string))

In [20]:
def getNameInfo(string):
    """Split ``'name (university)'`` into ``(university, name)``.

    The university is the text between the first '(' and the last ')'.
    The name is everything before the last '(' with all spaces removed.
    """
    insideParentheses = re.findall(r'\((.*)\)', string)[0]
    beforeParenthesis = re.findall(r'(.*)\(', string)[0]

    return insideParentheses, beforeParenthesis.replace(' ', '')

In [21]:
def getUni_and_Name(string):
    """Pair every author in ``string`` with a university.

    ``string`` is split on commas that are not inside parentheses. An entry
    with a parenthesised university yields ``(name, university)``. Entries
    without one are held back and receive the university of the next entry
    that has one.

    The earlier version kept one ever-growing university list and zipped the
    held-back names against its start, so from the second affiliation group
    on, names were paired with an earlier group's university.

    Names after the last parenthesised entry get no university and are left
    out of the result (unchanged behaviour).

    Returns:
        List of ``(name, university)`` tuples.
    """
    nameInfo = []
    pending = [] #names still waiting for a university

    for entry in re.split(r',\s*(?![^()]*\))', string):
        if checkUniversity(entry): #name is written together with a university
            university, name = getNameInfo(entry)
            nameInfo.append((name, university))
            # the held-back names belong to this university
            nameInfo.extend((waiting, university) for waiting in pending)
            pending = []
        else:
            pending.append(entry)

    return nameInfo

In [22]:
def getSpeakers(fileList, path, yearRegex):
    """Build the dict of invited speakers from all files in ``fileList`` (page type 1).

    Each file in ``path`` is parsed, its year is read from the file name
    with ``yearRegex``, and speakerInfo adds that file's records.
    """
    speaker_dict, index = {}, 0

    for file in fileList:
        soup = getSoupFromFile(path, file)
        year = getYear(yearRegex, file)
        speaker_dict, index = speakerInfo(soup, year, speaker_dict, index)

    return speaker_dict

In [23]:
def getPresenters(fileList, path, yearRegex):
    """Build the dict of presenters from all files in ``fileList`` (page type 1).

    Every file gets its own session index: its position in ``fileList``,
    counted from 0. A tqdm progress bar is shown while the files are parsed.
    """
    presenter_dict, index = {}, 0

    for sessIndex, file in enumerate(tqdm(fileList)):
        soup = getSoupFromFile(path, file)
        year = getYear(yearRegex, file)
        presenter_dict, index = getPresenterInfo(soup, year, presenter_dict, index, sessIndex)

    return presenter_dict

In [24]:
def getChairIndex1(soup, year, dict_, index):
    """Add one chair record per matching table cell (file type 1).

    The cells are the <td> tags with valign='top' and no align attribute.
    Their text is split at ',' into name and university. Records are stored
    in ``dict_`` from key ``index`` on; the dict and next free index are returned.
    """
    for cell in findText(soup, 'td', {'align': None, 'valign': 'top'}):
        chairName, institution = splitName(cell.text, ',')
        dict_[index] = dict(Year=year, Name=chairName, University=institution)
        index += 1

    return dict_, index

In [25]:
def getChairIndex2(soup, year, dict_, index):
    """Add one chair record per 'Chair: ...' line of the page (file type 2).

    The rest of each such line is split at ',' into name and university.
    Records are stored in ``dict_`` from key ``index`` on; the dict and the
    next free index are returned.
    """
    for chairLine in re.findall(r'Chair: (.*)', str(soup)):
        chairName, institution = splitName(chairLine, ',')
        dict_[index] = dict(Year=year, Name=chairName, University=institution)
        index += 1

    return dict_, index
 

In [26]:
def getChair(fileList1, fileList2, path1, path2, yearRegex1, yearRegex2):
    """Build the chair index from files of type 1 and type 2.

    The type 1 files are handled first with getChairIndex1, then the type 2
    files with getChairIndex2. Both write into one dict with a shared
    running index.
    """
    sources = (
        (fileList1, path1, yearRegex1, getChairIndex1),
        (fileList2, path2, yearRegex2, getChairIndex2),
    )

    chair_dict, index = {}, 0
    for files, folder, yearRegex, addChairs in sources:
        for file in files:
            soup = getSoupFromFile(folder, file)
            year = getYear(yearRegex, file)
            chair_dict, index = addChairs(soup, year, chair_dict, index)

    return chair_dict

In [27]:
def getSessionType(info):
    """Return the lower-cased session type found in ``info``.

    ``info`` is converted to a string; the type is the text between '<i>'
    and ' session'. ``IndexError`` is raised when that pattern is missing.
    """
    markup = str(info)
    found = re.findall(r'<i>(.*) session', markup)

    return found[0].lower()

In [28]:
def getSponsoringUnit_pages(fileList, path, yearRegex):
    """Collect the sponsoring units from the overview pages (file type 2).

    Each file in ``path`` is parsed, its year is read from the file name
    with ``yearRegex``, and getSponsoringDict adds that file's records.

    Returns:
        Dict of records as produced by getSponsoringDict.
    """
    sponsoringDict = {}
    index = 0
    for file in tqdm(fileList):
        soup = getSoupFromFile(path, file)
        year = getYear(yearRegex, file)
        # Bind the result to the dict that is returned. It used to go to a
        # left-over name 'speaker_dict', which only worked because
        # getSponsoringDict fills the dict it receives in place.
        sponsoringDict, index = getSponsoringDict(soup, year, sponsoringDict, index)

    return sponsoringDict

In [29]:
def getSession(fileList, path, yearRegex):
    """Collect presenter records from the session pages (file type 2).

    For each file the session name is read from the <title> tag, the talk
    titles from the <h2> tags and the names from the <i> tags. One record
    is written per person, holding Year, Name, University, Title,
    SessionType, SessionTitle and SessionIndex.

    Changes from the earlier version:
      * ``sessionTitle`` is reset to 'Unknown' for every file. Before, a
        file whose <title> matched neither pattern silently reused the
        title of the previous file (or raised NameError on the first file).
      * The list of name tags has its own variable instead of overwriting
        ``sessionName``.

    Returns:
        Dict of records keyed by a running index.
    """
    sessionDict = {}
    index = 0
    sessIndex = 0
    for file in tqdm(fileList):
        soup = getSoupFromFile(path, file)
        year = getYear(yearRegex, file)
        sessionInfo = soup.findAll('i') #info about session
        headings = soup.findAll('h2')
        pageTitle = soup.findAll('title')[0].text

        sessionTitle = 'Unknown' #never carry a title over from another file
        sessionName = re.findall(r'Session (.*) -', pageTitle, re.IGNORECASE)
        if not sessionName: #try another way:
            sessionName = re.findall(r'Session (.*),', pageTitle, re.IGNORECASE)
        if sessionName:
            sessionTitle = sessionName[0]

        for heading in headings:
            isTitle, talkTitle = checkTitle(heading.text)
            if not isTitle:
                continue

            sessionType = getSessionType(sessionInfo) #session type
            # names start at the third <i> tag; keep only tags that look like names
            nameTags = checkNames(sessionInfo[2:])
            # NOTE(review): every accepted <h2> title is combined with every
            # name tag of the page, so a page with several talks repeats all
            # names under each title - confirm this is intended.
            for nameTag in nameTags:
                for name, university in getUni_and_Name(nameTag.text):
                    sessionDict[index] = {'Year': year, 'Name': name, 'University': university, 'Title': talkTitle, 'SessionType': sessionType, 'SessionTitle': sessionTitle, 'SessionIndex': sessIndex}
                    index += 1
                sessIndex += 1
    return sessionDict

In [30]:
def subCharacters(string):
    """Replace LaTeX-style accent escapes in ``string`` with the real letters.

    ``subFrom`` holds the regex patterns and ``subTo`` the replacement at
    the same position. The pairs are applied one after another in list order.
    The order matters: later patterns rely on what earlier ones leave behind.
    """
    subFrom = [r'{\\AA}', r'\\\'', r'\"{o}', r'\"o', r'\"O', '\"u', r'\"{u}', r'\"a', r'\"{a}', r'\"{A}', r'{\\o}', r'{\\O}', r'~{a}', r'\\v{z}', r'\'{a}', r'\\v{c}', r'\\v{s}', r'\'{c}', r'\\ü', r'\\c{c}', r'\'{e}', r'\\~{n}', r'{\\ä}', r'\\ö', r'{\\ss}', r'\\v{e}', r'\'I', r'\\v{r}', r'\\v{S}', r'\\\^{e}', r'\\r{A}', r'\\c{S}', r'\'{\\i}', r'{\\" o}', r'\\\^{o}', r'\'{o}', r'\\`{e}', r'{\'e}', r'\\o{}', r'\\v{Z}', r'\'c', r'{ü}', r'\\c{s}'] 
    subTo = [r'Å', r"'", r'ö', r'ö', r'Ö', r'ü', r'ü', r'ä', r'ä', r'Ä', r'ø', r'Ø', r'ã', r'ž', r'á', r'č', r'š', 'ć', r'ü', r'ç', r'é', r'ñ', r'ä', r'ö', r'ß', r'ě', r'í', r'ř', r'Š', r'ê', r'Å', r'Ş', r'í', r'ö', r'ô', r'ó', r'è', r'é', r'ø', r'Ž', r'ć', r'ü', r'ş']

    for pattern, replacement in zip(subFrom, subTo):
        string = re.sub(pattern, replacement, string)

    return string

In [31]:
def removeString(stringList, list_):
    """Remove each string of ``stringList`` from ``list_`` in place (first occurrence only).

    Strings that are not in ``list_`` are ignored. The same list object is returned.
    """
    for unwanted in stringList:
        try:
            list_.remove(unwanted)
        except ValueError:
            continue #not in the list
    return list_

In [32]:
def makeSpace(string):
    """Insert a space before each capital letter that directly follows a word character.

    Matches do not overlap, so in a run of capitals only every second
    boundary receives a space.
    """
    boundary = re.compile(r"(\w)([A-Z])")

    return boundary.sub(r"\1 \2", string)

# Data Parsing

## Get Dictionaries

In [33]:
#invited speakers (page type 1); the regex captures the year digits after 'Speakers' in each file name
speakerDict = getSpeakers(speakers, speakerPath, r'Speakers(\d+)')

In [34]:
#all presenters (page type 1), one session index per file; slow - the recorded run below took about 14 minutes
presenterDict = getPresenters(presenters, presenterPath, r'Presenter(\d+)')

100%|██████████████████████████████████| 172116/172116 [13:56<00:00, 205.79it/s]


In [35]:
#chair index: the chair files are read as file type 1, the overview pages as file type 2
chairDict = getChair(chair, pages, chairPath, pagesPath, r'Chair(\d+)', r'Pages(\d+)')

In [36]:
#sponsoring unit per session, read from the overview pages (file type 2)
sessionNames = getSponsoringUnit_pages(pages, pagesPath, r'Pages(\d+)')

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 23.66it/s]


In [37]:
#presenters per session, read from the session pages (file type 2)
sessionDict = getSession(sessions, sessionPath, r'Session(\d+)')

100%|███████████████████████████████████████| 4506/4506 [02:02<00:00, 36.65it/s]


## Get dataframes

In [38]:
#invited speakers for file type 1: one row per record, the dict keys become the row index
df_speaker = pd.DataFrame.from_dict(speakerDict, orient = 'index') #invited speakers for file type 1

In [39]:
#substitute the LaTeX-style escapes in names and universities with the real letters
for column in ('Name', 'University'):
    df_speaker[column] = df_speaker[column].apply(subCharacters)

In [40]:
#all presenters (including invited speakers) for file type 1: one row per record, the dict keys become the row index
df_presenter = pd.DataFrame.from_dict(presenterDict, orient = 'index') #all presenters(including invited speakers) for file type 1

In [41]:
#substitute the LaTeX-style escapes in names and universities with the real letters
for column in ('Name', 'University'):
    df_presenter[column] = df_presenter[column].apply(subCharacters)

In [42]:
#drop rows that repeat the same Year, Name, Title, SessionTitle and SessionIndex (the first occurrence is kept)
df_presenter = df_presenter.drop_duplicates(subset = ['Year', 'Name', 'Title', 'SessionTitle', 'SessionIndex'])

In [43]:
#df with Year, Name, SponsoringUnit and SessionTitle of every presenter
#NOTE(review): this frame is not used again in this notebook - confirm it is still needed
sponsoringUnit = df_presenter[['Year', 'Name', 'SponsoringUnit', 'SessionTitle']]

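Left join of the invited speakers (left table) with the presenters for file type 1. This gives a dataframe with every invited speaker, extended with the author index, sponsoring unit and session index from the presenter dataframe where a match is found. The invited speakers are assigned the session type 'invited'. Presenters whose session index is not among the invited session indices are assigned the session type 'attendees'.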

In [44]:
#left join of df_speaker and df_presenter on Year, Name and SessionTitle: every invited speaker is kept,
#and AuthorIndex, SponsoringUnit and SessionIndex are filled in where a presenter row matches (NaN otherwise)
invitedSpeakers = pd.merge(df_speaker, df_presenter[['Year', 'Name', 'AuthorIndex', 'SponsoringUnit', 'SessionTitle', 'SessionIndex']], on = ['Year', 'Name', 'SessionTitle'], how = 'left')
#session indices of the invited talks (contains NaN when some speakers were not matched)
invited_sessionIndex = invitedSpeakers.SessionIndex.unique()

In [45]:
#Checking data
hasNoIndex = invitedSpeakers.SessionIndex.isna()

#number of invited speakers, which can't be found in df_presenter (no session index after the left join)
unmatchedCount = len(invitedSpeakers[hasNoIndex])

#number of distinct session indices among the invited speakers that were found in df_presenter
matchedCount = len(invitedSpeakers[~hasNoIndex].SessionIndex.unique())

print("Number of invited sessions scraped:", len(df_speaker))
print("Number of invited sessions with no SessionIndex:", unmatchedCount)
print("Number of invited sessions found in df_presenter:", matchedCount)
print("Sum:", unmatchedCount + matchedCount)

Number of invited sessions scraped: 17768
Number of invited sessions with no SessionIndex: 846
Number of invited sessions found in df_presenter: 16922
Sum: 17768


In [46]:
#Attendees(authors) is all from df_presenters, where session index is not in invited_sessionindex
#.copy() makes the filtered frame independent of df_presenter, so adding a column below
#is a plain assignment and no longer a chained assignment on a slice
attendees = df_presenter[~df_presenter.SessionIndex.isin(invitedSpeakers.SessionIndex)].copy()

#set session type as attendee for everyone
attendees['SessionType'] = 'attendees'

In [47]:
#set session type as invited for every row of invitedSpeakers
invitedSpeakers['SessionType'] = 'invited'

In [48]:
#Stack attendees and invitedSpeakers row-wise (columns missing in one frame are filled with NaN)
allParticipants = pd.concat([attendees, invitedSpeakers])

In [49]:
#chair index for all years, with the 'University' column renamed to 'Institution'
df_chair = (
    pd.DataFrame.from_dict(chairDict, orient = 'index')
    .rename(columns={'University': "Institution"})
)

In [50]:
#overview of sessions for file type 2: Year, SessionTitle and SponsoringUnit per record
df_sessionOverview = pd.DataFrame.from_dict(sessionNames, orient = 'index') #overview of sessions for file type 2

In [51]:
#all presenters(sessions) including invited speakers for file type 2
df_session = pd.DataFrame.from_dict(sessionDict, orient = 'index')

#names were stored without spaces: put a space back in before capital letters
df_session['Name'] = df_session['Name'].apply(makeSpace)

In [52]:
#author index is set to 0 as default for sessions before 2005
#df_session['AuthorIndex'] = 0 

In [53]:
#drop rows that are identical in every column (the first occurrence is kept)
df_session = df_session.drop_duplicates()

In [54]:
#merge overview and sessions on SessionTitle and Year; pandas' default inner join keeps only rows present in both frames
df_sessions = pd.merge(df_sessionOverview, df_session, on = ['SessionTitle', 'Year']) #merge overview and sessions

In [55]:
#df with all presenters(authors) including invited speakers: file type 1 rows (allParticipants) stacked on file type 2 rows (df_sessions)
all_presenters = pd.concat([allParticipants, df_sessions])

In [56]:
#set session type to 'attendee' for every row that is not 'invited'
#(this also replaces 'attendees' and the other session types parsed for file type 2)
all_presenters.loc[all_presenters['SessionType'] != 'invited', 'SessionType'] = 'attendee'

In [57]:
#new session index for rows that have none
#The earlier expression added the offset exactly where SessionIndex was NaN, and NaN + offset
#is still NaN, so no row ever received an index. Rows without a SessionIndex now get
#consecutive unused values above the current maximum.
#NOTE(review): SessionIndex values of file type 1 and file type 2 come from two independent
#counters that both start at 0 - confirm whether they also need an offset to stay distinct.
missingIndex = all_presenters['SessionIndex'].isnull().to_numpy()
firstFreeIndex = all_presenters['SessionIndex'].max() + 1
all_presenters['SessionIndex'] = np.where(missingIndex, firstFreeIndex + np.cumsum(missingIndex) - 1, all_presenters['SessionIndex'])

In [58]:
#preview of the combined table (pandas shortens the display of long frames)
all_presenters

Unnamed: 0,Year,Name,University,AuthorIndex,Title,SponsoringUnit,SessionTitle,SessionIndex,SessionType
0,17,Prakash Giri,University of Nebraska - Lincoln,1.0,Voltage-induced entropy change in complex oxid...,GMAG DMP DCOMP,S43.00007,0.0,attendee
1,17,Dhananjay Kumar,North Carolina Agricultural and Technical Stat...,2.0,Voltage-induced entropy change in complex oxid...,GMAG DMP DCOMP,S43.00007,0.0,attendee
2,17,Christian Binek,University of Nebraska - Lincoln,3.0,Voltage-induced entropy change in complex oxid...,GMAG DMP DCOMP,S43.00007,0.0,attendee
3,12,Chandler Benjamin,University of Wisconsin-Madison,1.0,Characterization of the Interfacial Adhesion f...,DPOLY,P47.00013,1.0,attendee
4,12,John Springmann,University of Wisconsin-Madison,2.0,Characterization of the Interfacial Adhesion f...,DPOLY,P47.00013,1.0,attendee
...,...,...,...,...,...,...,...,...,...
1765007,4,S.N.Nakamura,"Tohoku University, Sendai, Japan",,Calculations of Single Particle time correlati...,DCP,Z35,630163.0,attendee
1765008,4,K.Nagamine,"KEK-MSL, Tsukuba, Japan",,Calculations of Single Particle time correlati...,DCP,Z35,630163.0,attendee
1765009,4,N. Kawamura,"State University of New York at Albany, Albany NY",,Calculations of Single Particle time correlati...,DCP,Z35,630163.0,attendee
1765010,4,Paul Moffatt,"Department of Physics, University of Alberta, ...",,Calculations of Single Particle time correlati...,DCP,Z35,630164.0,attendee


In [59]:
#keep the final columns (SessionTitle is dropped) and rename University -> Institution, SponsoringUnit -> Division
all_presenters = (
    all_presenters[['Year', 'Name', 'University', 'AuthorIndex', 'Title', 'SponsoringUnit', 'SessionIndex', 'SessionType']]
    .rename(columns={'University': 'Institution', 'SponsoringUnit': 'Division'})
)

Replace the year digits taken from the file names (the last two digits of the year) with the full four-digit year.

In [60]:
#Map the year digits parsed from the file names onto four-digit years:
#94-99 -> 1994-1999 and 0-23 -> 2000-2023.
#Building the mapping from ranges removes the stray duplicate entry '4 : 20004'
#that the hand-written dict contained (it was silently overridden by '4 : 2004').
yearDict = {shortYear: (1900 if shortYear >= 94 else 2000) + shortYear for shortYear in [*range(94, 100), *range(0, 24)]}
all_presenters = all_presenters.replace({'Year': yearDict}) #replace year

In [61]:
#total number of rows, and number of distinct values in the Name column
print("Number of authors:", all_presenters.shape[0])
print("Number of unique authors:", all_presenters['Name'].unique().size)

Number of authors: 2418837
Number of unique authors: 229992


In [62]:
#non-null entries per column for every year (AuthorIndex is 0 for the years shown before 2005)
all_presenters.groupby(['Year']).count()

Unnamed: 0_level_0,Name,Institution,AuthorIndex,Title,Division,SessionIndex,SessionType
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1996,849,849,0,849,849,849,849
1997,160881,160881,0,160881,160881,160881,160881
1998,155331,155331,0,155331,155331,155331,155331
2000,169100,169100,0,169100,169100,169100,169100
2001,187386,187386,0,187386,187386,187386,187386
2002,231063,231063,0,231063,231063,231063,231063
2003,706649,706649,0,706649,706649,706649,706649
2004,153753,153753,0,153753,153753,153753,153753
2005,12677,12677,12677,12677,12677,12677,12677
2006,14176,14176,14176,14176,14176,14176,14176


Now all presenters are saved:

In [63]:
#write the final table to disk as a pickle file
all_presenters.to_pickle("Files/Dataframes/researchers_all.pkl")