In [2]:
import pandas as pd
import numpy as np


# DATA IMPORT

# your working directory for the code files
import os
cwd = os.getcwd()
# Build every path with os.path.join so the notebook runs on any operating
# system; hard-coded '\\' separators only resolve correctly on Windows.
path = os.path.join(cwd, 'Data')


#Set file paths
links_path = os.path.join(path, 'projectPeopleLinks.csv')
people_path = os.path.join(path, 'people.csv')
proj_path = os.path.join(path, 'projects.csv')

#Read in the files
dfLinks = pd.read_csv(links_path)
dfPeople = pd.read_csv(people_path)
dfProj = pd.read_csv(proj_path)


# We also need orgProjectTexts:
path_int = os.path.join(cwd, 'Intermediate_Files')
orgProjectTexts_path = os.path.join(path_int, 'orgProjectTexts.csv')
orgProjectTexts = pd.read_csv(orgProjectTexts_path)

In [3]:
def _join_name_parts(parts):
    """Join the non-missing name parts with single spaces; NaN if all are missing."""
    present = [str(part) for part in parts if pd.notna(part)]
    return ' '.join(present) if present else np.nan


def proj_people_links(df_links, df_people, df_proj):
    """Attach person names and project details to the project-person links.

    Parameters
    ----------
    df_links : pandas.DataFrame
        Link table; must contain 'personuuid' and 'projectuuid'.
    df_people : pandas.DataFrame
        People table keyed by 'personuuid'; the 'firstname', 'othernames' and
        'surname' columns are combined into 'person_name'.
    df_proj : pandas.DataFrame
        Project table keyed by 'projectuuid'.

    Returns
    -------
    pandas.DataFrame
        Columns 'projectuuid', 'personuuid', 'title', 'role' and
        'person_name', sorted by 'title'. 'person_name' is NaN for rows where
        all three name parts are missing.
    """
    # left joins keep every link row even when the person or project is unknown
    linked = pd.merge(df_links, df_people, on='personuuid', how='left')
    linked = pd.merge(linked, df_proj, on='projectuuid', how='left')

    # Combine the name columns, skipping missing parts explicitly. The previous
    # stack()/groupby() approach depended on stack() silently dropping NaN,
    # which the newer pandas stack implementation no longer does.
    name_columns = ['firstname', 'othernames', 'surname']
    linked['person_name'] = [
        _join_name_parts(parts)
        for parts in linked[name_columns].itertuples(index=False, name=None)
    ]

    # select columns to keep
    kept_columns = ['projectuuid', 'personuuid', 'title', 'role', 'person_name']
    return linked[kept_columns].sort_values('title')

In [5]:
def researcher_base(people_links, proj_set=None):
    """Reduce the project-person links to named researchers on projects with 3+ people.

    Parameters
    ----------
    people_links : pandas.DataFrame
        Must contain 'title' and 'person_name' columns. It is not modified.
    proj_set : pandas.DataFrame, optional
        Table whose 'title' column lists the projects to keep. When omitted,
        the module-level ``orgProjectTexts`` is looked up at call time, so a
        reloaded table is picked up without re-defining this function.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with an added 'people_count' column (number of
        distinct researchers per title), sorted by 'people_count' descending
        then 'title' ascending, restricted to titles with at least 3 researchers.
    """
    if proj_set is None:
        proj_set = orgProjectTexts

    keep = (
        # remove rows where the UKRI dataset does not provide the proper name
        (people_links['person_name'] != "Unknown Unknown Unknown")
        # remove rows where the project title has not been announced or recorded
        & (people_links['title'] != "TBC")
        # keep only titles which show up in the project set
        & people_links['title'].isin(proj_set['title'])
    )

    # remove duplicates based on title and person_name
    researchers = people_links[keep].drop_duplicates(subset=["title", "person_name"])

    # the "Grants Team" is not a researcher recognised by Google Scholar;
    # .copy() gives an independent frame so the column assignment below does
    # not write into a slice (avoids SettingWithCopyWarning)
    researchers = researchers[researchers['person_name'] != "Grants Team"].copy()

    # count number of researchers on project
    researchers['people_count'] = researchers.groupby('title')['title'].transform('count')

    # prepare for Google Scholar title/abstract import
    ranked = researchers.sort_values(['people_count', 'title'], ascending=[False, True])

    # restrict to projects with 3+ researchers to match texts for
    return ranked[ranked['people_count'] >= 3]

In [None]:
# Join person names and project details onto the raw link table
projectPeopleLinks = proj_people_links(
    df_links=dfLinks,
    df_people=dfPeople,
    df_proj=dfProj,
)

# Keep only named researchers on known projects with 3+ people
projectPeopleTexts_red = researcher_base(people_links=projectPeopleLinks)

In [None]:
# Google Scholar:
# To replace the project titles with the researchers' published output:
# 1. Take projectPeopleLinks (perhaps use ProjectPeopleLinks-named.csv instead) and keep only the rows whose projectuuid also appears in the projectuuid column of orgProjectTexts.
# 2. Add the Google Scholar output as a separate column of that table (the texts of the publication titles, simply merged together).
# 3. For each projectuuid in orgProjectTexts, look it up in projectPeopleLinks and append the matching row, then de-duplicate with `drop_duplicates(subset=["orguuid", "projectuuid"])`.

# Note: for Google Scholar searches it may be most effective to remove middle names entirely.

In [7]:
# for use if search_after_banned=True:
def new_proxy():
    """Keep drawing random free proxies until scholarly accepts one.

    Loops without an upper bound: each iteration asks FreeProxy for a random
    proxy address and hands it to ``scholarly.use_proxy``; the first address
    for which that call returns a truthy value is printed and returned.

    Returns
    -------
    The proxy value produced by ``FreeProxy(...).get()`` that scholarly accepted.
    """
    # Imported locally, as collect_research does; without this the name
    # `scholarly` is not defined in this function's scope.
    from scholarly import scholarly

    # NOTE(review): FreeProxy is not imported in any visible cell. It presumably
    # comes from the free-proxy package (`from fp.fp import FreeProxy`) --
    # confirm and add that import to the notebook's import cell.
    while True:
        proxy = FreeProxy(rand=True, timeout=1).get()
        if scholarly.use_proxy(http=proxy, https=proxy):
            print(proxy)
            return proxy

def _fetch_author_publications(scholarly_api, author_name):
    """Return the publications of the first Scholar hit for a name, or None.

    None is returned when the search yields no author or the author has no
    publications, so callers can simply skip that name.
    """
    first_hit = next(scholarly_api.search_author(author_name), None)
    if first_hit is None:
        # author is not looked up properly by Scholarly: skip instead of failing
        return None

    profile = first_hit.fill()

    # collect author's publications into Pandas dataframe
    publications = pd.DataFrame([vars(pub) for pub in profile.publications])
    if publications.empty:
        return None

    # append author's name to distinguish their publications
    publications['author'] = author_name
    return publications


# collect titles from GoogleScholar, until you get reCAPTCHA'd
def collect_research(AuthorList, min_authors=2, search_after_banned=False):
    """Find publication titles shared by several of the given authors.

    Parameters
    ----------
    AuthorList : iterable of str
        Author names to search for on Google Scholar (first hit per name is used).
    min_authors : int, default 2
        Minimum number of distinct searched authors that must list a title.
    search_after_banned : bool, default False
        When true, a proxy is set up via ``new_proxy()`` first, and any failed
        lookup switches to a fresh proxy and retries that same author.

    Returns
    -------
    pandas.Index
        Lower-cased titles (index name 'Titlematches') attached to at least
        ``min_authors`` distinct authors; empty when nothing was collected.
    """
    from scholarly import scholarly

    frames = []

    if not search_after_banned:
        for author_name in AuthorList:
            publications = _fetch_author_publications(scholarly, author_name)
            if publications is not None:
                frames.append(publications)
    else:
        new_proxy()

        for author_name in AuthorList:
            # Retry only the author that failed, so earlier authors are neither
            # re-queried nor appended twice. A name with no Scholar hit is
            # skipped rather than retried forever.
            while True:
                try:
                    publications = _fetch_author_publications(scholarly, author_name)
                    break
                except Exception:
                    # broad on purpose: a blocked request may surface as many
                    # different exception types; switch proxy and try again
                    new_proxy()
            if publications is not None:
                frames.append(publications)

        # you will be waiting a long time! Nice to know it's finally worked
        print("Got the results of the query")

    if not frames:
        # nothing collected: pd.concat would raise on an empty list
        return pd.Index([], dtype=object, name='Titlematches')

    # joining all author DataFrames (fresh index so rows are uniquely labelled)
    combined = pd.concat(frames, axis=0, ignore_index=True)

    # unpack bib into columns
    unpacked = pd.concat(
        [combined.drop(['bib'], axis=1), combined['bib'].apply(pd.Series)],
        axis=1,
    )

    # counting unique authors attached to each title
    unpacked['Titlematches'] = unpacked['title'].str.lower()
    authors_per_title = unpacked.groupby('Titlematches').author.nunique()

    return authors_per_title[authors_per_title >= min_authors].index

In [8]:
# Sample set of researcher names used to try out the Google Scholar lookup
AuthorList = [
    'Ross McMaster',
    'Nicholas Alexander Monk',
    'Julia Margaret Rees',
    'William Zimmerman',
    'Robert K Poole',
    'J Green',
]

# Titles listed by at least the default number of these authors
author_output = collect_research(AuthorList)

In [None]:
#WARNING: the Scholarly API's title-based search gets caught by Google *really* quickly (as of 16 August 2020)

def abstract_search(TitleList):
    """Look up each title on Google Scholar and gather the first hit's `bib` data.

    For every entry of ``TitleList`` this calls ``scholarly.search_pubs``,
    takes the first result, builds a DataFrame from it, tags it with the
    searched title and finally concatenates all per-title frames.

    NOTE(review): no ``return`` is visible in this view of the function, so
    the concatenated frame may not reach the caller -- confirm whether the
    function continues beyond this point.
    """
    frames=[]

    for Title in TitleList:
        # NOTE(review): `scholarly` is not imported in this function nor at
        # module level in the visible cells (collect_research imports it only
        # locally) -- confirm it is in scope here.
        search_query = scholarly.search_pubs(Title)
        # first search result only; presumably raises StopIteration when the
        # search yields nothing -- TODO confirm and handle
        pub_info = next(search_query)

        # NOTE(review): this iterates over `pub_info.bib` and reads `__dict__`
        # on each element; if `bib` is a plain dict (as collect_research's
        # unpacking of 'bib' suggests) the elements are its keys -- verify.
        df = pd.DataFrame([x.__dict__ for x in pub_info.bib])
        # record which searched title this result belongs to
        df['title'] = Title
        frames.append(df.copy())

    # joining all title DataFrames
    df = pd.concat(frames, axis=0)