# Screening

In [None]:
import os
import openai
import sys
import time
import pandas as pd
import json
import numpy as np

# Azure-hosted OpenAI: the models are reached through an Azure subscription
openai.api_type = "azure"
openai.api_version = "ADD YOUR API VERSION HERE"

# Tab-separated search results; the screening loop writes its output back to this file
fileName = './OriginalSearchWithAbstracts.tsv'

# Connection details for the first deployment
api_base1 = "ADD YOUR API ENDPOINT HERE"
api_key1 = "ADD YOUR KEY HERE"
engine1 = "ADD YOUR ENGINE NAME HERE"

# Connection details for the second deployment
api_base2 = "ADD YOUR API ENDPOING HERE"
api_key2 = "ADD YOUR KEY HERE"
engine2 = "ADD YOUR ENGINE NAME HERE"

dataFile = pd.read_csv(fileName, sep='\t', encoding='utf-8')

# First run only: add the three screening flag columns and the raw-response column.
# A file that already has a 'Response' column is left as it is so a run can be resumed.
if 'Response' not in dataFile.columns:
    for flagColumn in ('NOT_about_artificial_intelligence',
                       'no_focus_on_equity',
                       'is_about_equity_and_ai'):
        dataFile[flagColumn] = ''
    dataFile['Response'] = np.nan

# System prompt for the screening step. It asks the model for a JSON object with three
# boolean fields; the screening loop parses the reply with json.loads and copies the
# values of these three keys into the dataFile columns of the same name.
# The text is sent to the model verbatim, so any edit here changes the screening results.
systemPrompt = '''You extract data from the research information provided below into a JSON object of the shape provided. 
If the data is not in the text return "false" for that field. \nShape: 
{NOT_about_artificial_intelligence: boolean // This study does NOT look at AI issues including any system or tool or algorithm that can be considered to 
display human-like intelligence and is capable of operating autonomously across diverse tasks i.e.AI: Automated decision-making tools (ADMs); 
face or image recognition tools; large language models (LLMs) such as ChatGPT or Google Bard or Claude or Midjourney; machine learning (ML); 
artificial narrow intelligence (ANI) that operates near human levels of intelligence; artificial general intelligence (AGI) that operates at the same level as a human; 
and artificial super intelligence (ASI) that operates at levels that exceed human capability., \n
no_focus_on_equity: boolean // this study does not focus on equity within AI i.e. does not discuss topics such bias or algorithmic risk or inequality 
or sex and gender bias or disparity or diversity or ethnicity or accessibility issues etc. \n
is_about_equity_and_ai: boolean // this paper may report work about the impact of artificial intelligence or AI algorithms on equity or discrimination 
or sex and gender bias or inequalities or bias among humans}'''

def getResponse(which_one, systemPrompt, userPrompt, retries):
    """Request a chat completion, retrying after invalid-request errors.

    Args:
        which_one: integer used to pick the endpoint; even values use
            endpoint 1, odd values use endpoint 2.
        systemPrompt: text sent as the system message.
        userPrompt: text sent as the user message.
        retries: number of attempts already made; callers pass 0.

    Returns:
        The object returned by ``openai.ChatCompletion.create``, or ``''``
        once ``retries`` has gone above 10.

    Only ``openai.error.InvalidRequestError`` is handled here; any other
    exception propagates to the caller.
    """
    attempt = retries
    while attempt <= 10:
        # this is just a very simple load balancer - if you have two endpoints to play with
        if which_one % 2 == 0:
            endpoint, key, engine = api_base1, api_key1, engine1
        else:
            endpoint, key, engine = api_base2, api_key2, engine2
        openai.api_base = endpoint
        openai.api_key = key

        try:
            return openai.ChatCompletion.create(
                engine=engine,
                messages=[
                    {"role": "system", "content": systemPrompt},
                    {"role": "user", "content": userPrompt},
                ],
                temperature=0,
                # max_tokens=3000,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
            )
        except openai.error.InvalidRequestError as err:
            # an over-long prompt is shortened by 10% before the next attempt
            if 'maximum context length' in str(err):
                userPrompt = userPrompt[:int(len(userPrompt) * 0.9)]
            time.sleep(5)
            attempt += 1

    return ''

# Screen every record that does not yet have a response, saving progress as we go.

# 'Response' starts out as an all-NaN (float) column on a first run; make it an object
# column so that the reply text can be stored in it without a dtype conflict
dataFile['Response'] = dataFile['Response'].astype(object)
responseCol = dataFile.columns.get_loc('Response')

for n in range(len(dataFile)):
    print(n)

    # resume support: rows that already hold a reply (or an error marker) are skipped
    if not pd.isna(dataFile.Response[n]):
        continue

    # pd.read_csv loads empty cells as NaN, so a missing abstract has to be detected with
    # pd.isna(); comparing with '' alone never matches and the concatenation below would
    # raise a TypeError on the NaN value
    abstract = dataFile.ABSTRACT[n]
    if pd.isna(abstract) or abstract == '':
        continue
    title = dataFile.TITLE[n]
    if pd.isna(title):
        title = ''
    promptContent = 'Title: ' + str(title) + '\nAbstract: ' + str(abstract)

    response = getResponse(n, systemPrompt, promptContent, 0)
    if response == '':
        # getResponse gave up after repeated invalid-request errors
        dataFile.iloc[n, responseCol] = 'TOO MANY ERRORS'
        continue

    # keep the raw reply so that rows whose JSON cannot be parsed can be inspected later
    resp = response['choices'][0]['message']['content']
    dataFile.iloc[n, responseCol] = resp

    try:
        j = json.loads(resp)
        for column in ('NOT_about_artificial_intelligence',
                       'no_focus_on_equity',
                       'is_about_equity_and_ai'):
            dataFile.iloc[n, dataFile.columns.get_loc(column)] = j[column]
    except (ValueError, KeyError, TypeError):
        # the reply was not the expected JSON object; the flag columns stay as they
        # are and the raw text remains available in 'Response'
        pass

    # checkpoint so that an interrupted run can be resumed
    if n % 20 == 0:
        dataFile.to_csv(fileName, sep='\t', index=False)

dataFile.to_csv(fileName, sep='\t', index=False)
print('done')


# Automated mapping

In [None]:
# First, filter the original search results according to whether they are included or not and save the mapping file


# ONLY RUN THIS CELL ONCE: re-running it overwrites RecordsForMapping.tsv, including any mapping results already saved in it


import pandas as pd

# Input: the screened search results. Output: the records that go forward to mapping.
fileName = './OriginalSearchWithAbstracts.tsv'
mapFileName = './RecordsForMapping.tsv'

dataFile = pd.read_csv(fileName, sep='\t', encoding='utf-8')
print(len(dataFile))

# When the screening columns are present, keep only the records flagged as being about
# equity and AI and remove the screening columns; otherwise the file is copied unchanged
screeningColumns = [
    'NOT_about_artificial_intelligence',
    'no_focus_on_equity',
    'is_about_equity_and_ai',
    'Response',
]
if screeningColumns[0] in dataFile.columns:
    included = dataFile['is_about_equity_and_ai'].eq(True)
    dataFile = dataFile.loc[included].reset_index(drop=True)
    dataFile = dataFile.drop(columns=screeningColumns)

dataFile.to_csv(mapFileName, sep='\t', index=False)
print(f'map length: {len(dataFile)}')




In [14]:
# second map records. Experiment with max mapping tool

# imports and functions in this cell

import os
import openai
import sys
import time
import pandas as pd
import json
import numpy as np

# Azure-hosted OpenAI: the models are reached through an Azure subscription
openai.api_type = "azure"
openai.api_version = "ADD YOUR API VERSION HERE"

# Records selected for mapping; the mapping results are written back to this file
fileName = './RecordsForMapping.tsv'

# Connection details for the first deployment
api_base1 = "ADD YOUR API ENDPOINT HERE"
api_key1 = "ADD YOUR KEY HERE"
engine1 = "ADD YOUR ENGINE NAME HERE"

# Connection details for the second deployment
api_base2 = "ADD YOUR API ENDPOING HERE"
api_key2 = "ADD YOUR KEY HERE"
engine2 = "ADD YOUR ENGINE NAME HERE"

def getResponse(which_one, systemPrompt, promptContent, retries):
    """Request a chat completion for the mapping step, retrying on errors.

    Args:
        which_one: integer used to pick the endpoint; even values use
            endpoint 1, odd values use endpoint 2.
        systemPrompt: text sent as the system message.
        promptContent: text sent as the user message.
        retries: number of attempts already made; callers pass 0.

    Returns:
        The object returned by ``openai.ChatCompletion.create`` on success;
        the caught exception itself when its message contains
        'The response was filtered'; or the string 'TOO MANY ERRORS!' once
        ``retries`` has gone above 10.
    """
    attempt = retries
    while attempt <= 10:
        # very simple load balancer: alternate between the two configured endpoints
        # (previously both branches selected endpoint 2, so endpoint 1 was never used)
        if which_one % 2 == 0:
            openai.api_base = api_base1
            openai.api_key = api_key1
            engine = engine1
        else:
            openai.api_base = api_base2
            openai.api_key = api_key2
            engine = engine2

        try:
            return openai.ChatCompletion.create(
                engine=engine,
                messages=[
                    {"role": "system", "content": systemPrompt},
                    {"role": "user", "content": promptContent},
                ],
                temperature=0,
                # max_tokens=3000,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
            )
        except Exception as e:
            # a filtered response will be filtered again, so hand the error back
            # to the caller instead of retrying
            if 'The response was filtered' in str(e):
                return e
            # an over-long prompt is shortened by 10% before the next attempt
            if 'maximum context length' in str(e):
                promptContent = promptContent[:int(len(promptContent) * 0.9)]
            time.sleep(5)
            attempt += 1

    return 'TOO MANY ERRORS!'

# the prompts used in the original map (December 2023)
# NOTE: kept as a record of the original map only. `fields` is assigned again by the
# v2 (April 2024) definition that follows, so this value is overwritten before it is used.
# Layout: one field per line written as `name: type // instruction`, each line ending
# with an escaped \n followed by the real line break. The cell that adds the columns to
# dataFile splits the string on that double newline and uses the text before the first
# ':' as the column name, so the layout has to be kept when fields are edited.
fields = '''
{ \n
ageing: boolean // Is the area where the AI is being applied concerned with ageing? \n
astronomy_and_related: boolean // Is the area where the AI is being applied concerned with astronomy, space activities or ufology? \n
business_and_management: boolean // Is the area where the AI is being applied concerned with business and management including hiring staff or screening resumes and marketing? \n
children_young_people_wellbeing: boolean // Is the area where the AI is being applied concerned with the wellbeing of children and young people? \n
climate_solutions: boolean // Is the area where the AI is being applied concerned with solutions to the climate crisis? \n
criminal_justice: boolean // Is the area where the AI is being applied concerned with criminal justice including law enforcement and policing? \n
computer_science: boolean // Is the area where the AI is being applied concerned with AI to improve computers and technology including code writing and identifying bias? \n
cyber_security: boolean // Is the area where the AI is being applied concerned with cyber_security? \n
disability: boolean // Is the area where the AI is being applied concerned with disability? \n
environment: boolean // Is the area where the AI is being applied concerned with the environment including weather patterns and sustainability and land use? \n
education: boolean // Is the area where the AI is being applied concerned with education? \n
finance: boolean // Is the area where the AI is being applied concerned with finance including money laundering and trading and mortgage applications? \n
government: boolean // Is the area where the AI is being applied concerned with government or the supply of services or the military? \n
immigration: boolean // Is the area where the AI is being applied concerned with immigration? \n
health: boolean // Is the area where the AI is being applied concerned with health including medical imaging and mental health and workplace health and safety and public health and diagnostics? \n
information_knowledge_management: boolean // Is the area where the AI is being applied concerned with information and knowledge management such as libraries, or documenting and collating information? \n
international_development: boolean // Is the area where the AI is being applied concerned with international_development? \n
knowledge_translation_implementation: boolean // Is the area where the AI is being applied concerned with knowledge translation and implementation? \n
law: boolean // Is the area where the AI is being applied concerned with the law? \n
manufacturing: boolean // Is the area where the AI is being applied concerned with manufacturing? \n
media: boolean // Is the area where the AI is being applied concerned with the media including manipulated media and deep fakes and social media and video surveillance and writing and reporting? \n
professional_services: boolean // Is the area where the AI is being applied concerned with professional services including human resources and job seeking and hiring staff and resume screening and hospitality? \n
social_welfare: boolean // Is the area where the AI is being applied concerned with social welfare? \n
transport: boolean // Is the area where the AI is being applied concerned with transport including automotive and traffic and maritime? \n
utilities: boolean // Is the area where the AI is being applied concerned with utilities such as electricity and gas and water and telecommunications? \n
unclear_not_stated: boolean // Is the area where the AI is being applied unclear or not stated? \n
area_of_application: string // Describe the area that the AI is being employed in \n
ai_type_generative: boolean // does the abstract describe the use of generative AI where generative models are used to create new data? \n
ai_type_discriminative: boolean // does the text describe the discriminative use of AI where models are used to classify or predict data? \n
type_of_ai_description: string // state the type of AI that is described in the text \n
type_of_ai_ml: boolean // does the abstract describe the use of machine learning? \n
type_of_ai_llm: boolean // is the type of AI being used a large language model, such as as ChatGPT, Google Bard, Claude, Midjourney? \n
type_of_ai_chatbot: boolean // is the AI evaluated an LLM-based chatbot? \n
type_of_ai_transcriber: boolean // is the AI evaluated an LLM-based transcriber? \n
type_of_ai_database_search_engine: boolean // is the AI evaluated a LLM-based database search engine? \n
type_of_ai_translator: boolean // is the AI evaluated a LLM-based translator? \n
type_of_ai_summarizer: boolean // is the AI evaluated a LLM-based summarizer? \n
type_of_ai_content_generator: boolean // is the AI evaluated a LLM-based content generator? \n
type_of_ai_tutor: boolean // is the AI evaluated a LLM-based tutor? \n
type_of_ai_recommender_system: boolean // is the AI evaluated a LLM-based recommender system? \n
type_of_ai_llm_sentiment_analysis: boolean // does the AI evaluated conduct a LLM-based sentiment analysis? \n
type_of_ai_llm_qa: boolean // is the AI evaluated a LLM-based question-answering system:? \n
type_of_ai_llm_image_captioning: boolean // is the AI evaluated a LLM-based image captioning system? \n
type_of_ai_: string // state the type of AI system described \n
type_of_llm_ai: string // state the type of LLM-based AI but return false if no LLM-based AI system is described \n
type_of_ml_classifier: boolean // does the machine learning task described involve classification? \n
type_of_ml_clustering: boolean // does the machine learning task described involve clustering data? \n
type_of_ml_creative: boolean // is the text about the creation of creative content using AI including fakes? \n
type_of_ml_extraction: boolean // does the machine learning task described involve information extraction including relationship extraction? \n
type_of_ml_coding: boolean // does the machine learning task described involve writing computer code? \n
type_of_ml_question_answer: boolean // does the machine learning task described involve a question-answering system? \n
type_of_ml_recognition: boolean // does the machine learning task described involve recognition including face recognition? \n
type_of_ml_rewriting: boolean // does the machine learning task described involve rewriting existing text? \n
type_of_ml_searching: boolean // does the machine learning task described involve searching? \n
type_of_ml_summarising: boolean // does the machine learning task described involve summarising text? \n
type_of_ml_translating: boolean // does the machine learning task described involve translating text? \n
type_of_ml_anomaly: boolean // does the machine learning task described involve anomaly detection? \n
type_of_ml_optimizing: boolean // does the machine learning task described involve optimising? \n
type_of_ml_automating: boolean // does the machine learning task described involve automating work currently done by humans? \n
type_of_ml_recommending: boolean // does the machine learning task described involve making recommendations or personalisation? \n
equity_bias: boolean // is there is a discussion of bias but no indication about which area of equity is affected? \n
equity_inequities: boolean // does the article just discusses inequities in AI without any specificity about which ones? \n
equity_human_rights: boolean // does the article discuss issues of human rights in the context of AI? \n
equity_age: boolean // does the article discuss age in relation to equity and AI? \n
equity_sex: boolean // does the article discuss sex in relation to equity and AI? \n
equity_gender: boolean // does the article discuss gender in relation to equity and AI? \n
equity_race: boolean // does the article discuss race or ethnicity or ancestry in relation to equity and AI? \n
equity_religion: boolean // does the article discuss religion in relation to equity and AI? \n
equity_ses: boolean // does the article discuss socioeconomic status in relation to equity and AI? \n
equity_education: boolean // does the article discuss educational level in relation to equity and AI? \n
equity_sexual_orientation: boolean // does the article discuss sexual orientation in relation to equity and AI? \n
equity_location: boolean // does the article discuss the location where people live in relation to equity and AI? \n
equity_disability: boolean // does the article discuss disability in relation to equity and AI? \n
equity_homelessness: boolean // does the article discuss homelessness in relation to equity and AI? \n
equity_drug_alcohol: boolean // does the article discuss drug or alcohol dependence in relation to equity and AI? \n
equity_migrants: boolean // does the article discuss vulnerable migrants in relation to equity and AI? \n
equity_traveller: boolean // does the article discuss gypsy or Roma or traveller communities in relation to equity and AI? \n
equity_sex_workers: boolean // does the article discuss sex workers in relation to equity and AI? \n
equity_justice: boolean // does the article discuss people in contact with the justice system in relation to equity and AI? \n
equity_slavery: boolean // does the article discuss victims of modern slavery in relation to equity and AI? \n
equity_group: string // describe the marginalised group (if any) that is discussed in relation to equity \n
}'''

# the prompts used in the v2 map (April 2024)
# This assignment replaces any earlier value of `fields`; it is the definition that is
# appended to systemPrompt below and from which the dataFile columns are created.
# Layout: one field per line written as `name: type // instruction`, each line ending
# with an escaped \n followed by the real line break. The cell that adds the columns to
# dataFile splits the string on that double newline and uses the text before the first
# ':' as the column name, so the layout has to be kept when fields are edited.
fields = '''
{\n
coding_assistant: boolean // Is the paper focused on the use of AI for assisting with the writing of software code?\n
ageing: boolean // Is the area where the AI is being applied concerned with ageing?\n
astronomy_and_related: boolean // Is the area where the AI is being applied concerned with astronomy, space activities or ufology?\n
business_and_management: boolean // Is the area where the AI is being applied concerned with business and management including hiring staff or screening resumes and marketing?\n
children_young_people_wellbeing: boolean // Is the area where the AI is being applied concerned with the wellbeing of children and young people?\n
climate_solutions: boolean // Is the area where the AI is being applied concerned with solutions to the climate crisis?\n
criminal_justice: boolean // Is the area where the AI is being applied concerned with criminal justice including law enforcement and policing?\n
computer_science: boolean // Is the area where the AI is being applied concerned with AI to improve computers and technology including code writing and identifying bias?\n
cyber_security: boolean // Is the area where the AI is being applied concerned with cyber_security?\n
standards_development: boolean // Is the paper focused on the development of standards for the responsible use of AI?\n
disability: boolean // Is the area where the AI is being applied concerned with disability?\n
environment: boolean // Is the area where the AI is being applied concerned with the environment including weather patterns and sustainability and land use?\n
education: boolean // Is the area where the AI is being applied concerned with education?\n
finance: boolean // Is the area where the AI is being applied concerned with finance including money laundering and trading and mortgage applications?\n
government: boolean // Is the area where the AI is being applied concerned with government or the supply of services or the military?\n
immigration: boolean // Is the area where the AI is being applied concerned with immigration?\n
health: boolean // Is the area where the AI is being applied concerned with health including medical imaging and mental health and workplace health and safety and public health and diagnostics?\n
mitigating_bias: boolean // Is the study about the identification or the mitigation of bias in relation to AI and equity?\n
information_knowledge_management: boolean // Is the area where the AI is being applied concerned with information and knowledge management such as libraries, or documenting and collating information?\n
international_development: boolean // Is the area where the AI is being applied concerned with international_development?\n
knowledge_translation_implementation: boolean // Is the area where the AI is being applied concerned with knowledge translation and implementation?\n
law: boolean // Is the area where the AI is being applied concerned with the law?\n
manufacturing: boolean // Is the area where the AI is being applied concerned with manufacturing?\n
media: boolean // Is the area where the AI is being applied concerned with the media including manipulated media and deep fakes and social media and video surveillance and writing and reporting and recommender systems?\n
professional_services: boolean // Is the area where the AI is being applied concerned with professional services including human resources and job seeking and hiring staff and resume screening and hospitality?\n
social_welfare: boolean // Is the area where the AI is being applied concerned with social welfare?\n
utilities: boolean // Is the area where the AI is being applied concerned with utilities such as electricity and gas and water and telecommunications?\n
unclear_not_stated: boolean // Is the area where the AI is being applied unclear or not stated?\n
other_area_of_implementation: boolean // does the area of implementation fit outside the above categories?\n
area_of_application: string // Describe the area that the AI is being employed in\n
transport: boolean // Is the area where the AI is being applied concerned with transport including automotive and traffic and maritime?\n
ai_type_discriminative: boolean // does the text describe the discriminative use of AI where models are used to classify or predict data?\n
ai_type_generative: boolean // does the abstract describe the use of generative AI where generative models are used to create new data?\n
type_of_ai_unclear: boolean // It is unclear whether the type of AI is discriminative or generative\n
equity_bias: boolean // is there is a discussion of bias but no indication about which area of equity is affected?\n
equity_inequities: boolean // does the article just discusses inequities in AI without any specificity about which ones?\n
equity_human_rights: boolean // does the article discuss issues of human rights in the context of AI?\n
equity_age: boolean // does the article discuss age in relation to equity and AI?\n
equity_disability: boolean // does the article discuss disability in relation to equity and AI?\n
equity_drug_alcohol: boolean // does the article discuss drug or alcohol dependence in relation to equity and AI?\n
equity_education: boolean // does the article discuss educational level in relation to equity and AI?\n
equity_gender: boolean // does the article discuss gender in relation to equity and AI?\n
equity_traveller: boolean // does the article discuss gypsy or Roma or traveller communities in relation to equity and AI?\n
equity_location: boolean // does the article discuss the location where people live in relation to equity and AI?\n
equity_occupation: boolean // does the article discuss occupation or professions in relation to equity and AI?\n
equity_homelessness: boolean // does the article discuss homelessness in relation to equity and AI?\n
equity_justice: boolean // does the article discuss people in contact with the justice system in relation to equity and AI?\n
equity_place_of_residence: boolean // does the article discuss people's place of residence in relation to equity and AI?\n
equity_race: boolean // does the article discuss race or ethnicity or ancestry in relation to equity and AI?\n
equity_religion: boolean // does the article discuss religion in relation to equity and AI?\n
equity_sex: boolean // does the article discuss sex in relation to equity and AI?\n
equity_sex_workers: boolean // does the article discuss sex workers in relation to equity and AI?\n
equity_sexual_orientation: boolean // does the article discuss sexual orientation in relation to equity and AI?\n
equity_social_capital: boolean // does the article discuss social capital in relation to equity and AI?\n
equity_ses: boolean // does the article discuss socioeconomic status in relation to equity and AI?\n
equity_slavery: boolean // does the article discuss victims of modern slavery in relation to equity and AI?\n
equity_migrants: boolean // does the article discuss vulnerable migrants in relation to equity and AI?\n
equity_group: string // describe the marginalised group (if any) that is discussed in relation to equity\n
equity_other: boolean // an other marginalised, at-risk, socially excluded and/or inclusion health groups(s)\n
}'''

# System prompt for the mapping step: the general extraction instruction followed by the
# field definitions above. It is sent to the model verbatim with every record.
systemPrompt = '''You extract data from the research information provided below into a JSON object of the shape provided. 
If the data is not in the text return "false" for that field. \nShape: \n''' + fields



In [2]:
# Load the records for mapping and, on a first run, create one empty column per field
# named in the `fields` prompt (a file that already has 'Response' is left unchanged)

dataFile = pd.read_csv(fileName, sep='\t', encoding='utf-8')

if 'Response' not in dataFile.columns:
    # every field definition ends with an escaped newline plus a real line break,
    # so splitting on the double newline yields one chunk per definition
    chunks = fields.split('\n\n')

    dataFile['Response'] = np.nan

    # the first and last chunks hold the opening and closing braces, not fields
    for chunk in chunks[1:-1]:
        # the column name is the text in front of the first colon
        columnName = chunk[:chunk.index(":")]
        dataFile[columnName] = ''

In [None]:
# run through the file and get the mapping data

no_abstract = 0  # number of unprocessed records skipped because they have no abstract

# 'Response' starts out as an all-NaN (float) column on a first run; make it an object
# column so that the reply text can be stored in it without a dtype conflict
dataFile['Response'] = dataFile['Response'].astype(object)
responseCol = dataFile.columns.get_loc('Response')

for n in range(len(dataFile)):
    print(n)

    # resume support: rows that already hold a reply (or an error marker) are skipped
    if not pd.isna(dataFile.Response[n]):
        continue

    # pd.read_csv loads empty cells as NaN, so a missing abstract has to be detected with
    # pd.isna(); comparing with '' alone never matches and the concatenation below would
    # raise a TypeError on the NaN value
    abstract = dataFile.ABSTRACT[n]
    if pd.isna(abstract) or abstract == '':
        no_abstract = no_abstract + 1
        continue
    title = dataFile.TITLE[n]
    if pd.isna(title):
        title = ''
    promptContent = 'Title: ' + str(title) + '\nAbstract: ' + str(abstract)

    response = getResponse(n, systemPrompt, promptContent, 0)

    # getResponse signals failure by returning a string (the retry limit was reached)
    # or the exception it caught (the response was filtered); record it as text
    if isinstance(response, (str, Exception)):
        dataFile.iloc[n, responseCol] = 'TOO MANY ERRORS' if response == '' else str(response)
        continue

    try:
        resp = response['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # unexpected reply structure: keep a text copy of whatever came back
        dataFile.iloc[n, responseCol] = str(response)
        continue

    # keep the raw reply so that rows whose JSON cannot be parsed can be inspected later
    dataFile.iloc[n, responseCol] = resp

    try:
        j = json.loads(resp)
    except (ValueError, TypeError):
        j = None  # not valid JSON; only the raw text is kept

    if isinstance(j, dict):
        for key, value in j.items():
            # a key the model invented must not stop the remaining fields from
            # being stored, so unknown keys are skipped one by one
            if key not in dataFile.columns:
                continue
            try:
                dataFile.iloc[n, dataFile.columns.get_loc(key)] = value
            except (ValueError, TypeError):
                # value that cannot be stored in a single cell (e.g. a nested list)
                continue

    # checkpoint so that an interrupted run can be resumed
    if n % 10 == 0:
        dataFile.to_csv(fileName, sep='\t', index=False)

dataFile.to_csv(fileName, sep='\t', index=False)
print('records skipped (no abstract): ' + str(no_abstract))
