In [1]:
!pip install openai
import openai
from tqdm import tqdm
import json
import pandas as pd
import requests



In [2]:
# Enable tqdm's pandas integration.
# NOTE(review): no visible cell uses it afterwards — confirm it is still needed.
tqdm.pandas()

In [3]:
import os

# Credentials: prefer the OPENAI_API_KEY environment variable so a real key never
# has to be written into (and committed with) the notebook. The original
# placeholder string is kept as the fallback to stay backward-compatible.
openai.api_key = os.environ.get("OPENAI_API_KEY", "api key")

# Human-friendly model label -> OpenAI model identifier.
model_names = {'ChatGPT': 'gpt-3.5-turbo',
               'GPT-4': 'gpt-4',
               'InstructGPT': 'text-davinci-003'}

# Label of the model used by `chatgpt_annotate`, and its resolved identifier.
_MODEL = 'ChatGPT'
_MODEL_NAME = model_names[_MODEL]

# NOTE(review): the three settings below are not referenced by any visible cell
# (the request helper defines its own local `api_url`) — confirm before removing.
_MAX_NUM_TOKENS = 4096
max_input_tokens = 3800
api_url = "https://api.openai.com/v1/chat/completions"

In [4]:
def chatgpt_annotate(prompt, max_attempts=None, retry_delay=5):
    """Send `prompt` as a single user message to the chat model and return the reply text.

    The request is retried after any exception raised by the client call.

    Args:
        prompt: Text sent as the content of the only (user) message.
        max_attempts: Maximum number of failed calls tolerated before the last
            exception is re-raised. ``None`` (default) retries indefinitely,
            which is the original behaviour.
        retry_delay: Seconds to wait between attempts (default 5).

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        Exception: the last client error, only when `max_attempts` is set and reached.
    """
    # Imported locally: the notebook imports `time` only in a later cell, so the
    # retry path would otherwise raise NameError if that cell has not run yet.
    import time

    failed_attempts = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                model=_MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
            )
            break
        except Exception as e:
            # Any client error is treated as transient (typically API overload).
            failed_attempts += 1
            print(e)
            if max_attempts is not None and failed_attempts >= max_attempts:
                raise
            print(f"API overloaded, trying again in {retry_delay} seconds...")
            time.sleep(retry_delay)

    return response["choices"][0]["message"]["content"]

In [5]:
import time
def gpt_pairwise_comparison_with_retry(prompt, api_key='api-key', model="gpt-3.5-turbo",
                                       max_attempts=10, timeout=180):
    """POST `prompt` to the OpenAI chat-completions endpoint, retrying on failure.

    Args:
        prompt: Text sent as the content of the single user message.
        api_key: Bearer token for the request. The default is a placeholder;
            pass a real key.
        model: Model identifier placed in the payload (default "gpt-3.5-turbo").
        max_attempts: Number of failed attempts tolerated before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        The first choice's message content, stripped of leading/trailing
        spaces, dots and commas, and lower-cased.

    Raises:
        RuntimeError: with message 'max_attempts exceeded' once every attempt
            has failed (the prompt is printed first to help debugging).
    """
    api_url = "https://api.openai.com/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": model,
        "messages": [{"role": "user",
                      "content": prompt}],
    }

    attempts = 0
    while attempts < max_attempts:
        try:
            response = requests.post(api_url, headers=headers, json=data, timeout=timeout)
            if response.status_code == 200:
                content = response.json()['choices'][0]['message']['content']
                return content.strip(' .,').lower()
            attempts += 1
            print('Status code {}'.format(response.status_code))
            # Rate-limit responses (429) get a long back-off; everything else a short one.
            time.sleep(60 if response.status_code == 429 else 5)
        except requests.exceptions.RequestException as e:
            # Covers timeouts as well: Timeout is a RequestException subclass.
            attempts += 1
            print(attempts, e)
            time.sleep(5)
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # A 200 response whose body lacks the expected structure is treated
            # as one more failed attempt instead of escaping the retry loop.
            attempts += 1
            print(attempts, 'Malformed API response: {!r}'.format(e))
            time.sleep(5)

    print(prompt)
    raise RuntimeError('max_attempts exceeded')

In [6]:
# Load only the id, abstract and label columns, all as strings.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_data = pd.read_csv(
    'H:/data/WoS_data/data_gpt50.csv',
    usecols=['PUBID', 'ABSTRACT', 'final_disciplines'],
    dtype=str,
)

In [7]:
# Map each publication id to its abstract.
abstracts_by_pubid = test_data.set_index('PUBID')['ABSTRACT']
test_dict = abstracts_by_pubid.to_dict()

In [8]:
# Read the level-0 prompt template. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('prompts/hierarchical/level0pred_description.txt', 'r') as prompt_file:
    initial_prompt = prompt_file.read()

In [9]:
# Run this once the per-discipline prompt files exist; builds {level-1 name: prompt text}.
lvl1 = ['natural sciences', 'engineering and technology', 'agricultural sciences', 'medical and health sciences', 'social sciences', 'humanities']

# One prompt template file per level-1 discipline; each handle is closed right
# after reading rather than being left open by a bare open(...).read().
lvl1_prompts = {}
for key in lvl1:
    with open(f'prompts/hierarchical/description_title/{key}.txt', 'r') as prompt_file:
        lvl1_prompts[key] = prompt_file.read()

In [10]:
# Load the discipline hierarchy; the context manager closes the file handle
# that the bare json.load(open(...)) left open.
with open('discipline_structure.json', 'r') as structure_file:
    lvl12 = json.load(structure_file)

In [11]:
# Keep only the sub-discipline names for each level-1 discipline.
# list(values) yields the keys when `values` is a dict and a plain copy when it
# is already a list, so re-running this cell no longer fails on `.keys()`.
lvl12 = {key: list(values) for key, values in lvl12.items()}

In [12]:
# Display the level-1 -> sub-discipline mapping.
lvl12

{'agricultural sciences': ['agriculture, forestry, and fisheries',
  'animal and dairy science',
  'veterinary science',
  'agricultural biotechnology',
  'other agricultural sciences'],
 'engineering and technology': ['civil engineering',
  'electrical engineering, electronic engineering, information engineering',
  'mechanical engineering',
  'chemical engineering',
  'materials engineering',
  'medical engineering',
  'environmental engineering',
  'environmental biotechnology',
  'industrial biotechnology',
  'nano-technology',
  'other engineering and technologies'],
 'humanities': ['history',
  'archaeology',
  'languages and linguistics',
  'literature',
  'philosophy and ethics',
  'religion',
  'arts (arts, history of arts, performing arts, music)',
  'other humanities'],
 'medical and health sciences': ['basic medicine',
  'clinical medicine',
  'health sciences',
  'health biotechnology',
  'other medical sciences'],
 'natural sciences': ['mathematics',
  'computer and infor

In [13]:
# Display the loaded prompt templates (large output).
lvl1_prompts 

{'natural sciences': 'Given the following 7 classes: mathematics;computer and information sciences;physical sciences;chemical sciences;earth and related environmental sciences;biological sciences;other natural sciences \nThe classes description and example of abstracts: \nmathematics: Pure mathematics, Applied mathematics; Statistics and probability (this excludes applied statistics) \n Example abstract: Mean convergence theorems using hybrid methods to find common fixed points for noncommutative nonlinear mappings in Hilbert spaces;Estimating the scale parameter of an exponential distribution under progressive type II censoring;Estimating the scale parameter of an exponential distribution under progressive type II censoring \ncomputer and information sciences: Computer sciences, information science and bioinformatics (hardware development to be in Electrical Engineering, social aspect to be in Media and Communication) \n Example abstract: Hourly Work of 3D Microstructural Visualizatio

In [14]:
# Placeholder container for failures.
# NOTE(review): no visible cell writes to it — confirm whether it is still needed.
failure = {}

In [15]:
import re
# One "name:score" item: a lower-case name (letters, commas, hyphens, parentheses,
# whitespace), a colon, optional whitespace and at least one digit.
_RESULT_ITEM = r"[a-z,\-()\s]+:\s?[0-9]+\s?"
# A full answer: one or more items separated by '&', with no dangling separator.
_RESULT_PATTERN = re.compile(r"^" + _RESULT_ITEM + r"(?:&" + _RESULT_ITEM + r")*$")


def check_result_structure(result):
    """Return True if `result` looks like 'name:score&name:score&...'.

    Compared with the previous pattern this rejects inputs that the downstream
    parser cannot handle: an item without a score ('a:'), a trailing '&'
    ('a:50&') and items that are not separated by '&' ('a:50b:60').

    Args:
        result: The (already lower-cased) model answer as a string.

    Returns:
        bool: whether the whole string matches the expected structure.
    """
    return bool(_RESULT_PATTERN.match(result))
def get_check_results(level, answer, previous_level=''):
    """Validate a model answer and convert it to the results structure.

    The answer is expected as 'name:score&name:score&...'. Only items whose
    score is strictly greater than 50 are kept.

    Args:
        level: 0 to validate against the level-1 discipline list `lvl1`,
            1 to validate against the sub-disciplines `lvl12[previous_level]`.
        answer: Raw answer string.
        previous_level: Parent discipline name; only used when `level` is 1.

    Returns:
        False if the answer is malformed or names an unknown discipline.
        Level 0: ``{name: {'score': str, 'disciplines': {}}}``.
        Level 1: ``{sub_discipline: score_str}``. A returned name matches every
        sub-discipline that contains it as a substring.
    """
    if not check_result_structure(answer):
        return False

    # Parse every item once; malformed items reject the whole answer instead of
    # raising IndexError/ValueError.
    kept = []
    for item in answer.split('&'):
        name, separator, score = item.partition(':')
        name, score = name.strip(), score.strip()
        if not separator or not name:
            return False
        try:
            numeric_score = int(score)
        except ValueError:
            return False
        if numeric_score > 50:
            kept.append((name, score))

    if level == 0 and all(name in lvl1 for name, _ in kept):
        return {name: {'score': score, 'disciplines': {}} for name, score in kept}

    if level == 1:
        if not kept:
            return {}
        candidates = lvl12[previous_level]
        if all(any(name in candidate for candidate in candidates) for name, _ in kept):
            # The stripped name is used for both validation and mapping, so a
            # name preceded by whitespace (e.g. after '& ') is no longer dropped.
            return {candidate: score
                    for name, score in kept
                    for candidate in candidates
                    if name in candidate}
    return False

In [16]:
# Sanity check: 'kitten engineering' is not a sub-discipline of
# 'engineering and technology', so the whole answer is rejected (returns False).
get_check_results(1, 'medical engineering:100&materials engineering:90&chemical engineering:80&nano-technology:70&kitten engineering:60', previous_level='engineering and technology')

False

In [22]:
from tqdm import tqdm
import json

results_path = 'results_subsample/chatgpt_results_hierarchical2_description_title50.json'
manual_input = True  # Set to False to raise an error instead

# Number of additional model calls allowed after the first answer fails validation.
LEVEL0_MAX_RETRIES = 5
LEVEL1_MAX_RETRIES = 10


def _ask_until_valid(prompt, level, parent='', max_retries=5):
    """Query the model until `get_check_results` accepts the answer.

    Returns a tuple ``(parsed, first_raw)``: `parsed` is the validated structure
    (or a falsy value if every try failed) and `first_raw` is the first raw
    answer, kept for the manual-correction message.
    """
    first_raw = gpt_pairwise_comparison_with_retry(prompt)
    parsed = get_check_results(level, first_raw, parent)
    retries = 0
    while not parsed and retries < max_retries:
        parsed = get_check_results(level, gpt_pairwise_comparison_with_retry(prompt), parent)
        retries += 1
    return parsed, first_raw


# Load existing results or initialize an empty dictionary if the file doesn't exist
try:
    with open(results_path, 'r') as file:
        results = json.load(file)
except FileNotFoundError:
    results = {}

for code, abstract in tqdm(test_dict.items()):
    # Already classified in a previous run: skip.
    if code in results:
        continue

    try:
        # Level 0: which top-level disciplines apply to this abstract?
        prompt = initial_prompt.format(abstract)
        answer, answer_temp = _ask_until_valid(prompt, 0, max_retries=LEVEL0_MAX_RETRIES)

        # Decide on the answer itself, not on the retry counter: an answer that
        # validates on the very last retry must not trigger manual input.
        if not answer:
            if not manual_input:
                continue
            print(f"Manual input required. Current answer: {answer_temp}. Possible answers: {lvl1}")
            # Allow the user to input the correct format
            corrected_answer = input("Please enter the corrected format: ")
            answer = get_check_results(0, corrected_answer)
            if answer is False:
                # Fail with a clear message instead of crashing on `False.keys()`.
                raise ValueError(f"Manually entered answer is not valid: {corrected_answer!r}")

        discipline_set = answer

        # Level 1: refine every accepted top-level discipline into sub-disciplines.
        for ans in list(discipline_set.keys()):
            new_prompt = lvl1_prompts[ans].format(abstract)
            final_answer, final_answer_temp = _ask_until_valid(
                new_prompt, 1, ans, max_retries=LEVEL1_MAX_RETRIES)

            if final_answer:
                discipline_set[ans]['disciplines'] = final_answer
            elif manual_input:
                print(f"Manual input required for {ans}. Current answer: {final_answer_temp}. Possible answers: {lvl12[ans]}")
                # Allow the user to input the correct format
                corrected_answer = input("Please enter the corrected format: ")
                discipline_set[ans]['disciplines'] = {x.split(':')[0].strip(): x.split(':')[1].strip() for x in corrected_answer.split('&') if x.split(':')[0].strip() in lvl12[ans]}
            else:
                print(f"Max attempts reached for {ans}.")
                raise Exception('Max attempts reached for a discipline')  # If final answer doesn't meet criteria after max attempts

        results[code] = discipline_set  # Update results for this code-abstract pair

        # Save after processing each pair so an interrupted run can resume.
        with open(results_path, 'w') as f:
            json.dump(results, f, indent=4)

    except Exception as e:
        # Best effort: report the failure and move on; the item stays unprocessed
        # and is retried on the next run.
        print(f"Error processing {code}: {e}")

  0%|                                                                                         | 0/1569 [00:00<?, ?it/s]

Manual input required. Current answer: social sciences:80&psychology:70&medical and health sciences:60&psychology:50&natural sciences:40. Possible answers: ['natural sciences', 'engineering and technology', 'agricultural sciences', 'medical and health sciences', 'social sciences', 'humanities']
Please enter the corrected format: social sciences:80&medical and health sciences:60&natural sciences:40


  0%|▎                                                                              | 6/1569 [00:44<3:13:18,  7.42s/it]

Manual input required. Current answer: law:80&social sciences:70&social sciences:60&psychology:40&humanities:30. Possible answers: ['natural sciences', 'engineering and technology', 'agricultural sciences', 'medical and health sciences', 'social sciences', 'humanities']
Please enter the corrected format: social sciences:70&humanities:30


100%|██████████████████████████████████████████████████████████████████████████████| 1569/1569 [01:15<00:00, 20.91it/s]


In [None]:
# True when every stored result belongs to a publication in the test set.
set(results) <= set(test_dict)

In [None]:
# Rows of the test set that have no stored result yet.
unprocessed_ids = set(test_dict) - set(results)
test_data[test_data['PUBID'].isin(unprocessed_ids)]

In [None]:
# Display the loaded test set.
test_data

In [None]:
# Display the failure container.
failure

In [None]:
# Sanity check: a well-formed 'name:score&...' answer should pass the structural check.
check_result_structure('media and communications:80&other social sciences:70&economics and business:60')

In [None]:
# Inspect the prompt template used for 'engineering and technology'.
lvl1_prompts['engineering and technology']

In [None]:
# Inspect the last level-0 prompt built by the classification loop.
prompt

In [None]:
# Load a previously saved results file for inspection.
# NOTE(review): this is a different file from `results_path` used by the
# classification loop — confirm it is the intended one.
file_path = 'results_subsample/chatgpt_results_hierarchical2_description_abstract.json'

with open(file_path, 'r') as file:
    json_data = json.load(file)

In [None]:
# Summarise the loaded JSON: its Python type name and, for a dict, its top-level keys.
analysis_result = {
    "type": type(json_data).__name__,
    "keys": "N/A" if not isinstance(json_data, dict) else list(json_data),
}

In [None]:
# Display the summary of the loaded results file.
analysis_result