In [1]:
import time
from tqdm import tqdm
import json
import tiktoken
import pandas as pd

In [2]:
def gpt_with_retry(prompt, api_key='api key'):
    """Send ``prompt`` to the OpenAI chat-completions API and return the reply text.

    The request is tried up to 10 times. Each of the following counts as one
    failed attempt: a non-200 status code, a network error or timeout, and a
    200 response whose body does not contain ``choices[0].message.content``.

    Args:
        prompt: Text sent as the single ``user`` message.
        api_key: OpenAI API key placed in the ``Authorization`` header. The
            default value is only a placeholder; pass a real key.

    Returns:
        The reply content with leading/trailing spaces, periods and commas
        removed, converted to lower case.

    Raises:
        Exception: with message ``'max_attempts exceeded'`` once every
            attempt has failed. The prompt is printed first to aid debugging.
    """
    # No cell visible in this notebook imports `requests`, so bind it here to
    # keep the function usable on a fresh kernel.
    import requests

    # Chat-completions endpoint used for gpt-3.5-turbo
    api_url = "https://api.openai.com/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user",
                      "content": prompt}],
    }

    max_attempts = 10  # Maximum number of attempts
    for attempt in range(1, max_attempts + 1):
        delay = 5  # Default pause (seconds) before the next attempt
        try:
            response = requests.post(api_url, headers=headers, json=data, timeout=180)  # 180-second timeout
            if response.status_code == 200:
                content = response.json()['choices'][0]['message']['content']
                return content.strip(' .,').lower()
            print('Status code {}'.format(response.status_code))
            if response.status_code == 429:
                delay = 60  # Rate limited: back off for longer
        except requests.exceptions.RequestException as e:
            # Timeout is a subclass of RequestException, so one handler covers both.
            print(attempt, e)
        except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
            # 200 status but the body is not the expected JSON structure: retry.
            print(attempt, 'Unexpected response body: {!r}'.format(e))
        if attempt < max_attempts:
            # No point in waiting after the final attempt.
            time.sleep(delay)
    print(prompt)
    raise Exception('max_attempts exceeded')

In [3]:
# Load only the three columns used below; every value is kept as a string.
test_data = pd.read_csv(
    'H:/data/WoS_data/data_gpt50.csv',
    usecols=['PUBID', 'ABSTRACT', 'final_disciplines'],
    dtype=str,
)

In [4]:
# Map each PUBID to its ABSTRACT (a repeated PUBID keeps its last abstract).
test_dict = dict(zip(test_data['PUBID'], test_data['ABSTRACT']))

In [5]:
# Read the prompt template; the context manager closes the file handle,
# which the bare open(...).read() form left to the garbage collector.
with open('prompts/non-hierarchical/description.txt', 'r') as f:
    prompt = f.read()

In [6]:
# Flatten the two-level JSON mapping into one list: the top-level keys are
# ignored and the keys of each nested mapping are collected in file order.
disciplines = []
with open('discipline_structure.json', 'r') as f:
    data = json.load(f)
disciplines.extend(name for group in data.values() for name in group.keys())

In [7]:
# Display the flattened list of discipline names as a visual sanity check.
disciplines

['agriculture, forestry, and fisheries',
 'animal and dairy science',
 'veterinary science',
 'agricultural biotechnology',
 'other agricultural sciences',
 'civil engineering',
 'electrical engineering, electronic engineering, information engineering',
 'mechanical engineering',
 'chemical engineering',
 'materials engineering',
 'medical engineering',
 'environmental engineering',
 'environmental biotechnology',
 'industrial biotechnology',
 'nano-technology',
 'other engineering and technologies',
 'history',
 'archaeology',
 'languages and linguistics',
 'literature',
 'philosophy and ethics',
 'religion',
 'arts (arts, history of arts, performing arts, music)',
 'other humanities',
 'basic medicine',
 'clinical medicine',
 'health sciences',
 'health biotechnology',
 'other medical sciences',
 'mathematics',
 'computer and information sciences',
 'physical sciences',
 'chemical sciences',
 'earth and related environmental sciences',
 'biological sciences',
 'other natural sciences

In [8]:
import re
# Compiled once instead of on every call. A raw string is used so that `\-`
# and `\s` reach the regex engine unchanged instead of being treated as
# (invalid) Python string escapes.
_RESULT_STRUCTURE_RE = re.compile(r"^(([a-z,\-()\s]+):\s?[0-9]*\s?&?)+$")


def check_result_structure(result):
    """Return True if ``result`` looks like ``name: score&name: score&...``.

    A name may contain lower-case letters, commas, hyphens, parentheses and
    whitespace. The check is purely structural: it does not verify that names
    are known disciplines, and the pattern also accepts an empty score and a
    trailing ``&``.

    Args:
        result: The text to check. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    if not isinstance(result, str):
        return False
    return bool(_RESULT_STRUCTURE_RE.match(result))

In [9]:
# Sanity check: a five-item answer in the `name: score&...` format should be accepted.
check_result_structure('languages and linguistics: 85&social and economic geography: 80&history: 75&arts (arts, history of arts, performing arts, music): 70&education sciences: 65')

True

In [10]:
def get_check_results(result):
    """Validate a ``name: score&name: score`` answer and convert it to a dict.

    The answer is rejected (``False``) when it fails ``check_result_structure``,
    when any ``&``-separated item has no ``:`` or a non-numeric score, or when
    any item's name is not contained in at least one entry of the module-level
    ``disciplines`` list.

    NOTE(review): acceptance uses substring containment, while the returned
    dict keeps only names that equal a discipline exactly. An accepted answer
    can therefore yield fewer entries than it has items, or even an empty
    (falsy) dict. This matches the previous behaviour; confirm it is intended.

    Args:
        result: Answer text, e.g. ``'history:30&law:50'``.

    Returns:
        dict mapping discipline name to integer score, or ``False`` if the
        answer is rejected.
    """
    # The structural check used to be called with its result thrown away.
    if not check_result_structure(result):
        return False

    parsed = []
    for item in result.split('&'):
        # Split each item once; an item without ':' (for example the empty
        # string left by a trailing '&') previously raised IndexError.
        name, separator, score = item.partition(':')
        name, score = name.strip(), score.strip()
        if not separator or not score.isnumeric():
            return False
        if not any(name in discipline for discipline in disciplines):
            return False
        parsed.append((name, int(score)))

    return {name: score for name, score in parsed if name in disciplines}

In [11]:
# Sanity check: a well-formed answer is converted to a {discipline: int score} dict.
get_check_results('history:30&law:50&economics and business:60')

{'history': 30, 'law': 50, 'economics and business': 60}

In [44]:
import json
from tqdm import tqdm
RESULTS_PATH = 'results_subsample/chatgpt_results_non_hierarchical.json'
MAX_FORMAT_RETRIES = 5  # Re-queries allowed after the first malformed answer
manual_input = False  # Change to True to allow manual correction of the gpt-3.5-turbo response


def _parse_manual_answer(text):
    """Turn a hand-typed ``name: score&name: score`` string into ``{name: int}``.

    Items are skipped when they have no ':', when the name is not in
    ``disciplines``, or when the score is not a decimal number. Scores are
    stored as ``int`` so manual entries match what ``get_check_results`` returns.
    """
    parsed = {}
    for item in text.split('&'):
        parts = item.split(':')
        if len(parts) < 2:
            continue
        name, score = parts[0].strip(), parts[1].strip()
        if name in disciplines and score.isdecimal():
            parsed[name] = int(score)
    return parsed


# Resume from an earlier run when a results file already exists.
try:
    with open(RESULTS_PATH, 'r') as f:
        results = json.load(f)
except FileNotFoundError:
    results = {}

for code, abstract in tqdm(test_dict.items()):
    if code in results:
        continue  # Already answered in a previous run
    request = prompt.format(abstract)
    answer = False
    attempt = 0
    while not answer:
        raw_answer = gpt_with_retry(request)
        answer = get_check_results(raw_answer)
        if answer:
            break
        if attempt >= MAX_FORMAT_RETRIES:  # Limit attempts to prevent infinite loops
            if manual_input:
                print(f"Manual input required. Current answer: {raw_answer}. Possible answers: {disciplines}")
                # Allow the user to input the correct format. An unusable
                # correction leaves `answer` empty, so the model is queried
                # again and the user is prompted once more.
                corrected_answer = input("Please enter the corrected format: ")
                answer = _parse_manual_answer(corrected_answer)
            else:
                print(f"Max attempts reached for {code}")
                break
        attempt += 1
    if answer:
        results[code] = answer
        # Rewrite the file after every new answer so an interrupted run
        # keeps everything collected so far.
        with open(RESULTS_PATH, 'w') as f:
            json.dump(results, f)
            

  0%|                                                                                         | 0/1582 [00:00<?, ?it/s]

Manual input required. Current answer: physical sciences:80&materials engineering:70&nanotechnology:60&engineering and technologies:50&medical engineering:40. Possible answers: ['agriculture, forestry, and fisheries', 'animal and dairy science', 'veterinary science', 'agricultural biotechnology', 'other agricultural sciences', 'civil engineering', 'electrical engineering, electronic engineering, information engineering', 'mechanical engineering', 'chemical engineering', 'materials engineering', 'medical engineering', 'environmental engineering', 'environmental biotechnology', 'industrial biotechnology', 'nano-technology', 'other engineering and technologies', 'history', 'archaeology', 'languages and linguistics', 'literature', 'philosophy and ethics', 'religion', 'arts (arts, history of arts, performing arts, music)', 'other humanities', 'basic medicine', 'clinical medicine', 'health sciences', 'health biotechnology', 'other medical sciences', 'mathematics', 'computer and information s

  9%|███████▏                                                                       | 145/1582 [00:31<05:17,  4.53it/s]

Manual input required. Current answer: economics and business: 75&social and economic geography: 65&statistics and probability: 60&computer and information sciences: 55&mathematics: 50. Possible answers: ['agriculture, forestry, and fisheries', 'animal and dairy science', 'veterinary science', 'agricultural biotechnology', 'other agricultural sciences', 'civil engineering', 'electrical engineering, electronic engineering, information engineering', 'mechanical engineering', 'chemical engineering', 'materials engineering', 'medical engineering', 'environmental engineering', 'environmental biotechnology', 'industrial biotechnology', 'nano-technology', 'other engineering and technologies', 'history', 'archaeology', 'languages and linguistics', 'literature', 'philosophy and ethics', 'religion', 'arts (arts, history of arts, performing arts, music)', 'other humanities', 'basic medicine', 'clinical medicine', 'health sciences', 'health biotechnology', 'other medical sciences', 'mathematics', 

 10%|███████▊                                                                       | 156/1582 [01:07<12:07,  1.96it/s]

Manual input required. Current answer: clinical medicine:100&pharmacology and pharmacy:90&basic medicine:80&medical engineering:70&health sciences:60. Possible answers: ['agriculture, forestry, and fisheries', 'animal and dairy science', 'veterinary science', 'agricultural biotechnology', 'other agricultural sciences', 'civil engineering', 'electrical engineering, electronic engineering, information engineering', 'mechanical engineering', 'chemical engineering', 'materials engineering', 'medical engineering', 'environmental engineering', 'environmental biotechnology', 'industrial biotechnology', 'nano-technology', 'other engineering and technologies', 'history', 'archaeology', 'languages and linguistics', 'literature', 'philosophy and ethics', 'religion', 'arts (arts, history of arts, performing arts, music)', 'other humanities', 'basic medicine', 'clinical medicine', 'health sciences', 'health biotechnology', 'other medical sciences', 'mathematics', 'computer and information sciences'

100%|██████████████████████████████████████████████████████████████████████████████| 1582/1582 [01:41<00:00, 15.55it/s]


In [41]:
# Sanity check: every name in this answer is in the discipline list, so all five scores are returned.
get_check_results('earth and related environmental sciences:100&physical sciences:90&materials engineering:80&chemical engineering:70&medical engineering:60')

{'earth and related environmental sciences': 100,
 'physical sciences': 90,
 'materials engineering': 80,
 'chemical engineering': 70,
 'medical engineering': 60}