In [1]:
import csv
import itertools
import requests
import json
import os
import time
from tqdm import tqdm
# Define the function for GPT-3.5 Turbo API calls with retry logic
def gpt_pairwise_comparison_with_retry(prompt, accepted_answers, api_key='api key'):
    """Ask GPT-3.5 Turbo to choose between two options, retrying until it does.

    The reply is stripped of surrounding spaces, periods, commas and
    apostrophes and lower-cased. It is accepted when it is a non-empty
    substring of either accepted answer; the comparison is case-sensitive,
    so accepted answers need to be lower case to match.

    Args:
        prompt: Full user message sent to the chat completions endpoint.
        accepted_answers: Pair of strings the reply is validated against
            (only the first two items are used).
        api_key: OpenAI API key. When left at the placeholder default, the
            ``OPENAI_API_KEY`` environment variable is used if it is set.

    Returns:
        The normalized reply string.

    Raises:
        Exception: ``'max_attempts exceeded'`` once 10 attempts have failed
            (bad status, unusable reply, malformed payload or request error).
    """
    # Prefer a key from the environment over the hard-coded placeholder.
    if api_key == 'api key':
        api_key = os.environ.get('OPENAI_API_KEY', api_key)

    # Define the API endpoint for GPT-3.5 Turbo
    api_url = "https://api.openai.com/v1/chat/completions"

    # Set up the headers with your OpenAI API key
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Define the data payload for the API request
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user",
                      "content": prompt}],
        "max_tokens": 20  # Adjust as needed
    }

    max_attempts = 10  # Maximum number of attempts
    for attempt in range(1, max_attempts + 1):
        delay = 5  # Default pause before the next attempt
        try:
            response = requests.post(api_url, headers=headers, json=data, timeout=60)  # 60-second timeout
            if response.status_code == 200:
                content = response.json()['choices'][0]['message']['content'] or ''
                answer = content.strip(' .,\'').lower()
                # An empty string is a substring of everything, so it must be
                # rejected explicitly or it would be returned as a valid vote.
                if answer and (answer in accepted_answers[0] or answer in accepted_answers[1]):
                    return answer
                print(attempt, answer, accepted_answers[0], accepted_answers[1])
            else:
                print('Status code {}'.format(response.status_code))
                if response.status_code == 429:
                    delay = 60  # Rate limited: back off for longer
        except requests.exceptions.RequestException as e:
            # Covers timeouts as well: Timeout derives from RequestException.
            print(attempt, e)
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Malformed or non-JSON 200 payload: treat as a failed attempt.
            print(attempt, e)
        if attempt < max_attempts:
            time.sleep(delay)

    print(prompt)
    raise Exception('max_attempts exceeded')

In [None]:
# Function to perform pairwise comparison for disciplines
def compare_disciplines(abstract, disciplines, base_folder):
    """Count how many pairwise comparisons each discipline wins.

    For every unordered pair of disciplines, the prompt template
    ``<discipline1>_vs_<discipline2>.txt`` in ``base_folder`` is filled with
    ``abstract`` via ``str.format`` and sent to
    ``gpt_pairwise_comparison_with_retry``. Pairs without a template file
    are skipped.

    Args:
        abstract: Text substituted into the prompt template.
        disciplines: Iterable of discipline names.
        base_folder: Directory holding the prompt template files.

    Returns:
        Dict mapping every discipline to its number of wins.
    """
    discipline_scores = {discipline: 0 for discipline in disciplines}
    for discipline1, discipline2 in itertools.combinations(disciplines, 2):
        prompt_path = os.path.join(base_folder, f"{discipline1}_vs_{discipline2}.txt")
        if not os.path.exists(prompt_path):
            continue
        # Read the template first so the file is not held open during the API call.
        with open(prompt_path, 'r') as file:
            template = file.read()
        prompt = template.format(abstract)
        selected_discipline = gpt_pairwise_comparison_with_retry(
            prompt, accepted_answers=(discipline1, discipline2))
        if not selected_discipline:
            # Empty/None reply carries no vote ('' is a substring of any name).
            continue
        # Exact matches take priority: when one name contains the other
        # (e.g. "chemistry" / "biochemistry") a substring test alone would
        # always credit discipline1.
        if selected_discipline == discipline1:
            discipline_scores[discipline1] += 1
        elif selected_discipline == discipline2:
            discipline_scores[discipline2] += 1
        elif selected_discipline in discipline1:
            discipline_scores[discipline1] += 1
        elif selected_discipline in discipline2:
            discipline_scores[discipline2] += 1
    return discipline_scores

In [None]:
# Function to perform pairwise comparison for subdisciplines
def compare_subdisciplines(abstract, discipline, subdisciplines, base_folder):
    """Count how many pairwise comparisons each subdiscipline wins.

    Prompt templates are looked up as ``<sub1>_vs_<sub2>.txt`` inside
    ``base_folder/<discipline>``; each is filled with ``abstract`` via
    ``str.format`` and sent to ``gpt_pairwise_comparison_with_retry``.
    Pairs without a template file are skipped.

    Args:
        abstract: Text substituted into the prompt template.
        discipline: Parent discipline; names the sub-folder of templates.
        subdisciplines: Iterable of subdiscipline names.
        base_folder: Root directory of the prompt templates.

    Returns:
        Dict mapping every subdiscipline to its number of wins.
    """
    subdiscipline_scores = {subdiscipline: 0 for subdiscipline in subdisciplines}
    discipline_folder = os.path.join(base_folder, discipline)
    for sub1, sub2 in itertools.combinations(subdisciplines, 2):
        prompt_path = os.path.join(discipline_folder, f"{sub1}_vs_{sub2}.txt")
        if not os.path.exists(prompt_path):
            continue
        # Read the template first so the file is not held open during the API call.
        with open(prompt_path, 'r') as file:
            template = file.read()
        prompt = template.format(abstract)
        selected_subdiscipline = gpt_pairwise_comparison_with_retry(
            prompt, accepted_answers=(sub1, sub2))
        if not selected_subdiscipline:
            # Empty/None reply carries no vote ('' is a substring of any name).
            continue
        # Exact matches take priority: when one name contains the other,
        # a substring test alone would always credit sub1.
        if selected_subdiscipline == sub1:
            subdiscipline_scores[sub1] += 1
        elif selected_subdiscipline == sub2:
            subdiscipline_scores[sub2] += 1
        elif selected_subdiscipline in sub1:
            subdiscipline_scores[sub1] += 1
        elif selected_subdiscipline in sub2:
            subdiscipline_scores[sub2] += 1
    return subdiscipline_scores

In [None]:
# Define the main processing function
def _write_json_atomically(payload, path):
    """Dump payload to path via a temp file so an interrupted write cannot corrupt it."""
    temp_path = path + '.tmp'
    with open(temp_path, 'w') as json_file:
        json.dump(payload, json_file, indent=4)
    os.replace(temp_path, path)


def process_publications(test_file_path, base_prompt_folder, discipline_data, output_file_path,
                         min_discipline_score=4):
    """Classify every publication in a CSV and checkpoint results to JSON.

    Each row must provide ``PUBID`` and ``ABSTRACT`` columns. Publications
    whose id is already present in the output file are skipped, so the
    function can be re-run to resume an interrupted job. The output file is
    rewritten after every processed publication.

    Args:
        test_file_path: CSV file with the publications.
        base_prompt_folder: Root folder of the pairwise prompt templates.
        discipline_data: Mapping of discipline -> mapping keyed by
            subdiscipline names.
        output_file_path: JSON file used for loading and saving results.
        min_discipline_score: Minimum number of pairwise wins a discipline
            needs before its subdisciplines are compared.

    Returns:
        None. Prints the number of publications attempted in this run.
    """
    # Try to load existing results from the JSON file
    if os.path.exists(output_file_path):
        with open(output_file_path, 'r') as json_file:
            results_dict = json.load(json_file)
    else:
        results_dict = {}

    # Make sure the checkpoint can be written on a fresh checkout.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load all rows up front so the progress bar gets the real total.
    # newline='' lets the csv module handle quoted fields with line breaks.
    with open(test_file_path, mode='r', newline='') as file:
        rows = list(csv.DictReader(file))

    disciplines = list(discipline_data.keys())
    count = 0
    for row in tqdm(rows, total=len(rows)):
        pubid = None
        try:
            pubid = row['PUBID']

            # Skip processing if results already exist for this publication
            if pubid in results_dict:
                continue
            count += 1
            abstract = row['ABSTRACT']
            discipline_scores = compare_disciplines(abstract, disciplines, base_prompt_folder)

            # Compare subdisciplines for disciplines that scored high enough
            publication_result = {}
            for discipline, score in discipline_scores.items():
                if score >= min_discipline_score and discipline in discipline_data:
                    subdisciplines = list(discipline_data[discipline].keys())
                    subdiscipline_scores = compare_subdisciplines(
                        abstract, discipline, subdisciplines, base_prompt_folder)
                else:
                    subdiscipline_scores = {}
                publication_result[discipline] = {'score': score, 'subdisciplines': subdiscipline_scores}

            # Update results
            results_dict[pubid] = publication_result
            _write_json_atomically(results_dict, output_file_path)
        except Exception as e:
            # Best effort: report the failing publication and move on.
            print(f'{pubid}: {e}')
            continue
    print(count)

# Paths
test_file_path = 'H:/data/WoS_data/data_gpt10.csv'
base_prompt_folder = 'prompts/pairwise_comparison'
output_file_path = 'results/ChatGPT/intermediate_results/chatgpt_hierarchical_pairwise.json'

# Load the discipline -> subdiscipline structure, closing the file afterwards
with open('discipline_structure.json', 'r') as structure_file:
    discipline_data = json.load(structure_file)

# Process the publications
process_publications(test_file_path, base_prompt_folder, discipline_data, output_file_path)