## ChatGPT API Institution Imputation

Get an API key from OpenAI:
https://openai.com/

To do this, either log in to or sign up for an OpenAI account. Then, click your account icon in the upper right-hand corner and select "View API keys" from the drop-down menu. Any existing API keys will be displayed here. If you do not have an API key, click the "+ Create new secret key" button to generate one.

In [None]:
import openai
import pandas as pd
from tqdm import tqdm
import re
import time
from retrying import retry
from tqdm.auto import tqdm


# Load the scrape-result CSV into a pandas DataFrame. df1 is the table whose
# rows are later passed, one at a time, to get_institute_name().
df1 = pd.read_csv("./data/output/sample_data_Scrape_Result.csv")
# Report how many rows were loaded.
print(f'Length of df1: {len(df1)}')

In [None]:
# Prompt for the OpenAI API key and store it on the openai module.
# getpass() is used instead of input() so the secret is not echoed back and
# does not end up in the notebook's saved output.
from getpass import getpass

# Strip surrounding whitespace so a pasted key with a stray space/newline still
# works and a blank-only entry is treated as missing.
openai.api_key = getpass('Please enter your OpenAI API Key and press Enter:').strip()
if openai.api_key:
    # Show only the tail of the key so the user can confirm which key was entered.
    print(f'API Key Entered successfully, Last seven characters: {openai.api_key[-7:]}')
else:
    # exit() is a helper that the `site` module provides for interactive
    # shells; raising SystemExit is the explicit way to stop from code.
    raise SystemExit('Enter valid API Key')

In [None]:
# df1 = df1.drop(['Country_chatGPT', 'Lat_chatGPT', 'Long_chatGPT'], axis=1)

In [None]:
# The targets dataframe is defined for targeting specific accessions of interest:
# the filtering cell below keeps only the df1 rows whose 'Acc' value appears in
# targets['Acc'].
# NOTE(review): this path is the same file that was loaded into df1, so as
# written the filter presumably keeps every row — point it at a narrower CSV to
# restrict the run to selected accessions.
targets = pd.read_csv('./data/output/sample_data_Scrape_Result.csv')

In [None]:
# Preview the first five target rows (five is head()'s default).
targets.head()

In [None]:
# Keep only the rows of df1 whose accession is listed in the targets table.
df1 = df1.loc[df1['Acc'].isin(targets['Acc'])]

In [None]:
# Preview the first five rows of the filtered table (five is head()'s default).
df1.head()

In [None]:
# Spot-check a single accession in the filtered table.
df1.loc[df1['Acc'] == 'SRR11788653']

In [None]:
# Report how many rows df1 holds at this point in the notebook.
print(f'Length of df1: {len(df1)}')

In [None]:
# Trial-run subset: keep only the leading test_len rows of df1.
# Lower test_len to send fewer rows to the API.
test_len = 539
df1 = df1.iloc[:test_len]
print(f'Length of df1 for test: {len(df1)}')

In [None]:
%timeit
# Register tqdm's pandas integration.
# NOTE(review): nothing visible in this notebook section uses it (the main loop
# wraps iterrows() in tqdm directly) — confirm whether it is still needed.
tqdm.pandas()
@retry(stop_max_attempt_number=3, wait_fixed=60000)  # retry up to 3 times, waiting 60 seconds between each retry
def _ask_chatgpt(message_history, query_number, model="gpt-3.5-turbo"):
    """Send the conversation so far to the chat model and return the reply text.

    The retry decorator sits on this single request rather than on the whole
    three-question conversation, so a failure on a later question does not
    repeat the questions that were already answered.

    Args:
        message_history: List of ``{"role": ..., "content": ...}`` dicts that
            make up the conversation, ending with the question to answer.
        query_number: Number used only in the printed progress line.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice in the completion, unmodified.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=message_history,
    )
    reply_content = completion.choices[0].message.content
    print(f'Query answer {query_number}: {reply_content}')
    return reply_content


def get_institute_name(row, model="gpt-3.5-turbo"):
    """Ask ChatGPT for the institution, country and coordinates behind one row.

    Three questions are asked in a single conversation: which institution the
    row's submitter fields refer to, which country it is located in, and what
    its GPS coordinates are. Each reply is printed as it arrives.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'Center_Names', 'Submitted_by', 'Biosample_Submission' and
            'Bioproject_Submission'. Each value is converted with ``str()``.
        model: Name of the chat model to query; defaults to the model this
            notebook has always used.

    Returns:
        A tuple ``(institution, country, LatLong)`` holding the three raw
        reply strings exactly as returned by the model.

    Raises:
        Whatever the OpenAI client raises, once the per-request retries are
        exhausted.
    """
    # The four submitter fields are quoted and comma-separated inside the prompt.
    candidate_names = '", "'.join(
        str(row[column])
        for column in (
            'Center_Names',
            'Submitted_by',
            'Biosample_Submission',
            'Bioproject_Submission',
        )
    )
    message_question = (
        'If "' + candidate_names + '" represent only one institute and '
        'If institution keywords are "EBI" or "GEO" or "Gene Expression Omnibus" or "NCBI" '
        'or "National Center for Biotechnology Information (NCBI)" '
        'or "National Center for Biotechnology Information" '
        'or "NCBI (Gene Expression Omnibus)" '
        'or "NCBI (National Center for Biotechnology Information)" '
        'or "NCBI-GEO" or "EBI (European Bioinformatics Institute)" '
        'ignore them and choose another option for the one institution, what is it? '
        'Please just provide the name only, do not reply with a sentence nor an abbreviation.'
    )
    message_history = [{"role": "user", "content": message_question}]

    # Question 1: which institution is it?
    institution = _ask_chatgpt(message_history, 1, model=model)

    # Question 2: which country? The previous answer is fed back into the
    # conversation with newlines removed; the value returned to the caller
    # stays the raw reply.
    message_history.append({"role": "assistant", "content": institution.replace("\n", "")})
    message_history.append({"role": "user", "content": "What country is the institution located? Please just provide the name of the country only, do not reply with a sentence nor an abbreviation."})
    country = _ask_chatgpt(message_history, 2, model=model)

    # Question 3: what are the coordinates?
    message_history.append({"role": "assistant", "content": country.replace("\n", "")})
    message_history.append({"role": "user", "content": "What GPS Coordinate is the institution? Please just provide the Latitude and Longitude separated by comma only, do not reply with a sentence nor an abbreviation."})
    LatLong = _ask_chatgpt(message_history, 3, model=model)

    return (institution, country, LatLong)


In [None]:
import pandas as pd

# Query ChatGPT for every row of df1 and write the annotated rows to numbered
# CSV partitions, so that a long run leaves partial results on disk.

# Maximum number of rows saved in each partition file.
partition_size = 37

# Rows collected since the last partition was written, and the number used in
# the next partition's file name.
results = []
partition_count = 1


def _split_lat_long(lat_long):
    """Split a 'latitude, longitude' reply into a (lat, long) pair of strings.

    Surrounding whitespace is removed from both parts. Returns (None, None)
    when the reply contains no comma to split on.
    """
    parts = lat_long.split(',')
    if len(parts) < 2:
        return None, None
    return parts[0].strip(), parts[1].strip()


def _save_partition(frames, number):
    """Write the collected one-row frames to a numbered CSV and return its path."""
    filename = f'./data/output/chatgpt_results_{number}.csv'
    pd.concat(frames).to_csv(filename, index=False)
    print(f'Saved partition {number}')
    return filename


# Print a message to warn the user that the following process may take awhile to complete
# The sample dataset seems to take at least 30 minutes
print('Note: The following process may take 30 minutes or longer to complete.')

# `position` counts processed rows; `index` is the DataFrame label, which is
# not necessarily consecutive once df1 has been filtered.
for position, (index, row) in enumerate(tqdm(df1.iterrows(), total=len(df1))):
    # Pause for a minute after every 20 processed rows. This is keyed on the
    # row count rather than the index label so the pauses stay evenly spaced.
    if position > 0 and position % 20 == 0:
        time.sleep(60)

    institute_res = get_institute_name(row)

    # Take an explicit copy of the one-row frame so the new columns are added
    # to an independent object rather than to a slice of df1.
    result = df1.iloc[[position]].copy()
    result['Institute_chatGPT'] = institute_res[0]
    result['Country_chatGPT'] = institute_res[1]
    # Both coordinates are None when the reply cannot be split on a comma.
    result['Lat_chatGPT'], result['Long_chatGPT'] = _split_lat_long(institute_res[2])

    results.append(result)

    # if the partition size is reached, save the results as a CSV file
    if len(results) == partition_size:
        filename = _save_partition(results, partition_count)

        # reset the results list and increment the partition counter
        results = []
        partition_count += 1

# save any remaining results as a CSV file
if results:
    filename = _save_partition(results, partition_count)