# Run Set/Subset queries

### Import packages

In [1]:
import openai
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
from datetime import datetime
import time

from dotenv import load_dotenv

load_dotenv()

In [None]:
## Secrets management - Don't spill your API keys!
openai.api_key = os.getenv('openai.api_key')

### Define Functions

In [10]:
def get_correct_article(text):
    ## simple logic to make articles match their targets
    if text[0].lower() in ['a', 'e', 'i', 'o', 'u']:
        article_variant = "an"
    else:
        article_variant = "a"
    
    return article_variant

def ask_question(subset, superset):
    time.sleep(1)
    article_variant = get_correct_article(superset)
    set_subset = f"{subset} is {article_variant} {superset}?"
    boilerplate = f"Answer True or False, then explain your rationale, and provide a numeric confidence score as a percentage out of 100: {set_subset}"
    response_basic = openai.Completion.create(model="text-davinci-003", prompt=boilerplate, temperature=0, max_tokens=1000)
    
    return response_basic['choices'][0]['text']

### Load dataset

In [4]:
# Load raw Wikidata dump, without category filters
df = pd.read_csv(r"wikidata_sample_df.csv")

# Snip off columns we don't care about/won't use
df = df.drop(columns=['Unnamed: 0', 'class.type', 'item.type', 'itemLabel.xml:lang', 'itemLabel.type', 'classLabel.type','subclass.type', 'subclass.value', 'subclassLabel.xml:lang', 'subclassLabel.type', 'subclassLabel.value'])

In [6]:
# Specify which categories we'll do queries against
cats = ['human','album','village','film','river', 'business','musical group','literary work','mountain','television series']
df = df[df['classLabel.value'].isin(cats)]

In [8]:
# Check the characteristics of the resultant dataframe
len(df)

46463

In [9]:
df['classLabel.value'].value_counts()

human                35005
album                 2514
village               2072
film                  1872
river                 1195
business              1104
musical group          913
literary work          825
mountain               482
television series      481
Name: classLabel.value, dtype: int64

### Prep the dataframe for long-running queries
Without paying for enhanced speed, OpenAI rate-limits you to one API call per second. Over 46k records, that's 
a significant amount of time, and we don't want to have to re-run it.  
We used this technique to split the large dataframe (46k rows) into smaller dataframes of of 50 records each.
This lets us run incremental queries without fear of dropping a large number of rows if there's a timeout or other error.

In [20]:
df = df.sort_values(by='classLabel.value').reset_index(drop=True)

In [54]:
## Not particularly readable, but this splits the dataframe into chunks of 50 rows each
n = 50
list_df = [unfinished_df[i:i+n] for i in range(0,len(unfinished_df),n)]

In [55]:
# Check how many sub-DFs we've generated
len(list_df)

476

### Run queries

In [None]:
starttime = datetime.now()
print(f"Started at {starttime.strftime('%m/%d/%Y, %H:%M:%S')}")

for sub_df in tqdm(list_df):
    sub_df['set subset response'] = sub_df.apply(lambda x: ask_question(x['itemLabel.value'], x['classLabel.value']), axis=1)
    
stoptime = datetime.now()
print(f"Finished at {stoptime.strftime('%m/%d/%Y, %H:%M:%S')}")
print(f"Ran in {str(stoptime - starttime)}")

In [36]:
# Re-join the output
df = pd.concat(list_df)

### Save results

In [None]:
df.to_csv("set_subset_responses.csv", index=False)