<a href="https://colab.research.google.com/github/AnshumanAryan24/AlphaPro-SemEval2025-Task8/blob/main/notebooks/AlphaProQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import json
import os
import pickle

import datasets
import pandas as pd
import requests
from together import Together

In [None]:
# Read the key from the TOGETHER_API_KEY environment variable when it is set,
# so the real secret never has to be saved inside the notebook. The
# placeholder is kept as the fallback for users who paste a key here instead.
api_key = os.environ.get("TOGETHER_API_KEY", "YOUR_API_KEY")

In [None]:
# Together API client shared by the question-rewriting and code-generation calls.
client = Together(api_key=api_key)

# Load Databases

In [None]:
# Mapping used later as datasetDict[<dataset name>] to obtain the table for a question.
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# a 'datasetDict.pkl' that comes from a trusted source.
with open('datasetDict.pkl', 'rb') as file:
    datasetDict = pickle.load(file)

In [None]:
# Load the question/answer dataset from the local './qaDataset' directory.
qaDataset = datasets.load_from_disk('./qaDataset')

In [None]:
# Column views over the QA dataset; they are indexed by row position below.
ques = qaDataset['question']
# NOTE(review): this rebinds the name `datasets`, hiding the imported `datasets`
# module from here on - re-running the load_from_disk cell afterwards will fail.
datasets = qaDataset['dataset']
expAns = qaDataset['answer']

# Prompt Constants

In [None]:
# Instruction prefix for the question-rewriting step. processQuestion appends
# the concrete question, dataset name and schema to it, and relies on the
# 'Answer Type:' / 'Paraphrased Question:' labels shown in the examples when
# reading the model's reply.
# NOTE(review): the examples label the dataset as 'Dataset Name:' while
# processQuestion sends 'Dataset:' - confirm whether that difference matters.
rewrite_prompt = """You will be provided with two pieces of information. The first being a question and the second being the column names along with data types of a dataset. Your objective is twofold, the first to predict the datatype of the answer and second to paraphrase the question aptly such that the next person could generate the python code to required to answer the question while keeping the answer type the same as the given question. You are provided a two examples below.
Remember to not change what the original question is actually asking.

Important Notes:
Do not use markdown. Ever.
Do not leave additional line spacing
Just follow the template. Do not give anything else.

Few Shot Examples:
Question: Is the person with the highest net worth self-made?
Dataset Name: 001_Forbes
Dataset Table Schema: selfMade (bool), finalWorth (int64), city (string), title (string), gender (string), age (float64), rank (int64), philanthropyScore (float64), category (string), source (string), country (string)
Answer Type: bool
Paraphrased Question: Does the billionaire with the maximum final worth have self made attribute set to True?

Question: Did any children below the age of 18 survive?
Dataset Name: 002_Titanic
Dataset Table Schema: Age (float64), Siblings_Spouses Aboard (int64), Sex (string), Name (string), Pclass (int64), Fare (float64), Survived (bool)
Answer Type: bool
Paraphrased Question: Were there any survivors aged under 18?

The answers types are only of type: [bool, float64, int64, string, list of (type)]

Instruction for you to perform:
"""

# Instruction prefix for the code-generation step. generateCode appends the
# dataset name, schema, (rewritten) question and expected answer type to it;
# the reply is expected to define a function named 'answer_question'.
# NOTE(review): the function template names its first parameter 'db' while the
# worked example uses 'dataset' - confirm which one the model should follow.
# NOTE(review): the example calls Series.bool(), which is deprecated in recent
# pandas releases - verify against the pandas version in use.
codegen_prompt = '''You will be provided four pieces of information all of which are provided in the means of strings.
1. Dataset name:
2. Dataset Table Schema:
3. Question:
4. Expected Answer Type:

Your objective is to create a python code to answer the question given the dataset schema. Here is the function you will be needing to complete:
def answer_question(db, datasetTableSchema, question, expectedAnswerType):
	answer = (Here you generate the code which is needed to find the answer)
	return answer

Assume that the pandas library has been imported as pd.
Your answer should only contain the function definition. Assume that the dataset schema (containing column names and their datatypes in paranthesis) given is correct. The generated code should be correct. Do not attempt to change the dataset.
Your final answer data type should be one of the following categories:
1. Boolean: One of True or False.
2. Category: A string. For example - CEO, hello, drugstores.
3. Number: A numerical value. For example - 20, 23.3223, 414901.0.
4. list[category]: A list of strings. For example - ['India', 'Japan', 'China'], ['Ram', 'Shyam', 'Mohan']. Here, each entry should be enclosed within single quotes.
5. list[number]: A list of numbers. For example - [20.0, 30.4, 42.1], [171000, 129000, 111000, 107000, 106000, 91400].
When the question requests more than value, the expected answer type might be a list of strings or numbers. Ensure that lists are enclosed within square brackets.

Note:
Do not use markdown
Only complete the answer_question function. Assume all other codes outside of it is taken care of.
Generate the below code only for python

Few Shot Examples:
Example 1:
1. Dataset name: 001_Forbes
2. Dataset Table Schema: selfMade (bool), finalWorth (int64), city (string), title (string), gender (string), age (float64), rank (int64), philanthropyScore (float64), category (string), source (string), country (string)
3. Question: Does the individual with the highest final worth value have the selfMade attribute set to True?
4. Expected Answer Type: bool

Answer:
def answer_question(dataset, datasetTableSchema, question, expectedAnswerType):
	max_worth_individual = dataset.loc[dataset["finalWorth"] == dataset["finalWorth"].max()]
	is_self_made = max_worth_individual["selfMade"].bool()

	return is_self_made

Now, complete the following:'''

# Auxiliary Functions

In [None]:
def extractFunctionFromString(function_str:str):
    '''
    Execute a code string and return the 'answer_question' function it defines.

    The code runs in a copy of this module's globals (plus 'function_str'
    itself), so the returned function can use every module-level name that
    exists at the time of the call.

    SECURITY: exec() runs the string with the full privileges of this process.
    The string is model-generated, so only run this in an environment where
    executing untrusted code is acceptable.

    Parameters:
      function_str: source code expected to contain 'def answer_question(...)'.

    Returns:
      The callable bound to the name 'answer_question' after execution.

    Raises:
      KeyError: the executed code did not define 'answer_question'.
      TypeError: 'answer_question' was defined but is not callable.
      Any exception raised while compiling or executing the code string.
    '''
    namespace = {**globals(), 'function_str': function_str}
    # A leftover global 'answer_question' must not be mistaken for the result
    # of this call when the new code string fails to define one.
    namespace.pop('answer_question', None)

    exec(function_str, namespace)

    if 'answer_question' not in namespace:
        raise KeyError("code string does not define 'answer_question'")
    function = namespace['answer_question']  # 'answer_question' is the default name in the code string
    if not callable(function):
        raise TypeError("'answer_question' in the code string is not callable")
    return function

In [None]:
def getDatasetSchema(df:pd.DataFrame) -> list[str]:
  '''
  Describe every column of a pandas.DataFrame as "column name (data type)".

  The data type is the dtype's own name (for example bool, int64, float64),
  except that the generic 'object' dtype is reported as 'string'.

  Parameters:
    df: the table to describe.

  Returns:
    One entry per column, in column order. An empty DataFrame gives [].
  '''
  # Build the list directly: joining into one string and splitting on ", "
  # again would tear apart column names that themselves contain ", ".
  return [
      f"{col} ({'string' if dtype.name == 'object' else dtype.name})"
      for col, dtype in df.dtypes.items()
  ]

# Question Transformation Function

In [None]:
def processQuestion(model_name, temperature, question: str, dataset_name: str, schema: str, api_key: str) -> tuple:
    '''
    Process the question and return predicted answer type and paraphrased question, in that order.

    The model is asked up to three times; a reply is accepted as soon as it
    contains both an 'Answer Type:' line and a 'Paraphrased Question:' line.
    Lines before or between them (blank lines, preamble) are ignored.

    Parameters:
      model_name: model identifier passed to the chat completion call.
      temperature: sampling temperature passed to the chat completion call.
      question: the original natural-language question.
      dataset_name: name of the dataset the question refers to.
      schema: comma-separated list of strings - column name (column data type).
      api_key: unused here; requests go through the module-level 'client'.
               Kept so existing callers keep working.

    Returns:
      (answer_type, paraphrased_question). If no attempt produced a paraphrase
      but one did produce an answer type, the original question is returned in
      place of the paraphrase.

    Raises:
      RuntimeError: no attempt produced an answer type (chained to the last
                    API or parsing error).
    '''
    # Prepare prompt
    prompt = rewrite_prompt + f'''
    Question: {question}
    Dataset: {dataset_name}
    Dataset Table Schema: {schema}
    '''

    type_label = 'Answer Type:'
    question_label = 'Paraphrased Question:'
    max_attempts = 3
    fallback_type = ''   # answer type from a reply that lacked the paraphrase
    last_error = None

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature
            )
            content = response.choices[0].message.content or ''
        except Exception as error:
            # API boundary: remember the failure and try again.
            last_error = error
            continue

        # Locate the two labelled lines wherever they appear in the reply.
        answer_type = ''
        paraphrased_question = ''
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if not answer_type and line.startswith(type_label):
                answer_type = line[len(type_label):].strip()
            elif not paraphrased_question and line.startswith(question_label):
                paraphrased_question = line[len(question_label):].strip()

        if answer_type and paraphrased_question:
            return answer_type, paraphrased_question

        if answer_type:
            fallback_type = answer_type
        last_error = ValueError(f"Unexpected rewrite response: {content!r}")

    if fallback_type:
        return fallback_type, question
    raise RuntimeError(
        f"Could not rewrite the question after {max_attempts} attempts"
    ) from last_error


# Generate Code Function

In [None]:
def _stripCodeFence(text: str) -> str:
    '''Return the code contained in a model reply, without Markdown fences or a language tag.'''
    parts = text.strip().split('```')
    if len(parts) >= 3:
        # Opening and closing fence present: the code sits between them.
        code = parts[1]
    else:
        # No fence, or a single stray one: just drop the fence marker.
        code = ''.join(parts)
    code = code.strip()

    # Drop a leading language tag such as 'python' when it stands on its own line.
    first_line, newline, remainder = code.partition('\n')
    if newline and first_line.strip().lower() in ('python', 'py'):
        code = remainder.strip()
    return code


def generateCode(model_name, temperature, question: str, metaData: dict, api_key: str) -> str:
    '''
    Generate code string for answering the paraphrased question.

    The model is asked up to three times; the first non-empty reply is
    returned after Markdown code fences and a leading language tag have been
    removed.

    Parameters:
      model_name: model identifier passed to the chat completion call.
      temperature: sampling temperature passed to the chat completion call.
      question: the (paraphrased) question to answer.
      metaData: dictionary with
        'dataset_name': str,
        'columns': list[str],
        'answer_type': str
      api_key: unused here; requests go through the module-level 'client'.
               Kept so existing callers keep working.

    Returns:
      The generated source code as a string.

    Raises:
      RuntimeError: every attempt failed or returned an empty reply (chained
                    to the last error).
    '''
    prompt = codegen_prompt + f'''
    1. Dataset name: {metaData['dataset_name']}
    2. Dataset Table Schema: {', '.join(metaData['columns'])}
    3. Question: {question}
    4. Expected Answer Type: {metaData['answer_type']}
    '''

    max_attempts = 3
    last_error = None

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature
            )
            content = response.choices[0].message.content
        except Exception as error:
            # API boundary: remember the failure and try again.
            last_error = error
            continue

        if content and content.strip():
            return _stripCodeFence(content)
        last_error = ValueError("Model returned an empty response")

    raise RuntimeError(
        f"Could not generate code after {max_attempts} attempts"
    ) from last_error


# Generate Answer Function

In [None]:
def getAnswer(model_name, temperature, question:str, datasetMetaData:dict) -> dict:
  '''
  Get final answer along with state information given a question in natural language and a dataset.

  Steps: describe the dataset's columns, ask the model for the answer type and
  a paraphrased question, ask the model for code, execute that code on the
  dataset.

  Parameters:
    model_name: model identifier forwarded to both model calls.
    temperature: sampling temperature forwarded to both model calls.
    question: the original natural-language question.
    datasetMetaData: dictionary with
      'dataset': pandas.DataFrame,
      'dataset_name': str
      The dictionary is not modified.

  Output dictionary:
    'original_question' : str,
    'rewritten_question' : str,
    'code' : str,  (code string)
    'answer_type' : str,  (predicted answer type in a string)
    'output' : Any  (actual answer - data type of this object depends on the code generated and the question, but the data type of the entity this represents is indicated in 'answer_type' entry)

  Raises:
    Nothing is caught here: errors from the model calls, from compiling the
    generated code, or from running it propagate to the caller.
  '''
  frame = datasetMetaData['dataset']
  columns = getDatasetSchema(frame)

  # Work on a copy so the caller's dictionary is left untouched.
  metaData = {**datasetMetaData, 'columns': columns}

  # Send the schema as "name (type), name (type), ..." - the same layout the
  # few-shot examples use and the same one generateCode builds.
  answerType, newQuestion = processQuestion(model_name, temperature, question, metaData['dataset_name'], ', '.join(columns), api_key=api_key)
  metaData['answer_type'] = answerType

  codeString = generateCode(model_name, temperature, newQuestion, metaData, api_key=api_key)
  function = extractFunctionFromString(codeString)

  output = function(frame, columns, newQuestion, answerType)

  return {
      'original_question' : question,
      'rewritten_question' : newQuestion,
      'code' : codeString,
      'answer_type' : answerType,
      'output' : output
  }

# Generate Results

In [None]:
# Initialize results dictionary with correct keys
results = {
    'Question': [],
    'Rewritten Question': [],
    'Code': [],
    'Answer Type': [],  # Matched to response key
    'Generated Answer': [],
    'Expected Answer': []
}

MODEL_NAME = "deepseek-ai/DeepSeek-V3"
TEMPERATURE = 0.5

# Half-open range [start, end) of question rows to process in this run.
start = 305
end = 600

# Never index past the last question: an out-of-range row would otherwise
# raise again inside the error handler and abort the whole run.
stop = min(end, len(ques))

for i in range(start, stop):
    # Start from the values recorded for a failed row; a successful run
    # overwrites the model-dependent fields below.
    row = {
        'Question': ques[i],
        'Rewritten Question': '',
        'Code': '',
        'Answer Type': '',
        'Generated Answer': 'ERROR',
        'Expected Answer': expAns[i]
    }

    try:
        res = getAnswer(
            question=ques[i],
            datasetMetaData={
                'dataset': datasetDict[datasets[i]],
                'dataset_name': datasets[i]
            },
            model_name=MODEL_NAME,
            temperature=TEMPERATURE
        )
    except Exception as e:
        # Keep going: one failing question must not stop the batch.
        print(f"Error processing question {i}: {str(e)}")
    else:
        row['Rewritten Question'] = res.get('rewritten_question', '')
        row['Code'] = res.get('code', '')
        row['Answer Type'] = res.get('answer_type', '')
        row['Generated Answer'] = res.get('output', '-')
        print(f'Finished row {i}')

    # Exactly one entry per column for every processed row.
    for column, value in row.items():
        results[column].append(value)

# Results Storage

In [None]:
resultTable = pd.DataFrame(results)

In [None]:
resultTable.to_csv(f'DeepSeek-V3-{start}-{end - 1}.csv', index=False)

In [None]:
# Print a single QA record, selected by row position, for manual inspection.
sample_index = 99
print(qaDataset[sample_index])