In [106]:
# Environment bootstrap: make API keys available to the notebook.
import pandas as pd
import os
os.getcwd()  # result is discarded (not the cell's last expression), so nothing is displayed

from dotenv import load_dotenv
load_dotenv()  # python-dotenv: reads key=value pairs from a .env file into the process environment

# Snapshot each key into a module-level name. os.getenv returns None when the
# variable is not set at this point.
# NOTE(review): these snapshots are taken before `_get_env` (defined further
# down in this cell) gets a chance to prompt for missing keys, so they can be
# None even though os.environ is populated afterwards. No visible cell reads
# these names.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')



HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')



import getpass
import os

def _get_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

# Prompt interactively for any key that is still missing or empty in os.environ.
_get_env("OPENAI_API_KEY")
_get_env("LANGCHAIN_API_KEY")
_get_env("HUGGINGFACE_API_KEY")

# Presumably switches on LangSmith tracing for LangChain calls — confirm against the LangChain docs.
os.environ["LANGCHAIN_TRACING_V2"] = "true"


from langchain_core.output_parsers import StrOutputParser
# NOTE(review): `parser` is not referenced by any cell visible in this notebook.
parser = StrOutputParser()




In [90]:

from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Single chat-model client shared by the prompt helpers in this notebook
# (`get_answer` and `reevaluate` both call `llm.invoke`).
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.5, max_tokens=1500)

In [91]:
# Smoke test: one real round-trip to the model; the returned message is displayed as the cell output.
llm.invoke("test")

AIMessage(content='Test received! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 8, 'total_tokens': 18}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_507c9469a1', 'finish_reason': 'stop', 'logprobs': None}, id='run-376e2d3a-ca2d-4dc8-9a10-5344e1f4c64d-0', usage_metadata={'input_tokens': 8, 'output_tokens': 10, 'total_tokens': 18})

In [92]:
def get_answer(question):
    """Send a math problem to the shared ``llm`` and return its reply text.

    Args:
        question: Problem statement, inserted verbatim into the prompt.

    Returns:
        The ``content`` of the model's reply with surrounding whitespace
        stripped.
    """
    # Assembled line by line; each piece ends with the newline it contributes.
    message = (
        "Please solve the following math problem:\n"
        "\n"
        f"{question}\n"
        "\n"
        "Please give the answer in float or integer numerical form.\n"
    )
    return llm.invoke(message).content.strip()

In [93]:
def reevaluate(question, llm_reasoning, numerical_answer, correct_answer=None):
    """Ask the shared ``llm`` to grade an earlier answer and restate the result.

    The prompt presents the question, the earlier reasoning, and the number
    that was extracted from it, and asks the model to end its reply with a
    "final answer" line carrying the correct(ed) number.

    Args:
        question: Problem statement.
        llm_reasoning: Text of the first-pass answer being graded.
        numerical_answer: Number extracted from ``llm_reasoning``; it is
            interpolated as-is, so ``None`` appears as the text "None".
        correct_answer: Optional reference answer. When not ``None`` it is
            appended to the prompt together with a caveat that it may be wrong.

    Returns:
        The ``content`` of the model's reply with surrounding whitespace
        stripped.
    """
    prompt = f"""Imagine you're a high school teacher grading exam questions. The following question was answered by a student.
    {question}
    This was the answer given by the student:
    {llm_reasoning},
    So this was the answer marked as final by our automatic answer detection system:
    {numerical_answer}"""

    if correct_answer is not None:
        prompt += f"\n    and this is the actual correct answer by the solutions book (which might sometimes contain errors):\n    {correct_answer}"

    # Raw string on purpose: the instructions quote a regex whose backslash
    # sequences (\s, \d, \.) are invalid escapes in an ordinary literal and
    # raise a SyntaxWarning on recent Python versions. The text sent to the
    # model is unchanged.
    prompt += r"""
    
    Please re-evaluate his answer and grade his exam, and tell whether the question was correct or not, and also whether our numerical_answer was read correctly. 
    In any case, please write "final answer" (the correct(ed) answer in numerical form, readable by the regex [:\s]*([+-]?\d*\.?\d+)) as your last line. Please use floating numbers, so NOT LaTeX notation, but floating numbers, or integers. 
    """

    response = llm.invoke(prompt)
    return response.content.strip()


  prompt += """


In [94]:
import re

# Number token shared by both strategies: optional sign, optional integer
# part, optional decimal point, then at least one digit.
_NUMBER_PATTERN = r'[+-]?\d*\.?\d+'
# A cue phrase followed by optional colons/whitespace and then a number.
_CUED_ANSWER_RE = re.compile(
    r'(?:final answer|the answer is)[:\s]*(' + _NUMBER_PATTERN + r')',
    re.IGNORECASE,
)
_ANY_NUMBER_RE = re.compile(_NUMBER_PATTERN)


def extract_numerical_answer(text):
    """Pull the final numeric answer out of free-form model output.

    Strategy:
      1. Use the LAST occurrence of "final answer" / "the answer is"
         (case-insensitive) that is directly followed by a number. A reply may
         quote an earlier "final answer" before stating the corrected one, so
         the last occurrence is the one that counts.
      2. Otherwise fall back to the last number found anywhere in the text.

    Args:
        text: Model output. ``None`` or an empty string yields ``None``.

    Returns:
        The answer as a ``float``, or ``None`` when no number is present.
    """
    if not text:
        return None

    cued = _CUED_ANSWER_RE.findall(text)
    if cued:
        return float(cued[-1])

    # No explicit cue: take the last number in the text.
    numbers = _ANY_NUMBER_RE.findall(text)
    return float(numbers[-1]) if numbers else None

In [95]:
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

# Shared thread pool for the batch runs; at most 6 tasks execute concurrently.
# NOTE(review): no visible cell shuts this pool down.
pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool``, with a progress bar.

    Args:
        pool: Executor whose ``submit(fn, arg)`` returns a future
            (e.g. a ``ThreadPoolExecutor``).
        seq: Any iterable of inputs. It is materialised once up front, so
            generators and other unsized iterables are accepted.
        f: One-argument callable run for each element.

    Returns:
        A list with one result per element, in input order.

    Raises:
        Whatever ``f`` raised for the first failing element (in input order);
        it propagates from ``future.result()``.
    """
    # Snapshot the input so the bar gets a correct total even when the caller
    # passes something without len().
    items = list(seq)

    with tqdm(total=len(items)) as progress:

        def _tick(_finished_future):
            # Invoked as each task completes, which may be out of input order.
            progress.update()

        futures = []
        for el in items:
            future = pool.submit(f, el)
            future.add_done_callback(_tick)
            futures.append(future)

        # Collect in submission order so results line up with the inputs.
        return [future.result() for future in futures]

In [96]:
def process_row(row):
    """Run the two-stage solve-then-grade pipeline for one problem record.

    Stage 1 asks the model for a solution (``get_answer``) and extracts a
    number from it. Stage 2 has the model grade that solution (``reevaluate``)
    and the number extracted from the grading text becomes the final answer.

    Args:
        row: Dict with ``'problem_id'`` and ``'problem_text'``; an optional
            ``'answer'`` entry is treated as the reference answer.

    Returns:
        Dict with keys ``problem_id``, ``problem_text``, ``llm_reasoning``,
        ``reevaluation`` and ``answer``, plus ``correct_answer`` when the row
        supplied a non-``None`` ``'answer'``.
    """
    pid = row['problem_id']
    question = row['problem_text']
    # Absent key -> None, which `reevaluate` treats as "no reference answer".
    expected = row.get("answer", None)

    # Stage 1: first-pass solution and the number read from it.
    first_pass = get_answer(question)
    first_number = extract_numerical_answer(first_pass)

    # Stage 2: grading pass; its extracted number supersedes the first one.
    review = reevaluate(question, first_pass, first_number, expected)
    final_number = extract_numerical_answer(review)

    record = dict(
        problem_id=pid,
        problem_text=question,
        llm_reasoning=first_pass,
        reevaluation=review,
        answer=final_number,
    )
    if expected is not None:
        record["correct_answer"] = expected
    return record


In [97]:
# Load the training set. Later cells read `df_train` and `rows`, so this cell
# has to execute for the notebook to run on a fresh kernel.
df_train = pd.read_csv('data/train.csv')

# Record dicts for the first five problems, used by the small trial run.
rows = df_train.head().to_dict(orient='records')
# process_row(rows[0])  # optional single-row smoke test (makes two LLM calls)
df_train.head()

In [98]:
# ## UNCHECKED
# Load the unchecked training set. The unchecked batch run and its scoring
# cell read `df_train_unchecked`, so this cell has to execute for the notebook
# to run on a fresh kernel.
df_train_unchecked = pd.read_csv('data/train_unchecked.csv')

# Record dicts for the first five unchecked problems.
rows_unchecked = df_train_unchecked.head().to_dict(orient='records')
# process_row(rows_unchecked[0])  # optional single-row smoke test (makes two LLM calls)
df_train_unchecked.head()

In [99]:
# Trial run of the pipeline. `rows` is not assigned in this cell; it is
# expected to be the list of record dicts for the first five training problems.
results = map_progress(pool, rows, process_row)
df_results = pd.DataFrame(results)
df_results

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,problem_id,problem_text,llm_reasoning,reevaluation,answer,correct_answer
0,2374,Find the value of the expression $\dfrac{17}{5...,To solve the expression \(\dfrac{17}{5} :\dfra...,The student's solution to the expression \(\df...,1.6,1.6
1,4723,"In a company of 30 people, 25 use the social n...","To solve the problem, we can use the principle...",Let's evaluate the student's answer step by st...,24.0,24.0
2,7135,The number of road traffic accidents (RTAs) in...,Let the number of road traffic accidents in th...,The student's answer is correct in terms of th...,32.0,32.0
3,5814,Find the value of the expression $\dfrac{2\str...,To solve the expression \n\n\[\n\dfrac{2^{-5} ...,The student's answer is correct. They correctl...,256.0,256.0
4,9237,A traveler from Moscow wants to visit four cit...,"To visit all four cities (Vladimir, Yaroslavl,...",The student's answer correctly analyzed the ro...,26.0,53.0


In [100]:
from scorer import score
# score(df_train.head(), df_results, 'problem_id')

In [101]:
def prepare_prompts_and_get_answers(df):
    """Run ``process_row`` over every row of ``df`` and tabulate the results.

    Rows are converted to record dicts and dispatched through ``map_progress``
    on the notebook-level ``pool``.

    Args:
        df: DataFrame with one problem per row.

    Returns:
        A new DataFrame built from the list of dicts returned by
        ``process_row``, in row order.
    """
    records = df.to_dict(orient='records')
    return pd.DataFrame(map_progress(pool, records, process_row))


In [102]:
# Full pipeline run over the training set. `df_train` is not assigned in this
# cell and must already be loaded.
df_train_results = prepare_prompts_and_get_answers(df_train)
df_train_results.head()

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,problem_id,problem_text,llm_reasoning,reevaluation,answer,correct_answer
0,2374,Find the value of the expression $\dfrac{17}{5...,To solve the expression \(\dfrac{17}{5} : \dfr...,The student provided a thorough and correct so...,1.6,1.6
1,4723,"In a company of 30 people, 25 use the social n...",Let's analyze the problem step by step using t...,Let's evaluate the student's answer step by st...,24.0,24.0
2,7135,The number of road traffic accidents (RTAs) in...,Let the number of road traffic accidents in th...,The student's answer is well-structured and co...,32.0,32.0
3,5814,Find the value of the expression $\dfrac{2\str...,To solve the expression \(\dfrac{2^{-5} \cdot ...,The student provided a thorough and correct so...,256.0,256.0
4,9237,A traveler from Moscow wants to visit four cit...,To find a combination of routes that allows th...,The student's answer contains several inaccura...,53.0,53.0


In [103]:
# Full pipeline run over the unchecked training set. `df_train_unchecked` is
# not assigned in this cell and must already be loaded.
df_train_unchecked_results = prepare_prompts_and_get_answers(df_train_unchecked)
df_train_unchecked_results.head()

  0%|          | 0/300 [00:00<?, ?it/s]

Unnamed: 0,problem_id,problem_text,llm_reasoning,reevaluation,answer,correct_answer
0,9625,Find a four-digit number that is 14 times smal...,Let the four-digit number be \( N \). Accordin...,The student's approach to solving the problem ...,6076.0,1568.0
1,1064,Find the root of the equation $\log _{\frac{1}...,To solve the equation \n\n\[\n\log_{\frac{1}{2...,The student's solution is well-structured and ...,7.25,7.25
2,8511,The airplane's navigation system informs the p...,"To convert the altitude from feet to meters, w...",The student's answer is correct in its calcula...,11590.0,11590.0
3,9692,Each of the four inequalities in the left colu...,To solve the inequalities and match them with ...,Let's evaluate the student's responses to the ...,2143.0,2143.0
4,221,"The city budget amounts to 84 million rubles, ...",To find out how many million rubles were spent...,The student's answer is correct. They correctl...,37.8,37.8


In [104]:
# Compare predictions with the training labels, joined on 'problem_id'.
# NOTE(review): `score` comes from the project-local `scorer` module; presumably it returns the fraction of matching answers — confirm there.
score(df_train, df_train_results, 'problem_id')

0.86

In [105]:
# Same scoring call for the unchecked training set.
# NOTE(review): `score` comes from the project-local `scorer` module; its exact metric is not visible here — confirm there.
score(df_train_unchecked, df_train_unchecked_results, 'problem_id')

0.8866666666666667

In [80]:
# Build the submission file from the test set.
# NOTE(review): in the saved notebook this cell's execution count (80) is lower
# than that of the cells above it, so the saved CSV may come from an older
# version of the pipeline. Re-run after Restart & Run All.
df_test = pd.read_csv('data/test.csv')

# Runs the two-stage pipeline (solve, then grade) on every test row.
df_test_results = prepare_prompts_and_get_answers(df_test)

# Keep only the id and the final extracted number. `answer` can be missing
# when no number could be extracted from the grading text.
submission = df_test_results[['problem_id', 'answer']]
submission.to_csv('starter_notebook_submission.csv', index=False)

  0%|          | 0/100 [00:00<?, ?it/s]