In [1]:
# credits:
# https://www.kaggle.com/code/olyatsimboy/aimo-openmath-mistral-baseline
# https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
# https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

# Zero-shot MMOS-DeepSeekMath-7B with self-consistency and generated code reasoning evaluation

Self-consistency replaces standard greedy decoding in reasoning pipelines: several diverse answers are sampled and then aggregated, e.g., by taking the most common answer ([SC-CoT paper](https://arxiv.org/pdf/2203.11171.pdf)).

In this kernel, we use the RL-tuned MMOS-DeepSeekMath-7B backbone. In my experiments, this model produces more consistent code reasoning, and executing the generated code blocks lets us reduce arithmetic hallucinations.

In [2]:
# Install the bitsandbytes 0.42.0 wheel from a local Kaggle input directory
# (no network access needed); -U upgrades any existing install, -qq quiets pip.
!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    AutoConfig,
    set_seed
)

# Seed the random number generators so sampled generations are repeatable.
set_seed(42)

# Local Kaggle dataset directory the config, tokenizer and weights are read from.
MODEL_PATH = "/kaggle/input/deepseek-math"

# 4-bit NF4 quantization settings (bfloat16 compute, double quantization).
# NOTE: this object is currently unused -- the `quantization_config=` argument
# of the `from_pretrained` call below is commented out.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model configuration and switch on its gradient-checkpointing flag.
# NOTE(review): gradient checkpointing is a training-time memory trade-off;
# presumably it has no effect on the inference-only use below -- confirm.
config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the causal LM, letting the library choose device placement and dtype.
# NOTE(review): trust_remote_code=True permits running Python code shipped with
# the checkpoint -- only acceptable because MODEL_PATH is a trusted local dataset.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
#     quantization_config=quantization_config,
    config=config
)

In [None]:
# Show which dtype the weights ended up in after loading with torch_dtype="auto".
model.dtype

# Load Datasets

In [None]:
import pandas as pd


In [None]:
# Data source: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize/data

# Load the training data
train_data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')

# Load the test data
test_data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')

# Stack the train and test rows into a single frame with a fresh 0..n-1 index
olympiad_data = pd.concat([train_data, test_data], ignore_index=True)



In [None]:
# Clean data: drop every row that has a missing value in any column.
# DataFrame.dropna returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the cleaning to take effect.
olympiad_data = olympiad_data.dropna()
# View
olympiad_data.head()


In [None]:
# Load the AIME 1983-2024 problem set. The file is already tabular, so no
# regex parsing is needed.
aime_data = pd.read_csv('/kaggle/input/aime-problem-set-1983-2024/AIME_Dataset_1983_2024.csv')
# Fix structure of columns: remove Year, Problem Number and Part
aime_data = aime_data.drop(['Year', 'Problem Number', 'Part'], axis=1)
# Drop rows with missing values. dropna returns a new frame, so assign it back
# (the bare call previously discarded the result and cleaned nothing).
aime_data = aime_data.dropna()
# Align column names with the competition data: ID -> id, Question -> problem,
# Answer -> answer
aime_data = aime_data.rename(columns={'ID': 'id','Question': 'problem', 'Answer': 'answer'})
# View
aime_data.head()


In [None]:
# Load the parsed Art of Problem Solving problem set.
amio_data = pd.read_csv('/kaggle/input/amio-parsed-art-of-problem-solving-website/parsed_ArtOfProblemSolving.csv')

# Contest families to exclude, matched as substrings of the 'link' column.
patt_to_remove = ['AHSME', 'AJHSME', 'USOMO', 'USAMO', 'USAJMO', 'USOJMO']

# Boolean mask where True indicates that a row should be deleted.
# na=False makes rows with a missing link evaluate to False (kept); without it
# the mask would contain NaN and the `~mask` inversion below would raise.
mask = amio_data['link'].str.contains('|'.join(patt_to_remove), na=False)

# Invert the mask to keep rows that do not contain any of the patterns
amio_data = amio_data[~mask]

# 'amio_data' now holds only rows whose 'link' matches none of the patterns;
# print the surviving links as a sanity check.
unique_links = amio_data['link'].unique()
print(unique_links)
# Fix structure of columns: remove link and letter
amio_data = amio_data.drop(['link', 'letter'], axis=1)
# Rename problem_id to id to match the other data sets
amio_data = amio_data.rename(columns={'problem_id': 'id'})
# Drop rows with missing values. dropna returns a new frame, so assign it back
# (the bare call previously discarded the result and cleaned nothing).
amio_data = amio_data.dropna()
# View
amio_data.head()

In [None]:
# Merge the three problem sources into one frame with a fresh 0..n-1 index.
# (A fourth frame, `amio_24_data`, used to be listed here but is never defined
# anywhere in the notebook, so the cell raised NameError on a fresh kernel.)
combined_data = pd.concat([olympiad_data, aime_data, amio_data], ignore_index=True)
print(f'Length beofre cleaning: {len(combined_data)}')
# Prioritize the rows that have 'solution' filled out: rows with a missing
# solution are sorted to the end so that keep='first' below prefers the others.
combined_data_sorted = combined_data.sort_values(by='solution', ascending=False, na_position='last')
# Drop duplicate problem statements, keeping the first (solution-bearing) copy
df = combined_data_sorted.drop_duplicates(subset=['problem'], keep='first')
print(f'Length after cleaning: {len(df)}')

# Boolean indexing to filter rows where 'solution' column is not empty
non_empty_solution_rows = df[df['solution'].notnull()]
print(f"Length of those with a 'solution' value: {len(non_empty_solution_rows)}")

# Boolean indexing to filter rows where 'solution' column is empty
empty_solution_rows = df[df['solution'].isnull()]

# Display the number of rows without a solution
print(f"Length of those with no 'solution' value: {len(empty_solution_rows)}")

# Now 'empty_solution_rows' contains only rows where 'solution' column is empty


In [None]:
# Run-mode switch read by later cells. When False, the notebook builds
# `iter_test` locally from `df`, samples 2 generations per problem and scores
# itself against train.csv; when True, it samples 8 generations per problem and
# sends each answer through `env.predict`.
PRIVATE = False

In [None]:
from tqdm import tqdm

In [None]:
if not PRIVATE:
    # After sort_values + drop_duplicates the index labels of `df` are shuffled
    # and have gaps, so a label lookup such as df['id'][i] either raises
    # KeyError or walks the rows in a different order than the frame. Renumber
    # the rows 0..n-1 (this also yields a fresh frame that is safe to modify)
    # and read them by position.
    df = df.reset_index(drop=True)
    # Placeholder answer column; the model's prediction is filled in per problem later.
    df['answer'] = 0
    # One (problem frame, answer frame) pair per row; each frame has a single
    # row at index 0.
    iter_test = []
    for i in range(len(df)):
        row_id = df['id'].iloc[i]
        p = pd.DataFrame(data={'id': [row_id], 'problem': [df['problem'].iloc[i]]}, index=[0])
        a = pd.DataFrame(data={'id': [row_id], 'answer': [df['answer'].iloc[i]]}, index=[0])
        iter_test.append((p, a))

In [None]:
import gc
# Target device name.
# NOTE(review): no other cell visible in this notebook reads `device` -- confirm
# whether it is still needed.
device = 'cuda'

In [None]:
def naive_parse(answer):
    """Return the last contiguous run of ASCII digits in `answer`.

    The text is examined from the end: trailing non-digit characters are
    skipped, then digits are collected until the first non-digit is met.
    An empty string is returned when `answer` contains no digit at all.
    """
    digits = '0123456789'
    chars = list(answer)

    # Step back over everything that follows the final digit.
    stop = len(chars)
    while stop > 0 and chars[stop - 1] not in digits:
        stop -= 1

    # Extend leftwards for as long as the characters stay digits.
    begin = stop
    while begin > 0 and chars[begin - 1] in digits:
        begin -= 1

    return ''.join(chars[begin:stop])

In [None]:
import transformers

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE: this binds the notebook-level name `pipeline`, which the inference loop
# later calls directly.
# NOTE(review): `torch_dtype` and `device_map` are passed together with an
# already-instantiated model; presumably they have no effect because the model
# was placed and typed by `from_pretrained` -- confirm.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map="auto",
)

In [None]:
# Record the installed transformers version in the notebook output.
print(f"Transformers Version: {transformers.__version__}")

In [None]:
import torch

# Turn off PyTorch's memory-efficient scaled-dot-product-attention backend.
# NOTE(review): the notebook does not say why; presumably a workaround for a
# kernel problem on the Kaggle GPUs -- confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)

In [None]:
def extract_code(text):
    """Merge every closed ```python fenced block of `text` into one script.

    Only text that follows a "```python" marker and is terminated by a closing
    "```" counts as code. Whatever precedes the first marker is prose and is
    always ignored (previously that prose was mistaken for a code block when
    it happened to contain some other fence, e.g. a plain "```" block).

    All blocks except the last are filtered line by line: indented lines and
    lines starting with `import` or `def ` are always kept, any other line is
    kept only if it does not contain `print(`. This removes top-level prints
    from intermediate blocks so that only the final block produces output.
    The last block is appended unchanged.

    Args:
        text: raw model output (str).

    Returns:
        The combined source code, stripped of surrounding whitespace, or ""
        when `text` contains no closed python block.
    """
    fence = "```"
    # Segment 0 is the text before the first python fence: never code.
    segments = text.split("```python")[1:]
    # A segment without a closing fence is an unterminated block; skip it.
    blocks = [segment.split(fence, 1)[0].strip() for segment in segments if fence in segment]
    blocks = [block for block in blocks if block]
    if not blocks:
        return ""

    kept = []
    for block in blocks[:-1]:
        for line in block.split("\n"):
            structural = line.startswith(("    ", "import", "def "))
            if structural or 'print(' not in line:
                kept.append(line)

    return ("\n".join(kept) + "\n" + blocks[-1]).strip()

In [None]:
import re
import sys
import subprocess


def process_output(output):
    r"""Turn one raw model generation into two candidate answers.

    Two independent extractions are attempted. A failure in either one is
    printed and mapped to -1 instead of being raised.

    * Code answer: the Python returned by ``extract_code`` is patched so that
      every ``symbols(...)`` call carries ``real=True``, wrapped in a
      ``try``/``except`` that star-imports sympy, written to ``code.py`` in the
      working directory and run under a 10 second ``timeout``. Whatever the
      script prints is evaluated, rounded and reduced modulo 1000.
    * Text answer: the contents of the last ``\boxed{...}`` in the text or,
      when there is none, the last run of digits found by ``naive_parse``,
      evaluated, rounded and reduced modulo 1000.

    Side effects: prints diagnostics to stdout, overwrites ``code.py`` and
    spawns a subprocess.

    Args:
        output: the generated text (str).

    Returns:
        Tuple ``(result_output, code_output)``: the text answer and the code
        answer, each an int in ``[0, 999]`` or ``-1`` on failure.
    """
    result = output
    print(output)
    try:
        code = extract_code(output)

        def repl(match):
            # Append real=True to a symbols(...) call unless it already mentions "real".
            if "real" not in match.group():
                return "{}{}".format(match.group()[:-1], ', real=True)')
            else:
                return "{}{}".format(match.group()[:-1], ')')
        code = re.sub(r"symbols\([^)]+\)", repl, code)

        # Indent the script by one level so it can sit inside the try block below.
        code = code.replace('\n', '\n    ')
        # Add a try...except block so a crashing script prints FAIL instead of a traceback
        code = "\ntry:\n    from sympy import *\n    {}\nexcept Exception as e:\n    print(e)\n    print('FAIL')\n".format(code)

        with open('code.py', 'w') as fout:
            fout.write(code)

        # `timeout 10` kills runaway scripts; the non-zero exit status then
        # makes check_output raise, which is handled below.
        batcmd = 'timeout 10 ' + sys.executable + ' code.py'
        try:
            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
            print(shell_output)
            # SECURITY(review): eval() runs on the stdout of model-generated
            # code. Tolerable only because this runs in a throw-away sandbox.
            code_output = round(float(eval(shell_output))) % 1000
        except Exception as ee:
            print('Run failed ',ee)
            code_output = -1

        print('CODE RESULTS', code_output)

    except Exception as e:
        print(e)
        print('ERROR PARSING')
        code_output = -1

    try:
        # Non-greedy: stop at the first closing brace. The greedy `.*` ran on
        # to the last `}` of the line, so "\boxed{52} ... {x}" captured
        # "52} ... {x" and the answer was lost.
        result_output = re.findall(r'\\boxed\{(.*?)\}', result)

        print('BOXED', result_output)
        if not len(result_output):
            # No boxed answer: fall back to the last number in the text.
            result_output = naive_parse(result)
        else:
            result_output = result_output[-1]

        print('BOXED', result_output)
        if not len(result_output):
            result_output = -1

        else:
            # SECURITY(review): eval() on model-generated text; see note above.
            result_output = round(float(eval(result_output))) % 1000

    except Exception as e:
        print(e)
        print('ERROR PARSING')
        result_output = -1

    return result_output, code_output

In [None]:
import numpy as np
from collections import Counter
import re
from collections import defaultdict

# tool_instruction = "The problem is written in LaTeX format where it uses Established mathematical notation and sometimes Fractional part notation."
# tool_instruction = " The answer should be given as a non-negative modulo 1000."
# tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
# tool_instruction += "If the problem includes combinatorial enumeration,  m^0  represents the number of ways to choose zero objects from a set of m objects. therefore m^0 = 1 for all integers m (including 0)"

code_instruction = "Below is a math problem you are to solve (positive numerical answer):\n\""
code_instruction2 = """To accomplish this, first determine a sympy-based approach for solving the problem by listing each step to take and what functions need to be called in each step. Be clear so even an idiot can follow your instructions, and remember, your final answer should be positive integer, not an algebraic expression!
Write the entire script covering all the steps (use comments and document it well) and print the result. After solving the problem, output the final numerical answer within \\boxed{}.

                        Approach:"""
# Number of sampled generations per problem (self-consistency votes).
n_repetitions = 8 if PRIVATE else 2

# Per-problem vote lists, in iter_test order: answers parsed from the text
# (total_results) and answers obtained by running the code (total_answers).
total_results = []
total_answers = []

for test, submission in iter_test:
    problem = test['problem'][0]
    messages = [
        {
            "role": "user",
            # code_instruction ends with an opening quote; close it and start a
            # new line so the problem statement does not run straight into the
            # instructions that follow.
            "content": code_instruction + problem + '"\n' + code_instruction2
        }
    ]

    # NOTE(review): add_generation_prompt is left at its default here --
    # confirm whether the model's chat template needs it to open the
    # assistant turn.
    query_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )

    results = []
    answers = []

    for _ in tqdm(range(n_repetitions)):
        try:
            raw_output = pipeline(
                query_prompt,
                max_new_tokens=2048,
                do_sample=True,
                temperature=0.8964,
                return_full_text=False
            )
            raw_output = raw_output[0]['generated_text']
            result_output, code_output = process_output(raw_output)

        except Exception as e:
            # Best effort: a failed generation (e.g. out of memory) counts as
            # "no answer" for this repetition instead of stopping the run.
            print(e)
            result_output, code_output = -1, -1

        finally:
            # Release GPU memory after every attempt, including failed ones,
            # which are exactly the ones most likely to have exhausted it.
            torch.cuda.empty_cache()
            gc.collect()

        results.append(result_output)
        answers.append(code_output)

    # Pool text and code votes, discarding the -1 failure markers.
    combined = [vote for vote in results + answers if vote != -1]
    if combined:
        pred = Counter(combined).most_common(2)
        # Take the most common vote; fall back to the runner-up only if the
        # winner were negative and a runner-up exists.
        ans = pred[0][0] if not pred[0][0] < 0 or len(pred) < 2 else pred[1][0]
    else:
        ans = -1
    submission['answer'] = ans
    if PRIVATE:
        # NOTE(review): `env` is not defined in any cell visible in this
        # notebook; it has to be created (competition API) before running with
        # PRIVATE = True.
        env.predict(submission)

    total_results.append(results)
    total_answers.append(answers)

In [None]:
import numpy as np
from collections import Counter

# One final answer per problem, in the same order as total_answers.
final_answers = []

for code_votes, text_votes in zip(total_answers, total_results):
    code_votes = np.array(code_votes)
    text_votes = np.array(text_votes)

    # Where running the code failed (negative marker), substitute the answer
    # parsed from the text of the same generation.
    failed = code_votes < 0
    code_votes[failed] = text_votes[failed]

    ranked = Counter(code_votes.tolist()).most_common(2)

    # Prefer the most frequent vote; only when it is a failure marker and a
    # runner-up exists is the runner-up used instead.
    ans = ranked[0][0]
    if ans < 0 and len(ranked) >= 2:
        ans = ranked[1][0]

    final_answers.append(ans)
    print(ans)


In [None]:
# df[['id','answer']].to_csv("submission.csv", header=True, index=False)

In [None]:
if not PRIVATE:
    # Ground truth: the competition's labelled training problems.
    df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')

    # `final_answers` holds one prediction per problem in `iter_test`, which
    # covers the whole combined data set, not just train.csv. Assigning the
    # list to the train frame directly fails on the length mismatch, so attach
    # the predictions by matching on the problem text instead.
    predictions = pd.DataFrame({
        'problem': [test['problem'][0] for test, _ in iter_test],
        'model_answer': final_answers,
    }).drop_duplicates(subset='problem')
    # Left join keeps every train row; problems that were never predicted get
    # NaN and therefore count as a mismatch.
    df = df.merge(predictions, on='problem', how='left')

    df['match'] = df.answer == df.model_answer
    print(f'{df.match.sum()} matches in {len(df)} examples')
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so display the comparison table explicitly.
    display(df)