# Load model

In [1]:
!pip install /kaggle/input/bitsandbytez/accelerate-0.28.0-py3-none-any.whl
!pip install /kaggle/input/bitsandbytez/bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl
!pip install peft
!pip install git+https://github.com/jiaweizzhao/GaLore

Processing /kaggle/input/bitsandbytez/accelerate-0.28.0-py3-none-any.whl
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.29.3
    Uninstalling accelerate-0.29.3:
      Successfully uninstalled accelerate-0.29.3
Successfully installed accelerate-0.28.0
Processing /kaggle/input/bitsandbytez/bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.0
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0
Collecting git+https://github.com/jiaweizzhao/GaLore
  Cloning https://github.com/jiaweizzhao/GaLore to /tmp/pip-req-build-rk_pvr8e
  Running command git

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    AutoConfig,
    set_seed
)

set_seed(42)
MODEL_PATH = "/kaggle/input/gemma-2b-it"
config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side="right")


model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    config=config
)

2024-05-07 13:24:22.226439: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 13:24:22.226560: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 13:24:22.386535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load Datasets

In [3]:
import pandas as pd

In [4]:
# clean this
amio_data = pd.read_csv('/kaggle/input/amio-parsed-art-of-problem-solving-website/parsed_ArtOfProblemSolving.csv', index_col=False)
amio_data.head()

patt_to_remove = ['AHSME', 'AJHSME', 'USOMO', 'USAMO', 'USAJMO', 'USOJMO']

# Create a boolean mask where True indicates that a row should be deleted
mask = amio_data['link'].str.contains('|'.join(patt_to_remove))

# Invert the mask to keep rows that do not contain any of the patterns
amio_data = amio_data[~mask]

# Now, 'amio_24_data' contains only the rows where 'link' doesn't include the specified patterns
unique_links = amio_data['link'].unique()
print(unique_links)
#Fix structure of columns
#rm link, letter
amio_data = amio_data.drop(['link', 'letter'], axis=1)
#change problem_id to id
amio_data.rename(columns={'problem_id': 'id'}, inplace=True)
#drop na
amio_data.dropna()
#View
amio_data.head()

['https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_1'
 'https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_2'
 'https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_3'
 ...
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_13'
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_14'
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_15']


Unnamed: 0,id,problem,solution,answer
0,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...","We can rewrite the expression as \[222,222-(22...",2.0
1,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...","222,222-22,222 = 200,000\n200,000 - 2,222 = 19...",2.0
2,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...","We only care about the unit's digits.\nThus, $...",2.0
3,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...",We just take the units digit of each and subtr...,2.0
4,085955dda8dfb374689b3f216b54d785,What is the value of this expression in decima...,We see that $\frac{44}{11}$ is $4$ $\frac{110}...,6.54


In [5]:
combined_data = amio_data
print(f'Length before cleaning: {len(combined_data)}')
# Prioritize the rows that have 'solution' filled out 
combined_data_sorted = combined_data.sort_values(by='solution', ascending=False, na_position='last')
# Drop duplicates
df = combined_data_sorted.drop_duplicates(subset=['problem'], keep='first')
print(f'Length after cleaning{len(df)}')

Length before cleaning: 7879
Length after cleaning2656


In [6]:
!pip install datasets



In [7]:
from datasets import Dataset
f = Dataset.from_pandas(df)
ds = f.train_test_split(test_size=0.2)

In [8]:
ds = ds.filter(lambda x: max([len(x["problem"]),len(x["solution"])])<=720)

Filter:   0%|          | 0/2124 [00:00<?, ? examples/s]

Filter:   0%|          | 0/532 [00:00<?, ? examples/s]

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
        num_rows: 1246
    })
    test: Dataset({
        features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
        num_rows: 320
    })
})

# Tokenize

In [10]:
batch_size=6

In [11]:
def tokenize_function(example):
    return tokenizer(text=example["problem"], text_target=example["solution"], padding="max_length",max_length=400)

In [12]:
from transformers import DataCollatorWithPadding
ds = ds.map(tokenize_function, batched=True, remove_columns=ds["train"].column_names)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/1246 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

In [13]:
max([len(ds["train"]["input_ids"][i]) for i in range(len(ds))])

400

In [14]:
max([len(ds["train"]["labels"][i]) for i in range(len(ds))])

400

In [15]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    ds["train"], shuffle=True, collate_fn=data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(ds["test"], collate_fn=data_collator, batch_size=batch_size, pin_memory=True)

# Define hyperparameter

In [16]:
lr = 1e-5
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)

# Fine-tuned gemma-2b

In [17]:
from tqdm.auto import tqdm

In [18]:
import gc
device = 'cuda'

In [19]:
!pip install /kaggle/input/galore-torch/galore_torch-1.0-py3-none-any.whl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing /kaggle/input/galore-torch/galore_torch-1.0-py3-none-any.whl
galore-torch is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [20]:
from peft import LoraConfig

In [21]:
lora_config = LoraConfig(
    bias="none",
    target_modules="all-linear",
    init_lora_weights=True,
    task_type="CAUSAL_LM",
)

In [22]:
model.add_adapter(lora_config)

In [23]:
from galore_torch import GaLoreAdamW8bit
# define param groups as galore_params and non_galore_params
optimizer = GaLoreAdamW8bit(model.parameters(), lr=lr)

In [24]:
from transformers import get_scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [25]:
from accelerate import Accelerator
accelerater = Accelerator()
accelerater.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

(<accelerate.data_loader.DataLoaderShard at 0x790c922d6470>,
 <accelerate.data_loader.DataLoaderShard at 0x790c922d7490>,
 GemmaForCausalLM(
   (model): GemmaModel(
     (embed_tokens): Embedding(256000, 2048, padding_idx=0)
     (layers): ModuleList(
       (0-17): 18 x GemmaDecoderLayer(
         (self_attn): GemmaSdpaAttention(
           (q_proj): lora.Linear4bit(
             (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
             (lora_dropout): ModuleDict(
               (default): Identity()
             )
             (lora_A): ModuleDict(
               (default): Linear(in_features=2048, out_features=8, bias=False)
             )
             (lora_B): ModuleDict(
               (default): Linear(in_features=8, out_features=2048, bias=False)
             )
             (lora_embedding_A): ParameterDict()
             (lora_embedding_B): ParameterDict()
           )
           (k_proj): lora.Linear4bit(
             (base_layer): Linear4bit(in_f

In [26]:
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerater.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/208 [00:00<?, ?it/s]

In [27]:
model.save_pretrained("/kaggle/working/")



# Inference

In [None]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map="auto",
)

In [None]:
import re
from collections import defaultdict


tool_instruction = " The answer should be given as a non-negative modulo 1000."
tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'


n_repetitions = 1
temperature = 0.8964

total_results = []
total_answers = []


for i in tqdm(range(len(df))):
    id_ = df['id'].loc[i]
    problem = df['problem'].loc[i]

    messages = [
        {
            "role": "user", 
            "content": problem + tool_instruction
        }
    ]
    
    query_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )
    
    results = []
    answers = []
     
    
    for _ in tqdm(range(n_repetitions)):
        try:
            raw_output = pipeline(
                query_prompt, 
                max_new_tokens=2048, 
                do_sample=True, 
                temperature=temperature,
                return_full_text=False
            )
            raw_output = raw_output[0]['generated_text']

            result_output, code_output = process_output(raw_output)

            torch.cuda.empty_cache()
            gc.collect()

        except Exception as e:
            print(e)
            result_output, code_output = -1, -1
        
        results.append(result_output)
        answers.append(code_output)
    
    total_results.append(results)
    total_answers.append(answers)

In [None]:
import numpy as np
from collections import Counter

df['leng'] = df['problem'].astype(str).map(len)
df['orig_index'] = df.index.values
df = df.sort_values(by=['leng', 'id']).reset_index(drop=True)
df['enumerates'] = range(0, len(df))
df = df.sort_values('orig_index').reset_index(drop=True)

enumerate_i = 0
final_answers = []
for a, b in zip(total_answers, total_results):
    a = np.array(a)
    b = np.array(b)
    a[a < 0] = b[a < 0]
    pred = Counter(a.tolist()).most_common(2)
    pred = pred + [(-1,0)]
    val_previously, freq_previously = pred[0]
    for val, freq in pred[1:]: 
        if freq == freq_previously:
            val_previously = min(val_previously,val )
    enumerates = df.enumerates.values[enumerate_i]
    ans = val_previously if not val_previously < 0 else pred[1][0]
    enumerate_i+= 1    
    final_answers.append(ans)
    print(ans)

In [None]:
df['answer'] = final_answers

In [None]:
df[['id','answer']].to_csv("submission.csv", header=True, index=False)