In [1]:
!pip install transformers datasets huggingface_hub torch tqdm psutil sentencepiece evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import numpy as np # linear algebra
import pandas as pd
import json
import os
import evaluate
from typing import List, Dict, Any
import torch
from tqdm import tqdm
from torch.utils import data

class Dataset(object):
    def __init__(
        self,
        dataset_filepath: str,
    ):
        self.dataset = []
        self.dataset = pd.read_csv(dataset_filepath).to_dict('records')
        for dp in self.dataset:
            if not dp['answer_choices'] or dp['answer_choices'] != dp['answer_choices']:
                del dp['answer_choices']
        print(self.dataset[0])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

def collate_fn(batch_of_datapoints: List[Dict]) -> Dict[Any, List]:
    """
    Convert a batch of datapoints into a datapoint that is batched. This is meant to override the default collate function in pytorch and specifically can handle when the value is a list

    Args:
        batch_ofDatapoints:

    Returns:

    """
    datapoint_batched = {}
    for datapoint in batch_of_datapoints:
        # Gather together all the values per key
        for key, value in datapoint.items():
            if key in datapoint_batched:
                datapoint_batched[key].append(value)
            else:
                datapoint_batched[key] = [value]
    return datapoint_batched

In [3]:
def convert_dict_of_lists_to_list_of_dicts(dict_of_lists: Dict[Any, List]) -> List[Dict]:
    """
    Args:
        dict_of_lists:

    Returns:
        list_ofDict
    """
    list_of_dicts = []
    for datapoint_values in zip(*dict_of_lists.values()):
        list_of_dicts.append(dict(zip(dict_of_lists, datapoint_values)))
    return list_of_dicts

In [4]:
def preprocess(test_df):
    res_df = []
    for each in test_df:
        options = []
        for opt in each["options"]:
            if opt == "N/A":
                continue
            options.append(opt)
        each["options"] = options
        res_df.append(each)
    return res_df

In [5]:
def format_cot_example(example, including_answer=True):
    prompt = "Question:\n"
    question = example["question"]
    options = example["options"]
    prompt += question + "\n"
    prompt += "Options:\n"
    for i, opt in enumerate(options):
        prompt += "{}. {}\n".format(choices[i], opt)
    if including_answer:
        cot_content = example["cot_content"].replace("A: Let's think step by step.",
                                                     "Answer: Let's think step by step.")
        prompt += cot_content + "\n\n"
    else:
        prompt += "Answer: Let's think step by step."
    return prompt

In [6]:
import ast
def format_curr(example):
    prompt = ""
    question = example["input"][0][:-8]
    options = ast.literal_eval(example["answer_choices"][0])
    prompt += question + "\n"
    prompt += "Options:\n"
    for i, opt in enumerate(options):
        prompt += "{}. {}\n".format(choices[i], opt)
    prompt += "Answer: Let's think step by step."
    return prompt

In [7]:
def extract_final(text):
    pattern = r"\b[A-J]\b(?!.*\b[A-J]\b)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(0)
    else:
        return 'A'

In [8]:
def extract_again(text):
    match = re.search(r'.*[aA]nswer:\s*([A-J])', text)
    if match:
        return match.group(1)
    else:
        return extract_final(text)

In [9]:
import re
def extract_answer(text):
    pattern = r"answer is \(?([A-J])\)?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        print("1st answer extract failed\n" + text)
        return extract_again(text)

In [10]:
choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
def ge_instruct(val_df, count):
  prompt = "The following are multiple choice questions. Think step by" \
             " step and then output the answer in the format of \"The answer is (X)\" at the end.\n\n"

  num = 0
  res = []
  stored_cat = set()
  for each in val_df:
    category = each["category"]
    if category not in stored_cat:
      stored_cat.add(category)
      num += 1
      res.append(each)
    if num == count:
      break
  for example in res:
    prompt += format_cot_example(example, including_answer=True)
  return prompt

In [11]:
def generate_my_prompt(prompt, curr):
  prompt += format_curr(curr)
  return prompt

In [12]:
def slerp_generate(text_input,max_new_tokens):
    system_prompt='In this task you are given a question. You need to generate an answer to the question. \n Input:Who was the man behind The Chipmunks? \nOutput: David Seville.\nInput: Question:On 2 November 2010, the oil painting ""Nude Sitting on a Divan"" sold for $68.9 million, a record for an artwork by which artist? Output:\n'
    messages = [
    {"role": "user", "content": system_prompt},
    {"role": "assistant", "content": "Amedeo Modigliani"},
    {"role": "user", "content":text_input},
    ]
    #messages = [ 
    #{"role": "system", "content": "You are a helpful AI assistant."}, 
   #{"role": "User", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 
   #{"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, 
    #{"role": "user", "content": text_input}, 
#] 
 
  #  messages = [
  # {"role": "user", "content":text_input},
 #]
    text_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
    )
    inputs = tokenizer([text_input],  add_special_tokens = False,  return_tensors ='pt' ).to(device)
    outputs = model.generate(**inputs, #input_ids=inputs.to(device),
                                   max_new_tokens=max_new_tokens,
                                   #temperature=0.3, #value used to modulate the next token probabilities.
                                   #temperature=1,#, top_k=50, top_p=0.95,
                                   #num_beams=1, top_k = 50,
                                   do_sample= False,
                                   pad_token_id=tokenizer.pad_token_id,eos_token_id=tokenizer.eos_token_id,
                                   #top_p =0.95,#eos_token_id=[2,],
                                   #num_return_sequences = 1, repetition_penalty=1,
                                  )
    outputs=outputs[:, inputs["input_ids"].shape[1]:]
    output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
    return output

In [13]:
def slerp_multichoice(text_input,max_new_tokens):
    messages = [
    {"role": "system", "content": """The following are multiple choice questions. Think step by step and then output the answer in the format of "The answer is (X)" at the end."""},
    {"role": "User", "content":text_input},
]
    #text_input = tokenizer.apply_chat_template(
    #        messages,
    #        tokenize=False,
    #       add_generation_prompt=True
    #)
    #print(text_input)
    inputs = tokenizer(text_input,  add_special_tokens = False,  return_tensors ='pt' ).to(device)
    outputs = model.generate(**inputs, #input_ids=inputs.to(device),
                                   max_new_tokens=max_new_tokens,
                                   #temperature=1, #value used to modulate the next token probabilities.
                                   #num_beams=1, top_k = 50,
                                   do_sample = False,
                                   pad_token_id=tokenizer.pad_token_id,eos_token_id=tokenizer.eos_token_id,
                                   #top_p =0.9,eos_token_id=[2,],
                                   #num_return_sequences = 256, repetition_penalty=1,
                                  )
    outputs=outputs[:, inputs["input_ids"].shape[1]:]
    output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
    print(output)
    pred = extract_answer(output[0])
    return pred

In [14]:
dataset_filepath = "/kaggle/input/llm-merging-competition/test.csv" #TODO
data_loader = data.DataLoader(
        Dataset(dataset_filepath),
        batch_size=1,
        num_workers=0,
        shuffle=False,
        collate_fn=collate_fn
    )

{'id': 0, 'input': 'Write an ad copy for a new product, a digital photo frame that connects to your social media accounts and displays your photos. Respond with at most 150 words.', 'eval_type': 'generation'}


In [15]:
instruct = """
The following are multiple choice questions. Think step by step and then output the answer in the format of "The answer is (X)" at the end.Question:\nThe symmetric group $S_n$ has $\n\\factorial{n}$ elements, hence it is not true that $S_{10}$ has 10 elements.\nFind the characteristic of the ring 2Z.\nOptions:\nA. 0\nB. 30\nC. 3\nD. 10\nE. 12\nF. 50\nG. 2\nH. 100\nI. 20\nJ. 5\nAnswer: Let\'s think step by step. A characteristic of a ring is R is $n$ if the statement $ka = 0$ for all $a\\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$ for all $a\\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$. The answer is (A).\n\nQuestion:\nWhich of the following is the body cavity that contains the pituitary gland?\nOptions:\nA. Ventral\nB. Dorsal\nC. Buccal\nD. Thoracic\nE. Pericardial\nF. Abdominal\nG. Spinal\nH. Pelvic\nI. Pleural\nJ. Cranial\nAnswer: Let\'s think step by step. We refer to Wikipedia articles on anatomy for help. Let’s solve this problem step by step. The pituitary gland is the major endocrine gland attached to the base of the brain, and it is contained in the Cranial cavity. The answer is (J).\n\nQuestion:\nSay the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?\nOptions:\nA. 1000 times more\nB. 50 times more\nC. 5000 times more\nD. 500 times more\nE. 10000 times more\nF. 20000 times more\nG. 2000 times more\nH. 100 times more\nI. 10 times more\nAnswer: Let\'s think step by step. The amount of light is proportional to the aperture area $A = \\pi D^2/4$ for a lens with diameter $D$, so the relative amounts of light between the eye with diameter 5mm and the telescope with diameter 50mm is $(50 cm)^2/(5mm)^2 = 10000$. The answer is (E).\n\n
"""

In [16]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)
model_id = "catrinbaze/llama-refueled-merge"
#The model I merged myself using MergeKit only employed the SLERP method. 
#Both parent models used for the merge were released before May 31. They are directly loaded from Huggingface, and the detailed merging method is described below. 
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
assert torch.cuda.is_available(), "This model needs a GPU to run ..."
device = torch.cuda.current_device()
model = model.to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

AssertionError: This model needs a GPU to run ...

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Updated to use CPU as fallback
model.to(device)
model.eval()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
all_batches = []
with torch.no_grad():
        for batch in tqdm(data_loader):
                eval_type = batch["eval_type"][0]
                if eval_type == "generation":
                        input_text = batch['input'][0]
                        #print(input_text)
                        output = slerp_generate(input_text,512)
                        batch["prediction"] = output
                        #print(output)
                        torch.cuda.empty_cache()
                        all_batches.extend(convert_dict_of_lists_to_list_of_dicts(batch))
                else:
                        #break
                        assert eval_type == "multiple_choice"
                        input_text = generate_my_prompt(instruct, batch)
                        #print(input_text)
                        output = slerp_multichoice(input_text,512)
                        #print(output)
                        batch["prediction"] = [output[0]]
                        #print(batch["prediction"])
                        torch.cuda.empty_cache()
                        all_batches.extend(convert_dict_of_lists_to_list_of_dicts(batch))

In [None]:
choice_to_number = {choice: str(index) for index, choice in enumerate(choices)}
choice_to_number

In [None]:
dataset_predictions = all_batches
dp_df = pd.DataFrame(dataset_predictions)
dp_df["dummy_field"] = 0
dp_df.loc[dp_df['eval_type'] == 'multiple_choice', 'prediction'] = dp_df['prediction'].map(choice_to_number)
#fn = os.path.basename(dataset_filepath)
dp_df.to_csv("/kaggle/working/submission.csv", columns=["id", "prediction", "dummy_field"], index=False, encoding='utf-8-sig', errors='replace')

# merging with Mergekit

In [None]:
!git clone https://github.com/cg123/mergekit.git
!cd mergekit && pip install -q -e .

In [None]:
import yaml

MODEL_NAME = "llama3_refuled_orca_slerp"
yaml_config = """
slices:
  - sources:
      - model: NousResearch/Meta-Llama-3-8B-Instruct
        layer_range: [0, 32]
      - model: refuelai/Llama-3-Refueled
        layer_range: [0, 32]
merge_method: slerp
base_model: refuelai/Llama-3-Refueled
parameters:
  t:
    - filter: self_attn
      value: [0, 0.5, 0.3, 0.7, 1]
    - filter: mlp
      value: [1, 0.5, 0.7, 0.3, 0]
    - value: 0.5
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config)

In [None]:
# Merge models
!mergekit-yaml config.yaml merge --copy-tokenizer --allow-crimes --out-shard-size 1B --lazy-unpickle