In [None]:
# Dataset links:
#https://www.kaggle.com/competitions/kaggle-llm-science-exam
#https://www.kaggle.com/datasets/jjinho/blingfire-018/
#https://www.kaggle.com/datasets/mbanaei/datasets-wheel/
#https://www.kaggle.com/datasets/tomokihirose/faiss-gpu-173-python310/
#https://www.kaggle.com/datasets/yingpengchen/gte-base-pca/
#https://www.kaggle.com/datasets/yingpengchen/gte-base-pos/
#https://www.kaggle.com/datasets/datafan07/llm-whls/
#https://www.kaggle.com/datasets/inversion/sentence-transformers-222/
#https://www.kaggle.com/datasets/gmhost/sentencetransformer-hubs/versions/2
#https://www.kaggle.com/datasets/gmhost/wiki-2023-index-partition/
#https://www.kaggle.com/datasets/jjinho/wikipedia-2023-07-faiss-index/
#https://www.kaggle.com/datasets/jjinho/wikipedia-20230701/
#https://www.kaggle.com/datasets/gmhost/wikipedia-stem-index/
#https://www.kaggle.com/datasets/gmhost/wikipedia-stem-plaintext

In [None]:
# Copy the `datasets` 2.14.4 wheel into the working directory, then install it from that copy.
!cp /kaggle/input/datasets-wheel/datasets-2.14.4-py3-none-any.whl /kaggle/working
!pip install  /kaggle/working/datasets-2.14.4-py3-none-any.whl

In [None]:
# Install offline dependencies from local wheel files / source trees.
# NOTE: the faiss wheel file name says 1.7.2 although its dataset directory is named "faiss-gpu-173".
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# sentence-transformers is copied to the working directory first and installed from that copy.
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers
!pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl

# transformers / peft / trl: `--no-index` keeps pip away from the package index and
# `--no-deps` installs only the given wheel without resolving its dependencies.
!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

In [None]:
%%writefile backup.py
import warnings
warnings.simplefilter('ignore')
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index, read_VectorTransform

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader


def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame of text pieces with their offsets.

    Every document is first wrapped as a single whole-document section by
    `sectionize_documents`; when `split_sentences` is set, those sections are
    then broken into sentences by `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    sections = sectionize_documents(documents, document_ids, disable_progress_bar)

    # Without sentence splitting the whole-document sections are the final result.
    if not split_sentences:
        return sections

    return sentencize(sections.text.values,
                      sections.document_id.values,
                      sections.offset.values,
                      filter_len,
                      disable_progress_bar)


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap every document as one section spanning its full text.

    No real sectioning is performed: each document yields exactly one row whose
    offset is `(0, len(document))`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
        sorted by `document_id` and `offset` when it is not empty
    """
    paired = tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar)
    rows = [
        {'document_id': doc_id, 'text': doc_text, 'offset': (0, len(doc_text))}
        for doc_id, doc_text in paired
    ]

    frame = pd.DataFrame(rows)
    # An empty frame has no columns to sort on, so it is returned untouched.
    if frame.shape[0] == 0:
        return frame
    return frame.sort_values(['document_id', 'offset']).reset_index(drop=True)


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    Documents that cannot be processed are skipped (best effort); sentences
    already collected from such a document before the failure are kept.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
        (the columns are present even when no sentence was kept)
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents),
                                              disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                start, end = o[0], o[1]
                # Keep only sentences strictly longer than `filter_len` characters.
                if end - start > filter_len:
                    document_sentences.append({
                        'document_id': document_id,
                        'text': document[start:end],
                        # Shift the sentence span by the start of the enclosing section.
                        'offset': (start + offset[0], end + offset[0]),
                    })
        except Exception:
            # Still best effort, but a bare `except:` would also swallow
            # KeyboardInterrupt / SystemExit and make the loop impossible to stop.
            continue
    # Name the columns explicitly so an empty result still exposes them to callers.
    return pd.DataFrame(document_sentences, columns=['document_id', 'text', 'offset'])


def get_contexts():
    """
    Build a retrieval context for every test question and save it to `./test_context.csv`.

    Steps, as implemented below:

    1. Embed each question (prompt plus options A-E) with the gte-base sentence
       transformer, apply the stored PCA transform and fetch 20 hits per question
       from the Wikipedia faiss index.
    2. Load the text of the matched articles from the Wikipedia parquet files and
       split it into sentences with `process_documents`.
    3. Embed those sentences and, per question, concatenate up to
       `NUM_SENTENCES_INCLUDE` sentences returned by a flat faiss search over the
       sentences of that question's articles.

    :return: None. The columns `prompt`, `context`, `A`, `B`, `C`, `D`, `E` are
        written to `./test_context.csv`; a question without any candidate
        sentence gets an empty context.
    """
    SIM_MODEL = '/kaggle/input/sentencetransformer-hubs/gte-base'
    DEVICE = 0
    MAX_LENGTH = 384
    BATCH_SIZE = 16

    WIKI_PATH = "/kaggle/input/wikipedia-20230701"

    # `drop("id", 1)` passed the axis positionally, which pandas 2.x rejects;
    # naming the column axis works on old and new pandas alike.
    trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv").drop(columns="id")

    model = SentenceTransformer(SIM_MODEL, device='cuda')
    model.max_seq_length = MAX_LENGTH
    model = model.half()

    sentence_index = read_index("/kaggle/input/gte-base-pos/wikipedia_gte-base_seq512_title_pos1024_pca.index")

    # The page-level query is the prompt followed by the five options, one per line.
    prompt_embeddings = model.encode(
        trn.apply(lambda row: f"{row['prompt']}\n{row['A']}\n{row['B']}\n{row['C']}\n{row['D']}\n{row['E']}",
                  axis=1).values,
        batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)

    prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
    pca_mat = read_VectorTransform('/kaggle/input/gte-base-pca/gte-base_pca.mat')
    prompt_embeddings = pca_mat.apply_py(prompt_embeddings)
    _ = gc.collect()

    # Get the top 20 pages that are likely to contain the topic of interest
    _, search_index = sentence_index.search(prompt_embeddings, 20)

    # Save memory - delete sentence_index since it is no longer necessary
    del sentence_index
    del prompt_embeddings
    _ = gc.collect()
    libc.malloc_trim(0)

    df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet",
                         columns=['id', 'file'])

    # Get the article and associated file location using the index
    wikipedia_file_data = []

    for i, idx in tqdm(enumerate(search_index), total=len(search_index)):
        _df = df.loc[idx].copy()
        _df['prompt_id'] = i
        wikipedia_file_data.append(_df)
    wikipedia_file_data = pd.concat(wikipedia_file_data).reset_index(drop=True)
    wikipedia_file_data = wikipedia_file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(
        ['file', 'id']).reset_index(drop=True)

    # Save memory - delete df since it is no longer necessary
    del df
    _ = gc.collect()
    libc.malloc_trim(0)

    # Get the full text data
    wiki_text_data = []

    wiki_file_names = wikipedia_file_data.file.unique()
    for file in tqdm(wiki_file_names, total=len(wiki_file_names)):
        # Ids are compared as strings against the `id` column of the parquet file.
        _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file'] == file]['id'].tolist()]
        _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text', 'title'])

        _df_temp = _df[_df['id'].isin(_id)].copy()
        del _df
        _ = gc.collect()
        libc.malloc_trim(0)
        wiki_text_data.append(_df_temp)
    wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
    _ = gc.collect()

    # Parse documents into sentences
    processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)

    # Get embeddings of the wiki text data
    wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                        batch_size=BATCH_SIZE,
                                        device=DEVICE,
                                        show_progress_bar=True,
                                        convert_to_tensor=True,
                                        normalize_embeddings=True)
    wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()

    _ = gc.collect()

    # Combine all answers
    trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)

    # Search using the prompt and answers to guide the search
    trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']

    question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE,
                                       show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
    question_embeddings = question_embeddings.detach().cpu().numpy()

    # Parameter to determine how many relevant sentences to include
    NUM_SENTENCES_INCLUDE = 10

    # List containing just Context
    contexts = []

    for r in tqdm(trn.itertuples(), total=len(trn)):

        prompt_id = r.Index

        # Rows of the sentence table that belong to the articles retrieved for this prompt.
        prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(
            wikipedia_file_data[wikipedia_file_data['prompt_id'] == prompt_id]['id'].values)].index.values

        # Reset per prompt: previously `context` was only assigned inside the `if`,
        # so a prompt without candidates raised NameError (first prompt) or silently
        # reused the previous prompt's context.
        context = ""

        if prompt_indices.shape[0] > 0:
            prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
            prompt_index.add(wiki_data_embeddings[prompt_indices])

            candidate_sentences = processed_wiki_text_data.loc[prompt_indices, 'text']

            # Get the top matches. Only this prompt's embedding is queried; searching
            # every question on every iteration made the loop quadratic for no benefit.
            _, ii = prompt_index.search(question_embeddings[prompt_id:prompt_id + 1], NUM_SENTENCES_INCLUDE)
            for _i in ii[0]:
                # faiss pads the result with -1 when fewer than NUM_SENTENCES_INCLUDE
                # vectors exist; `.iloc[-1]` would wrongly repeat the last sentence.
                if _i < 0:
                    continue
                context += candidate_sentences.iloc[_i] + " "
        contexts.append(context)

    trn['context'] = contexts

    trn[["prompt", "context", "A", "B", "C", "D", "E"]].to_csv("./test_context.csv", index=False)


@dataclass
class DataCollatorForMultipleChoice:
    """
    Collate tokenized multiple-choice examples into a single padded batch.

    Each incoming feature holds one entry per answer choice for every tokenizer
    field plus a `label` (or `labels`) value. The choices of all examples are
    flattened, padded together with `tokenizer.pad`, and reshaped to
    `(batch_size, num_choices, -1)`; the labels become an int64 tensor.
    """
    # Tokenizer whose `pad` method pads the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The remaining fields are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_key = 'label' if 'label' in features[0].keys() else 'labels'
        # Labels are popped (removed from the features) so they are not padded.
        labels = [example.pop(label_key) for example in features]
        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One dict per (example, choice), in example-major order.
        per_choice = [
            {field: values[choice] for field, values in example.items()}
            for example in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            per_choice,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {field: tensor.view(n_examples, n_choices, -1) for field, tensor in padded.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch


def generate_openbook_output():
    """
    Score every question in `./test_context.csv` with the multiple-choice model
    and write the three highest-scoring options to `submission_backup.csv`.

    The CSV is expected to provide the columns `prompt`, `context`, `A`, `B`,
    `C`, `D`, `E`. A dummy answer `'A'` is attached to every row because the
    collator requires a label; it does not influence the predictions.

    :return: None. `submission_backup.csv` receives the columns `id` and
        `prediction` (three space-separated option letters, best first).
    """
    test_df = pd.read_csv("./test_context.csv")
    test_df.index = list(range(len(test_df)))
    test_df['id'] = list(range(len(test_df)))
    # An empty context is stored as an empty CSV field and read back as NaN (a float),
    # which would break the string concatenation in `preprocess`; restore "".
    test_df['context'] = test_df['context'].fillna('')
    test_df['answer'] = 'A'
    model_dir = "/kaggle/input/llm-science-run-context-2"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
    model.eval()

    # Map option letters (A-E) to class indices.
    options = 'ABCDE'
    option_to_index = {option: index for index, option in enumerate(options)}

    def preprocess(example):
        """Tokenize one question as five (context, prompt + option) pairs."""
        first_sentence = [ "[CLS] " + example['context'] ] * 5
        second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
        # Special tokens are written by hand above, hence add_special_tokens=False;
        # only the context side is truncated when a pair exceeds max_length.
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first', 
                                      max_length=1536, add_special_tokens=False)
        tokenized_example['label'] = option_to_index[example['answer']]

        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df[['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer']]).map(preprocess, remove_columns=['prompt', 'context','A', 'B', 'C', 'D', 'E', 'answer'])
    # Drop the index column that `from_pandas` adds for the explicit list index set above.
    tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
    test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator,num_workers=2, pin_memory=False,)

    test_predictions = []
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            outputs = model(**batch)
        test_predictions.append(outputs.logits.cpu().detach())

    test_predictions = torch.cat(test_predictions)

    # Sort options by descending logit for every question.
    predictions_as_ids = np.argsort(-test_predictions, 1)

    predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]

    test_df['prediction'] = [
        ' '.join(row) for row in predictions_as_answer_letters[:, :3]
    ]

    submission = test_df[['id', 'prediction']]
    submission.to_csv('submission_backup.csv', index=False)
    
# Script entry point: build ./test_context.csv first, then score it and write submission_backup.csv.
get_contexts()
generate_openbook_output()

In [None]:
# Run the backup pipeline written to backup.py above; it writes submission_backup.csv.
!python backup.py

In [None]:
import pandas as pd
# Predictions written by backup.py; used further down as the fallback for low-confidence questions.
backup_model_predictions = pd.read_csv("submission_backup.csv")

In [None]:
%%writefile parsed_context_search.py
import warnings
warnings.simplefilter('ignore')
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index, read_VectorTransform

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

class CFG:
    # Configuration constants for this retrieval script.
    # Path passed to SentenceTransformer.
    EMB_MODEL = "/kaggle/input/sentencetransformer-hubs/gte-base"
    # faiss index file loaded with read_index.
    INDEX_PATH = "/kaggle/input/wikipedia-stem-index/parsed_gte-base.index"
    # Parquet file whose `text` column supplies the context passages.
    WIKI_PLAINTEXT_PATH = "/kaggle/input/wikipedia-stem-plaintext/parsed.parquet"
    
    # Assigned to model.max_seq_length.
    MAX_LENGTH = 512
    # Batch size used by model.encode.
    BATCH_SIZE = 32
    # Number of hits requested from the index per question.
    MAX_DOC_NUM = 10
    
    # When True the script reads the STEM validation CSV instead of the competition test set.
    DEBUG = False

# Pick the question file: a validation set in debug mode, the competition test set otherwise.
questions_path = ("/kaggle/input/llm-stem-validationset/STEM_valid.csv" if CFG.DEBUG
                  else "/kaggle/input/kaggle-llm-science-exam/test.csv")
trn = pd.read_csv(questions_path)

# Missing cells become the literal string 'None' so every field can be used as text.
trn = trn.fillna('None')


def _join_options(row):
    """Concatenate the five answer options of one question, separated by spaces."""
    return " ".join(str(row[option]) for option in ['A', 'B', 'C', 'D', 'E'])


trn['answer_all'] = trn.apply(_join_options, axis=1)
# The search query is the prompt three times in a row, followed by all options.
prompt_text = trn['prompt']
trn['prompt_answer_stem'] = prompt_text + " " + prompt_text + " " + prompt_text + " " + trn['answer_all']

model = SentenceTransformer(CFG.EMB_MODEL, device='cuda')
model.max_seq_length = CFG.MAX_LENGTH

prompt_embeddings = model.encode(
    trn.prompt_answer_stem.values,
    batch_size=CFG.BATCH_SIZE,
    device=0,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()

# Load the index on the CPU, then move it to GPU 0 for searching.
res = faiss.StandardGpuResources()
sentence_index = read_index(CFG.INDEX_PATH)
sentence_index_gpu = faiss.index_cpu_to_gpu(res, 0, sentence_index)

# Query the index in chunks of QUERY_SIZE questions.
QUERY_SIZE = 8
search_score = []
search_index = []
for start in tqdm(range(0, prompt_embeddings.shape[0], QUERY_SIZE)):
    batch_scores, batch_ids = sentence_index_gpu.search(prompt_embeddings[start:start + QUERY_SIZE],
                                                        CFG.MAX_DOC_NUM)
    search_score.append(batch_scores)
    search_index.append(batch_ids)

search_score = np.concatenate(search_score)
search_index = np.concatenate(search_index)

# Save memory - the indexes and the query embeddings are no longer needed.
del sentence_index
del sentence_index_gpu
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)

df = pd.read_parquet(CFG.WIKI_PLAINTEXT_PATH, columns=['text'])

# One context per question: the texts of its hits joined by single spaces.
contexts = [
    " ".join(df.loc[hit_ids].text.tolist())
    for hit_ids in tqdm(search_index, total=len(search_score))
]

# Save memory - the passage table is no longer needed.
del df
_ = gc.collect()
libc.malloc_trim(0)

trn['parsed_context'] = contexts

save_cols = ["prompt", "parsed_context", "A", "B", "C", "D", "E"]
trn[save_cols].to_csv("./test_context_part1.csv", index=False)

In [None]:
%%time
# Run the retrieval script written by the previous cell; it writes ./test_context_part1.csv.
!python parsed_context_search.py

In [None]:
%%writefile cohere_context_search.py
import warnings
warnings.simplefilter('ignore')
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index, read_VectorTransform

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

class CFG:
    # Configuration constants for this retrieval script.
    # Path passed to SentenceTransformer.
    EMB_MODEL = "/kaggle/input/sentencetransformer-hubs/gte-large"
    # faiss index file loaded with read_index.
    INDEX_PATH = "/kaggle/input/wikipedia-stem-pca-index/parsed_gte-large.index"
    # Parquet file whose `text` column supplies the context passages.
    WIKI_PLAINTEXT_PATH = "/kaggle/input/wikipedia-stem-plaintext/parsed.parquet"
    
    # Assigned to model.max_seq_length.
    MAX_LENGTH = 512
    # Batch size used by model.encode.
    BATCH_SIZE = 16
    # Number of hits requested from the index per question.
    MAX_DOC_NUM = 10
    
    # NOTE(review): not referenced anywhere in the visible part of this script.
    DEBUG = False

# Continue from the first retrieval pass, which already carries `parsed_context`.
trn = pd.read_csv("./test_context_part1.csv")

# Missing cells become the literal string 'None' so every field can be used as text.
trn = trn.fillna('None')


def _join_options(row):
    """Concatenate the five answer options of one question, separated by spaces."""
    return " ".join(str(row[option]) for option in ['A', 'B', 'C', 'D', 'E'])


trn['answer_all'] = trn.apply(_join_options, axis=1)
# The search query is the prompt three times in a row, followed by all options.
prompt_text = trn['prompt']
trn['prompt_answer_stem'] = prompt_text + " " + prompt_text + " " + prompt_text + " " + trn['answer_all']

model = SentenceTransformer(CFG.EMB_MODEL, device='cuda')
model.max_seq_length = CFG.MAX_LENGTH

prompt_embeddings = model.encode(
    trn.prompt_answer_stem.values,
    batch_size=CFG.BATCH_SIZE,
    device=0,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()

# Apply the stored PCA transform to the query embeddings before searching the index.
pca_mat = read_VectorTransform('/kaggle/input/wikipedia-stem-pca-index/parsed_gte-large_pca.mat')
prompt_embeddings = pca_mat.apply_py(prompt_embeddings)

# Load the index on the CPU, then move it to GPU 0 for searching.
res = faiss.StandardGpuResources()
sentence_index = read_index(CFG.INDEX_PATH)
sentence_index_gpu = faiss.index_cpu_to_gpu(res, 0, sentence_index)

# Query the index in chunks of QUERY_SIZE questions.
QUERY_SIZE = 8
search_score = []
search_index = []
for start in tqdm(range(0, prompt_embeddings.shape[0], QUERY_SIZE)):
    batch_scores, batch_ids = sentence_index_gpu.search(prompt_embeddings[start:start + QUERY_SIZE],
                                                        CFG.MAX_DOC_NUM)
    search_score.append(batch_scores)
    search_index.append(batch_ids)

search_score = np.concatenate(search_score)
search_index = np.concatenate(search_index)

# Save memory - the indexes and the query embeddings are no longer needed.
del sentence_index
del sentence_index_gpu
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)

df = pd.read_parquet(CFG.WIKI_PLAINTEXT_PATH, columns=['text'])

# One context per question: the texts of its hits joined by single spaces.
contexts = [
    " ".join(df.loc[hit_ids].text.tolist())
    for hit_ids in tqdm(search_index, total=len(search_score))
]

# Save memory - the passage table is no longer needed.
del df
_ = gc.collect()
libc.malloc_trim(0)

trn['cohere_context'] = contexts

save_cols = ["prompt", 'parsed_context', "cohere_context", "A", "B", "C", "D", "E"]
trn[save_cols].to_csv("./test_context.csv", index=False)

In [None]:
# Run the second retrieval script; it reads ./test_context_part1.csv and writes ./test_context.csv.
!python cohere_context_search.py

In [None]:
#%%time 
import warnings
warnings.simplefilter('ignore')
from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# Checkpoint directory holding both the tokenizer and the multiple-choice model.
PRETAIN = "/kaggle/input/llm-deberta-models/hf_gte_base_context/checkpoint-35500/"
# Token budget per (context, prompt + option) pair, used by `preprocess`.
max_length = 1536

# keep_default_na=False keeps text cells exactly as written: an empty context stays ""
# and a literal 'None' stays a string instead of being parsed as a missing value
# (NaN), which would break the string concatenation in `preprocess`.
test = pd.read_csv("test_context.csv", keep_default_na=False)
test.index = list(range(len(test)))
# Item assignment is required here: `test.id = ...` only sets a Python attribute
# when no `id` column exists yet, so the column was never created.
test['id'] = list(range(len(test)))
# Dummy label; the collator needs one but it does not affect the logits.
test['answer'] = 'A'

tokenizer = AutoTokenizer.from_pretrained(PRETAIN)
model = AutoModelForMultipleChoice.from_pretrained(PRETAIN).cuda()
# Run inference in half precision.
model = model.half()
model.eval();

def predictions_to_map_output(predictions):
    """
    Convert a 2-D array of per-option scores into submission strings.

    :param predictions: array with one row per question and one score per option
    :return: 1-D array of strings, each holding the three best option letters
        (highest score first) separated by spaces
    """
    # Negating the scores makes argsort rank options from best to worst.
    best_three = np.argsort(-predictions)[:, :3]
    to_letter = np.vectorize(index_to_option.get)
    return np.apply_along_axis(' '.join, 1, to_letter(best_three))

# Lookup tables between option letters (A-E) and their class indices (0-4), in both directions.
options = 'ABCDE'
indices = list(range(len(options)))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))
def preprocess(example):
    """
    Tokenize one question as five (context, prompt + option) pairs.

    :param example: mapping with the keys `context`, `prompt`, `A`-`E` and `answer`
    :return: tokenizer output for the five pairs with an added `label` entry
        holding the index of `example['answer']`
    """
    # The same context is paired with each of the five options.
    context_side = ["[CLS] " + example['context']] * 5
    question_side = []
    for option in 'ABCDE':
        question_side.append(" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]")

    # Special tokens are written by hand above, hence add_special_tokens=False;
    # only the context side is truncated when a pair exceeds max_length.
    encoded = tokenizer(context_side, question_side, truncation='only_first',
                        max_length=max_length, add_special_tokens=False)
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """
    Collate tokenized multiple-choice examples into a single padded batch.

    Each incoming feature holds one entry per answer choice for every tokenizer
    field plus a `label` (or `labels`) value. The choices of all examples are
    flattened, padded together with `tokenizer.pad`, and reshaped to
    `(batch_size, num_choices, -1)`; the labels become an int64 tensor.
    """
    # Tokenizer whose `pad` method pads the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The remaining fields are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_key = 'label' if 'label' in features[0].keys() else 'labels'
        # Labels are popped (removed from the features) so they are not padded.
        labels = [example.pop(label_key) for example in features]
        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One dict per (example, choice), in example-major order.
        per_choice = [
            {field: values[choice] for field, values in example.items()}
            for example in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            per_choice,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {field: tensor.view(n_examples, n_choices, -1) for field, tensor in padded.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
    
# Columns handed to `preprocess`; all of them are dropped again after tokenization.
model_input_columns = ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer']

# Run the same inference pass once per retrieved context column.
logits_by_context = {}
for context_column in ('parsed_context', 'cohere_context'):
    test_df = test[['prompt', context_column, 'A', 'B', 'C', 'D', 'E', 'answer']].rename(
        columns={context_column: "context"})
    tokenized_test_dataset = Dataset.from_pandas(test_df[model_input_columns]).map(
        preprocess, remove_columns=model_input_columns)
    # Drop the index column that `from_pandas` adds for the explicit list index of `test`.
    tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
    test_dataloader = DataLoader(
        tokenized_test_dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=data_collator,
        num_workers=2,
        pin_memory=False,
    )

    test_predictions = []
    for batch in tqdm(test_dataloader):
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            outputs = model(**batch)
        test_predictions.append(outputs.logits.cpu().detach())

    test_predictions = torch.cat(test_predictions)
    logits_by_context[context_column] = test_predictions.numpy()

# Logits obtained with the `parsed_context` and `cohere_context` columns respectively.
predictions1 = logits_by_context['parsed_context']
predictions2 = logits_by_context['cohere_context']

In [None]:
# Cast both logit arrays to float32 before they are turned into probabilities.
predictions1, predictions2 = (logits.astype('float32') for logits in (predictions1, predictions2))

In [None]:
# Minimum averaged top probability required to trust the two-context ensemble.
CONFIDENCE_THRESHOLD = 0.4
option_letters = np.array(list("ABCDE"))

predictions = []
submit_ids = []

for index in tqdm(range(len(test))):
    # Average the softmax probabilities obtained with the two contexts.
    probs_parsed = torch.softmax(torch.tensor(predictions1[index]), dim=-1)
    probs_cohere = torch.softmax(torch.tensor(predictions2[index]), dim=-1)
    mean_probs = (probs_parsed + probs_cohere) / 2

    if mean_probs.max() > CONFIDENCE_THRESHOLD:
        # Three most probable letters, most probable first.
        ranked = option_letters[np.argsort(mean_probs)][-3:].tolist()[::-1]
    else:
        # Low confidence: fall back to the backup model's answer. With the spaces
        # removed the string is joined character by character below.
        ranked = backup_model_predictions.iloc[index].prediction.replace(" ", "")
    predictions.append(" ".join(ranked))

In [None]:
# Final submission: one row per question with a running id and the space-separated top-3 letters.
submission = pd.DataFrame({'id':range(len(test)),'prediction':predictions})
submission.to_csv('submission.csv', index=False)

import numpy as np
def precision_at_k(r, k):
    """
    Precision at k.

    :param r: sequence of relevance flags (anything `int()` accepts), best-ranked first
    :param k: number of leading entries to consider; must satisfy 0 != k <= len(r)
    :return: share of the first `k` entries that are relevant
    """
    assert k <= len(r)
    assert k != 0
    hits = 0
    for flag in r[:k]:
        hits += int(flag)
    return hits / k

def MAP_at_3(predictions, true_items):
    """
    Score is mean average precision at 3.

    :param predictions: indexable collection of space-separated answer strings
    :param true_items: indexable collection with the correct answer per question
    :return: sum over questions of precision@k at every correct position among
        the first three predictions, divided by the number of questions
    """
    U = len(predictions)
    total = 0.0
    for u in range(U):
        user_preds = predictions[u].split()
        user_true = true_items[u]
        user_results = [int(item == user_true) for item in user_preds]
        # Only the first three predictions of a question can contribute.
        for k, hit in enumerate(user_results[:3]):
            total += precision_at_k(user_results, k + 1) * hit
    return total / U

# With exactly 200 rows the submission is scored against the answers in train.csv.
if len(submission) == 200:
    train = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')
    preds = submission['prediction'].tolist()
    print(MAP_at_3(preds, train["answer"]))