In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import os
from tqdm import tqdm
import copy


# Mean pooling: average token embeddings, weighting each token by its attention mask
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: indexable model output whose element 0 holds the
            per-token embeddings.
        attention_mask: mask tensor; it is given a trailing axis and expanded
            to the size of the token embeddings before use.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the mask total over that dimension (clamped to at least
        1e-9 so a fully masked sequence does not divide by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts

In [None]:
# Sentence-embedding checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = 'sentence-transformers/roberta-large-nli-stsb-mean-tokens'
# Use the GPU when one is present, otherwise fall back to CPU so the notebook
# still runs (an unconditional .cuda() raises on machines without CUDA).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

In [None]:
# Fill in both paths before running. Each file is read as tab-separated UTF-8
# text and must contain a 'question' column.
cot_pool_path = ""          # pool the demonstrations are later picked from
cot_remain_pool_path = ""   # pool of questions that still need a prompt
df_finished, df_unfinished = (
    pd.read_table(pool_path, sep='\t', encoding='utf-8')
    for pool_path in (cot_pool_path, cot_remain_pool_path)
)
# Plain Python lists of question texts, in file order.
questions_finished = list(df_finished['question'])
questions_unfinished = list(df_unfinished['question'])

In [None]:
EMBED_BATCH_SIZE = 64   # questions encoded per forward pass
SIM_CHUNK_SIZE = 128    # finished-pool columns scored per similarity step


def embed_questions(questions, batch_size=EMBED_BATCH_SIZE):
    """Encode questions into mean-pooled sentence embeddings.

    Args:
        questions: non-empty list of question strings.
        batch_size: number of questions tokenized and encoded together.

    Returns:
        A tensor with one row per question, located on ``model.device``.

    Raises:
        ValueError: if ``questions`` is empty (there is nothing to concatenate).
    """
    if len(questions) == 0:
        raise ValueError("embed_questions() received an empty question list")
    chunks = []
    for start in range(0, len(questions), batch_size):
        batch = questions[start: start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        inputs = inputs.to(model.device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)
            chunks.append(mean_pooling(outputs, inputs['attention_mask']))
    return torch.cat(chunks, dim=0)


embed_finished = embed_questions(questions_finished)
embed_unfinished = embed_questions(questions_unfinished)

# Cosine similarity as a dot product of L2-normalised rows. This avoids the
# (n_unfinished, chunk, hidden) intermediate that a broadcasted
# torch.cosine_similarity call would allocate for every chunk.
unit_finished = torch.nn.functional.normalize(embed_finished, dim=-1)
unit_unfinished = torch.nn.functional.normalize(embed_unfinished, dim=-1)

similarity = []
for start in tqdm(range(0, unit_finished.size(0), SIM_CHUNK_SIZE)):
    sim = unit_unfinished @ unit_finished[start: start + SIM_CHUNK_SIZE].T
    # Move each chunk to CPU so the full similarity matrix is not held on the GPU.
    similarity.append(sim.cpu())
# Rows: unfinished questions; columns: finished questions.
similarity = torch.cat(similarity, dim=-1)
# For every unfinished question, the position of its most similar finished question.
indexs = torch.argmax(similarity, dim=-1)

In [None]:
# Outer template for one request: the instruction text, one worked
# demonstration, then the question and hint to be answered.
prompt = (
    '''Please follow the example to generate your answers (For actions, only use 'Question', 'Multi_Answer_Question' and 'Finish'. Only use special questions when doing action 'Question' and 'Multi_Answer_Question'. Generate your inference steps until action 'Finish'):\n\n{demonstration}\n\nQuestion {question}\nHint {hint}'''
)
# Template for the demonstration itself: question, hint and its reasoning text.
prompt_demon = '''Question {question}\nHint {hint}\n{cot}'''

In [None]:
# Build one few-shot prompt per unfinished question, using its most similar
# finished question as the demonstration.
instances = []
for i, q in enumerate(questions_unfinished):
    most_sim = int(indexs[i])
    # `most_sim` and `i` are positions in the question lists, so select rows
    # positionally (.iloc) rather than by index label (.loc).
    dict_most_sim = dict(df_finished.iloc[most_sim])
    # NOTE(review): eval() executes whatever text is stored in these columns.
    # Only run this on files you trust; if the cells are plain Python literals,
    # ast.literal_eval would be the safe replacement.
    ground_truth = eval(dict_most_sim['ground_truth'])
    this_hint = {"answer": ground_truth['ground_answer'], "composition_answer": ground_truth['composition_answer']}
    this_prompt = prompt_demon.format(question=dict_most_sim['question'], hint=this_hint, cot=dict_most_sim['gpt_out'])
    all_info = dict(df_unfinished.iloc[i])
    hint = {"answer": eval(all_info['ground_answer']), "composition_answer": eval(all_info['compositional_answer'])}
    instance = prompt.format(demonstration=this_prompt, question=q, hint=hint)
    # Normalise line breaks: first turn literal backslash-n sequences into real
    # newlines, then escape every newline, so each prompt ends up on one line.
    instance = instance.replace("\\n", "\n")
    instance = instance.replace("\n", "\\n")
    instances.append(instance)

In [None]:
# Destination CSV for the generated prompts; replace the placeholder before running.
save_path = '''Your Save Path'''
# Single-column table: one prompt per row under the 'question' header.
df_to_gpt = pd.DataFrame({'question': instances})
# 'utf-8' is the valid codec name; the previous 'ut-8' raised LookupError on write.
df_to_gpt.to_csv(save_path, index=False, encoding='utf-8')