In [5]:
#Code for sequence-level-voting ensemble using my designed scoring mechanism
import torch
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
from collections import defaultdict , Counter
import numpy as np
import math

# Number of ensemble components; flat candidate indices are split into
# (component, beam) with ``% NUMBER_of_components`` and ``// NUMBER_of_components``.
NUMBER_of_components = 5

# Shared tokenizer used to decode candidate token ids back to text.
# A '[PAD]' token is registered and padding is applied on the left.
TOKENIZER = AutoTokenizer.from_pretrained('huggyllama/llama-7b')
TOKENIZER.add_special_tokens({'pad_token': '[PAD]'})
TOKENIZER.padding_side = "left"

def group_tokenized_responses(tokenized_responses, inputs_log_prob, number_of_components=None):
    """Group candidates sharing the same bag of tokens and average their log-probabilities.

    Two candidates belong to the same group when they contain exactly the same
    tokens with the same multiplicities; the order of the tokens is ignored.
    Every member of a group gets the group's mean log-probability written back
    into ``inputs_log_prob`` (the containers are modified in place).

    A flat candidate index ``idx`` addresses component
    ``idx % number_of_components`` and beam ``idx // number_of_components``.

    Args:
        tokenized_responses: sequence of token lists, one per candidate, in
            flat-index order.
        inputs_log_prob: per-component 2-D arrays/tensors indexed as
            ``inputs_log_prob[component][0, beam]``; elements must support
            ``.item()`` and item assignment.
        number_of_components: number of ensemble components. Defaults to the
            module-level ``NUMBER_of_components`` when omitted.

    Returns:
        ``(grouped_indices, inputs_log_prob)`` where ``grouped_indices`` is a
        list of lists of flat candidate indices (groups ordered by first
        appearance) and ``inputs_log_prob`` is the same object that was passed
        in, now holding the averaged values.
    """
    if number_of_components is None:
        number_of_components = NUMBER_of_components

    # Bag-of-tokens signature -> flat indices of the candidates that share it.
    groups = defaultdict(list)
    for idx, tokens in enumerate(tokenized_responses):
        # Sorted (token, count) pairs are hashable and independent of token order.
        bag_signature = tuple(sorted(Counter(tokens).items()))
        groups[bag_signature].append(idx)

    grouped_indices = list(groups.values())

    for group in grouped_indices:
        # Integer arithmetic keeps the (component, beam) split exact.
        positions = [
            (idx % number_of_components, idx // number_of_components)
            for idx in group
        ]
        avg_prob = np.mean(
            [inputs_log_prob[component][0, beam].item() for component, beam in positions]
        )
        for component, beam in positions:
            inputs_log_prob[component][0, beam] = avg_prob

    return grouped_indices, inputs_log_prob

    #calculating the similarity score of the two tokenized sentences.
def similarity_meassure(hypothesis_tokens, reference_tokens, in_table_keywords):
    """Return the log of a smoothed, keyword-weighted token-overlap score.

    For every distinct hypothesis token the overlap with the reference is the
    clipped count ``min(count_in_hypothesis, count_in_reference)``. Tokens
    found in ``in_table_keywords`` contribute their full overlap, all other
    tokens contribute half of it. The score is

        (2 * weighted_overlap + 1) / (len(hypothesis) + len(reference) + 1)

    where the ``+ 1`` terms keep the value strictly positive, so the logarithm
    is always defined. The result is never greater than 0.

    Args:
        hypothesis_tokens: list of tokens of the candidate being scored.
        reference_tokens: list of tokens of the candidate it is compared with.
        in_table_keywords: collection of tokens that receive full weight.

    Returns:
        ``math.log`` of the score described above (a float ``<= 0``).
    """
    hypothesis_counts = Counter(hypothesis_tokens)
    reference_counts = Counter(reference_tokens)

    # A list/tuple of keywords would be scanned linearly for every token;
    # hash it once instead. Other container types are used unchanged so their
    # own membership semantics are kept.
    keyword_lookup = in_table_keywords
    if isinstance(in_table_keywords, (list, tuple)):
        try:
            keyword_lookup = frozenset(in_table_keywords)
        except TypeError:
            # Unhashable entries: fall back to the original linear membership test.
            keyword_lookup = in_table_keywords

    total_clipped = 0
    # Counter preserves first-occurrence order and returns 0 for missing keys,
    # so each distinct token is handled exactly once.
    for token, hypothesis_count in hypothesis_counts.items():
        overlap = min(hypothesis_count, reference_counts[token])
        total_clipped += overlap if token in keyword_lookup else overlap / 2

    similarity = (total_clipped * 2 + 1) / (len(hypothesis_tokens) + len(reference_tokens) + 1)

    return math.log(similarity)

def _log_add_exp(first, second):
    """Return log(exp(first) + exp(second)) without leaving log space."""
    alpha = max(first, second)
    beta = min(first, second)
    return alpha + torch.log1p(torch.exp(beta - alpha))


def ensemble(inputs_ids, inputs_log_prob, starting_batch_input_len, batch_text):
    """Select the best candidates across all components by similarity-weighted voting.

    Every (component, beam) candidate is decoded and tokenized, then scored by
    accumulating, in log space, ``log_prob(other) + log_similarity(candidate, other)``
    over all other candidates and finally adding the candidate's own
    log-probability. Two scores are kept per candidate:

    * a *selection* score built from length-penalised log-probabilities
      (``log_prob / gen_len ** 0.1``), used only to rank candidates;
    * an *updating* score built from the un-penalised log-probabilities,
      returned as the new score of each selected candidate.

    Candidates with the same bag of tokens are grouped by
    ``group_tokenized_responses``; only the first member of each group keeps
    its selection score, so duplicates are chosen only when nothing else is
    left.

    Args:
        inputs_ids: list (one entry per component) of tensors indexed as
            ``[position, beam]``.
        inputs_log_prob: list (one entry per component) of tensors indexed as
            ``[0, beam]``. NOTE: modified in place, grouped candidates are
            overwritten with their group's mean log-probability.
        starting_batch_input_len: list of ints, one per component, giving the
            position where the generated text starts in ``inputs_ids``.
        batch_text: list of prompt strings; only ``batch_text[0]`` is read, to
            extract the schema part of the prompt.

    Returns:
        ``(ensembled_inputs_ids, ensembled_inputs_log_prob)``: two lists of
        length 5. The first holds the generated part
        (``inputs_ids[component][start:, beam]``) of each selected candidate
        in rank order; the second holds the matching updating scores.
    """
    max_number_of_selections = 5
    num_beam = inputs_ids[0].size(dim=-1)

    # Tokens of the schema section of the prompt get full weight in the similarity.
    # NOTE(review): only the first component's prompt is inspected, presumably
    # all components share the same schema text. Confirm against the caller.
    table_creation_part_prompt = batch_text[0].split('Given the following database schema:')[-1].split('Answer the following')[0]
    # A set gives constant-time membership tests inside similarity_meassure.
    in_table_keywords = set(word_tokenize(table_creation_part_prompt))

    # Decode candidates beam by beam, so the flat order is
    # [comp0_beam0, comp1_beam0, ..., comp0_beam1, comp1_beam1, ...].
    # NOTE(review): the whole column is decoded (not only the part after
    # starting_batch_input_len). Confirm that prompt tokens are meant to take
    # part in the similarity.
    decoded_text_list = []
    for beam in range(num_beam):
        beam_sequences = [inputs_ids[component][:, beam] for component in range(NUMBER_of_components)]
        decoded_text_list.extend(TOKENIZER.batch_decode(beam_sequences, skip_special_tokens=True))
    tokenized_responses = [word_tokenize(text.replace('.', ' ')) for text in decoded_text_list]

    # Candidates with the same bag of tokens share their mean log-probability.
    grouped_indices, inputs_log_prob = group_tokenized_responses(tokenized_responses, inputs_log_prob)

    # Length-penalised log-probabilities, used only for ranking.
    tmp_input_log_prob = []
    for input_id, starting_gen, input_log_prob in zip(inputs_ids, starting_batch_input_len, inputs_log_prob):
        gen_text_len = input_id.size(0) - starting_gen
        tmp_input_log_prob.append(input_log_prob / (gen_text_len ** 0.1))

    selection_score_list = []  # ranking scores (with length penalty)
    updating_score_list = []   # scores reported for selected candidates (no length penalty)
    for j, tokenized_response in enumerate(tokenized_responses):
        # None marks "nothing accumulated yet". A numeric sentinel such as 0
        # would be confused with a genuine running score of exactly 0.
        selection_score = None
        updating_score = None
        for index, other_response in enumerate(tokenized_responses):
            if index == j:
                continue
            component = index % NUMBER_of_components
            beam = index // NUMBER_of_components
            # The similarity is shared by both scores, so compute it once.
            log_similarity = similarity_meassure(tokenized_response, other_response, in_table_keywords)
            selection_term = tmp_input_log_prob[component][0, beam] + log_similarity
            updating_term = inputs_log_prob[component][0, beam] + log_similarity
            if selection_score is None:
                selection_score = selection_term
                updating_score = updating_term
            else:
                selection_score = _log_add_exp(selection_term, selection_score)
                updating_score = _log_add_exp(updating_term, updating_score)
        if selection_score is None:
            # Only one candidate exists: nothing to vote with.
            selection_score = 0
            updating_score = 0

        own_component = j % NUMBER_of_components
        own_beam = j // NUMBER_of_components
        selection_score_list.append(selection_score + tmp_input_log_prob[own_component][0, own_beam])
        updating_score_list.append(updating_score + inputs_log_prob[own_component][0, own_beam])

    # Keep the ranking score of the first member of each group only.
    for group in grouped_indices:
        for duplicate_index in group[1:]:
            selection_score_list[duplicate_index] = -100000

    # Pick the top candidates. The arg-max is taken on the working copy and a
    # chosen entry is pushed below every other value, so no index is selected
    # twice while unselected candidates remain (ties included).
    remaining_scores = selection_score_list.copy()
    selected_indices = []
    for _ in range(max_number_of_selections):
        best_index = max(range(len(remaining_scores)), key=remaining_scores.__getitem__)
        remaining_scores[best_index] = float('-inf')
        selected_indices.append(best_index)

    ensembled_inputs_ids = []
    ensembled_inputs_log_prob = []
    for flat_index in selected_indices:
        component = flat_index % NUMBER_of_components
        beam = flat_index // NUMBER_of_components
        start_of_selected_gen_text = starting_batch_input_len[component]
        # Only the generated part of the selected sequence is returned.
        ensembled_inputs_ids.append(inputs_ids[component][start_of_selected_gen_text:, beam])
        ensembled_inputs_log_prob.append(updating_score_list[flat_index])

    return ensembled_inputs_ids, ensembled_inputs_log_prob

#To use the above algorithm we need the following things: input_ids, inputs_log_prob, starting_batch_input_len, batch_text


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
import pickle as pkl

# Merged ensemble inputs: one pickle per parameter, sharing this prefix.
prefix = './MBRoutput_sequences_total-4'
post_fixes = ['_batch_text', '_input_ids', '_inputs_log_prob', '_starting_batch_input_len']

# Load the parameter lists in the order given by post_fixes.
# NOTE: pickle.load can execute arbitrary code; only use files you produced yourself.
lists_of_parameter_list = []
for post_fix in post_fixes:
    filename = prefix + post_fix + '.pkl'
    with open(filename, 'rb') as f:
        lists_of_parameter_list.append(pkl.load(f))

# final_answers[k] collects the rank-(k+1) answer of every example.
final_answers = [[] for _ in range(5)]
for counter, (batch_text, inputs_ids, inputs_log_prob, starting_batch_input_len) in enumerate(
        zip(*lists_of_parameter_list)):
    print(counter)
    ensembled_inputs_ids, ensembled_inputs_log_prob = ensemble(
        inputs_ids, inputs_log_prob, starting_batch_input_len, batch_text)
    ensembled_inputs_text = TOKENIZER.batch_decode(ensembled_inputs_ids,
                                                   skip_special_tokens=True)
    for rank in range(len(final_answers)):
        final_answers[rank].append(ensembled_inputs_text[rank])
    # Show the top-ranked answer and its score for a quick visual check.
    print('ensembled_inputs_ids: ', ensembled_inputs_text[0])
    print('ensembled_inputs_log_prob: ', ensembled_inputs_log_prob[0])
    print('----------------------------------------------------------')

# One output pickle per rank.
final_output_filenames = ['./MBRoutput_sequences_total-4_rank1.pkl',
                          './MBRoutput_sequences_total-4_rank2.pkl',
                          './MBRoutput_sequences_total-4_rank3.pkl',
                          './MBRoutput_sequences_total-4_rank4.pkl',
                          './MBRoutput_sequences_total-4_rank5.pkl']
for file_name, ranked_answers in zip(final_output_filenames, final_answers):
    with open(file_name, 'wb') as f:
        pkl.dump(ranked_answers, f)

    

  return torch.load(io.BytesIO(b))


0
ensembled_inputs_ids:  count(*) FROM singer


ensembled_inputs_log_prob:  tensor(0.4538, device='cuda:0')
----------------------------------------------------------
1
ensembled_inputs_ids:  count(*) FROM singer


ensembled_inputs_log_prob:  tensor(0.3196, device='cuda:0')
----------------------------------------------------------
2
ensembled_inputs_ids:  Name, Country, Age FROM singer ORDER BY Age ASC


ensembled_inputs_log_prob:  tensor(-4.5062, device='cuda:0')
----------------------------------------------------------
3
ensembled_inputs_ids:  Name, Country, Age FROM singer ORDER BY Age DESC


ensembled_inputs_log_prob:  tensor(-2.9997, device='cuda:0')
----------------------------------------------------------
4
ensembled_inputs_ids:  avg(Age), min(Age), max(Age) FROM singer WHERE Country = 'France'


ensembled_inputs_log_prob:  tensor(-6.8246, device='cuda:0')
----------------------------------------------------------
5
ensembled_inputs_ids:  avg(age) ,  min(age) ,  max(age) FROM

In [3]:
# One directory of concatenated outputs per ensemble component.
prefixes = ['./outputs2_0_3/output_sequences_', './outputs2_6_9/output_sequences_',
            './outputs4_12_15/output_sequences_', './outputs4_3_6/output_sequences_',
            './outputs4_9_12/output_sequences_']
post_fixes = ['_batch_text', '_input_ids', '_inputs_log_prob', '_starting_batch_input_len']

# For each parameter, load every component's list and zip them so that entry k
# of the saved list holds the five components' values for example k.
for post_fix in post_fixes:
    components_list_of_outputs = []
    for prefix in prefixes:
        filename = prefix + 'total-4' + post_fix + '.pkl'
        with open(filename, 'rb') as f:
            list_of_outputs = pkl.load(f)
        if post_fix == '_starting_batch_input_len':
            # This list is shorter than the others, so its entries are repeated
            # to line up with them: x5 for the first 200 entries, x3 for the
            # following ones, then x5 and x2 for the last two.
            # NOTE(review): presumably one entry per generation batch, repeated
            # by that batch's size. Confirm against the generation script.
            print(len(list_of_outputs))
            expanded_outputs = []
            for ele in list_of_outputs[:200]:
                expanded_outputs.extend([ele] * 5)
            for ele in list_of_outputs[200:-2]:
                expanded_outputs.extend([ele] * 3)
            expanded_outputs.extend([list_of_outputs[-2]] * 5)
            expanded_outputs.extend([list_of_outputs[-1]] * 2)
            list_of_outputs = expanded_outputs
            print(len(list_of_outputs))
        components_list_of_outputs.append(list_of_outputs)

    # Transpose: per-component lists -> per-example lists of five values.
    postfix_total_list = [list(per_example) for per_example in zip(*components_list_of_outputs)]

    new_prefix = './MBRoutput_sequences_total-4'
    output_filename = new_prefix + post_fix + '.pkl'
    with open(output_filename, 'wb') as f:
        pkl.dump(postfix_total_list, f)
    
        

211
1034
211
1034
211
1034
211
1034
211
1034


In [2]:
#grouping the pieces of output_sequences
import pickle as pkl

prefix = './outputs2_0_3/output_sequences_'
post_fixes = ['_batch_text', '_input_ids', '_inputs_log_prob', '_starting_batch_input_len']

# Concatenate the 50-example chunk files of one component into a single
# "total" pickle per parameter.
for post_fix in post_fixes:
    output_sequences = []
    for start in range(0, 1050, 50):
        # Chunks are named "<start>_<start+50>"; the last one is "<start>_end".
        if start < 1000:
            chunk_span = str(start) + '_' + str(start + 50)
        else:
            chunk_span = str(start) + '_end'
        file_name = prefix + chunk_span + '-4' + post_fix + '.pkl'
        with open(file_name, 'rb') as f:
            output_sequences.extend(pkl.load(f))
    with open(prefix + 'total-4' + post_fix + '.pkl', 'wb') as h:
        pkl.dump(output_sequences, h)
            

  return torch.load(io.BytesIO(b))
