In [79]:
#Code for sequence-level-voting ensemble using my designed scoring mechanism
import torch
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
from collections import defaultdict , Counter
import numpy as np
import math
import CodeS.SQLtree_based_similarity as sql_scoring
import copy
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize, RegexpTokenizer
import nltk
import re

# Hugging Face model id whose tokenizer is used to decode candidate token ids.
# Exactly one MODEL_NAME line should be active; the others are kept as alternatives.
# MODEL_NAME = 'huggyllama/llama-7b'
# MODEL_NAME = 'seeklhy/codes-1b'
MODEL_NAME = 'seeklhy/codes-7b-bird-with-evidence'
# MODEL_NAME = 'Qwen/Qwen2.5-Coder-7B-Instruct'
# MODEL_NAME = 'Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8'


# Number of ensemble components (one per prompt/model variant). The functions below
# use it to convert between a flat candidate index and a (component, beam) pair,
# so it must match the number of components actually loaded. Don't forget to set this properly!
NUMBER_of_components = int(8)
# Loaded once at import time; used by decode_tokenize_apply_lenpen() for batch_decode.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
# TOKENIZER.add_special_tokens({'pad_token': '[PAD]'})
# TOKENIZER.padding_side="left"
# Exponent applied to the generated length when normalising sequence log-probabilities
# (log_prob / generated_len ** LENGTH_PEN); 1 means plain per-token averaging.
# LENGTH_PEN = 0.1
LENGTH_PEN = 1

def group_tokenized_responses( tokenized_responses, inputs_log_prob):
    """Group candidates with the same bag of tokens and average their log-probs.

    Two candidates fall into the same group when they contain exactly the same
    tokens with the same multiplicities (token order is ignored).

    Args:
        tokenized_responses: list of token lists, one per candidate.
        inputs_log_prob: per-component containers indexed as
            ``inputs_log_prob[component][0, beam]``. A flat candidate index
            ``idx`` is mapped to ``component = idx % NUMBER_of_components`` and
            ``beam = int(idx / NUMBER_of_components)``.
            NOTE(review): this presumes a beam-major candidate ordering -
            confirm against the caller. The container is modified in place.

    Returns:
        (grouped_indices, inputs_log_prob): the list of index groups (in order
        of first appearance) and the same ``inputs_log_prob`` object, where
        every member of a group now carries the group's mean log-probability.
    """
    def _locate(flat_idx):
        # Flat candidate index -> (component, beam) position.
        return int(flat_idx % NUMBER_of_components), int(flat_idx / NUMBER_of_components)

    # A sorted (token, count) tuple is an order-independent, hashable signature.
    buckets = defaultdict(list)
    for position, tokens in enumerate(tokenized_responses):
        signature = tuple(sorted(Counter(tokens).items()))
        buckets[signature].append(position)
    grouped_indices = list(buckets.values())

    for members in grouped_indices:
        cells = [_locate(member) for member in members]
        # Read every member's value before overwriting any of them.
        avg_prob = np.mean([inputs_log_prob[comp][0, beam].item() for comp, beam in cells])
        for comp, beam in cells:
            inputs_log_prob[comp][0, beam] = avg_prob

    return grouped_indices, inputs_log_prob

def custom_tokenize( text, must_have_textnum=True):
    """Tokenize SQL-like text while keeping every quoted literal as one token.

    Quoted spans (double quotes, backticks or single quotes) are swapped for a
    placeholder word, the remaining text is tokenized with NLTK's
    ``word_tokenize`` and the placeholders are then replaced by the original
    quoted spans, in order.

    Args:
        text: the string to tokenize.
        must_have_textnum: when True (default) only tokens that contain at
            least one ASCII letter or digit are returned, which drops pure
            punctuation/operator tokens.

    Returns:
        list of str tokens.
    """
    # Regular expression pattern to match quoted text (non-greedy, single line).
    quote_pattern = r'\".*?\"|\`.*?\`|\'.*?\''
    placeholder = "QUOTE_PLACE_HOLDER"

    # Use the `re` module directly rather than the accidental `nltk.re` alias.
    quoted_texts = re.findall(quote_pattern, text)

    # Surround the placeholder with spaces so it always becomes a token of its
    # own. Without the padding, input such as `name='x'` or `'a'||'b'` yields a
    # token like `name=QUOTE_PLACE_HOLDER`, which never matches the placeholder
    # and silently drops the literal.
    modified_text = re.sub(quote_pattern, ' ' + placeholder + ' ', text)

    tokens = word_tokenize(modified_text)

    final_tokens = []
    remaining_quotes = iter(quoted_texts)
    for token in tokens:
        if token == placeholder:
            # If the text itself contained the placeholder word there can be
            # more placeholders than literals; keep the token instead of
            # raising IndexError.
            final_tokens.append(next(remaining_quotes, token))
        else:
            final_tokens.append(token)

    if must_have_textnum:
        return [s for s in final_tokens if re.search(r'[a-zA-Z0-9]', s)]
    return final_tokens

#calculating the similarity score of the two tokenized sentences.
def similarity_meassure( hypothesis_tokens , reference_tokens, in_table_keywords):
    """Weighted token-overlap similarity between two tokenized sentences.

    For every distinct hypothesis token the clipped count
    ``min(count in hypothesis, count in reference)`` is computed. Tokens that
    appear in ``in_table_keywords`` contribute their full clipped count, all
    other tokens contribute half of it. The result is a smoothed Dice-style
    ratio: ``(2 * overlap + 0.1) / (len(hypothesis) + len(reference) + 0.1)``,
    so it is always strictly positive.

    Args:
        hypothesis_tokens: list of tokens of the candidate being scored.
        reference_tokens: list of tokens of the candidate it is compared to.
        in_table_keywords: container of tokens that receive full weight.

    Returns:
        float similarity score.
    """
    hyp_freq = Counter(hypothesis_tokens)
    ref_freq = Counter(reference_tokens)

    # Counter keeps first-occurrence order and returns 0 for unseen tokens.
    overlap = 0
    for token, hyp_n in hyp_freq.items():
        shared = min(hyp_n, ref_freq[token])
        overlap += shared if token in in_table_keywords else shared / 2

    total_len = len(hypothesis_tokens) + len(reference_tokens)
    return (overlap * 2 + 0.1) / (total_len + 0.1)

def decode_tokenize_apply_lenpen(inputs_ids, inputs_log_prob, starting_batch_input_len):
    """Decode the generated part of every beam, tokenize it and length-normalise the scores.

    Args:
        inputs_ids: one token-id tensor per component. Each tensor is sliced as
            ``[start:, :]`` and transposed before decoding, i.e. it is assumed
            to be laid out as (sequence position, beam) - TODO confirm.
        inputs_log_prob: one log-probability tensor per component.
        starting_batch_input_len: per component, the index at which the
            generated continuation starts (everything before it is skipped).

    Returns:
        (tokenized_responses, decoded_text_list, tmp_input_log_prob):
        the token list of every candidate, the post-processed SQL string of
        every candidate (component after component) and, per component, the
        log-probabilities divided by ``generated_length ** LENGTH_PEN``.
    """
    # Decode only the generated continuation of each component's beams.
    raw_texts = []
    for ids, start in zip(inputs_ids, starting_batch_input_len):
        generated = ids[start:, :].transpose(0, 1)
        raw_texts.extend(TOKENIZER.batch_decode(generated, skip_special_tokens=True))

    # Turn each generation into a SQL string (this is for SPIDER dataset and llama model)
    # and tokenize it with dots treated as separators.
    decoded_text_list = [post_process_get_sql_from_gentext(raw) for raw in raw_texts]
    tokenized_responses = [custom_tokenize(sql.replace('.' , ' ')) for sql in decoded_text_list]

    # Merging identical candidates via group_tokenized_responses() is currently
    # disabled here; only the length penalty is applied.
    tmp_input_log_prob = [
        log_prob / ((ids.size(0) - start) ** LENGTH_PEN)
        for log_prob, ids, start in zip(inputs_log_prob, inputs_ids, starting_batch_input_len)
    ]
    return tokenized_responses, decoded_text_list, tmp_input_log_prob

from utils.post_process import process_duplication
def post_process_get_sql_from_gentext(gen_text):
    """Turn raw generated text into a single-line SQL string starting with SELECT.

    Newlines and runs of whitespace are collapsed to single spaces, the result
    is passed through the project helper ``process_duplication`` and, unless
    it already starts with the (upper-case) keyword ``SELECT``, that keyword is
    prepended.

    Args:
        gen_text: the decoded model output.

    Returns:
        The cleaned SQL string.
    """
    collapsed = " ".join(gen_text.replace("\n", " ").split())
    sql = process_duplication(collapsed)

    if sql.startswith("SELECT"):
        return sql
    if sql.startswith(" "):
        # A leading space is already present, so none is inserted after the keyword.
        return "SELECT" + sql
    return "SELECT " + sql

def scale_to_0_and_1(arr):
    """Min-max scale ``arr`` so that its values lie in [0, 1].

    The minimum and maximum are taken over the whole array (all axes).

    Args:
        arr: array-like of numbers.

    Returns:
        numpy array of the same shape. When every element is equal (the range
        is zero) an array of zeros is returned instead of dividing by zero and
        producing NaNs; an empty input is returned as an empty float array.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        # np.min/np.max raise on empty input; there is nothing to scale.
        return arr.astype(float)

    arr_min = np.min(arr)
    arr_max = np.max(arr)
    span = arr_max - arr_min
    if span == 0:
        # All values identical: 0/0 would fill the result with NaN and break
        # every later max()/ranking step.
        return np.zeros(arr.shape, dtype=float)
    return (arr - arr_min) / span
    
def ensemble( inputs_ids , inputs_log_prob , batch_text , starting_batch_input_len=None, similarity_func='avg' , keep_all=False, comp_prob=None ):# , past_key_value_tensor ):
    """Rank the candidate SQL queries of all components by agreement and probability.

    Every candidate is compared with every other candidate. The (log) pairwise
    similarities are min-max scaled, added to the log-probability of the
    candidate compared against, scaled again and combined per candidate with a
    running log-sum-exp. That agreement score is scaled once more and added to
    the candidate's own scaled log-probability to give the final score.

    Args:
        inputs_ids: one entry per component. With ``starting_batch_input_len``
            given, each entry is a token-id tensor whose last dimension is the
            beam dimension (the original notes describe the stacked shape as
            (batch_size, input_len, num_beam) - TODO confirm). Otherwise each
            entry is a list of already decoded strings, one per beam.
        inputs_log_prob: per component, the log-probability of each beam,
            indexed as ``inputs_log_prob[component][beam]``. In string mode
            the values are used as given (no length penalty is applied here).
        batch_text: list of prompts; only ``batch_text[0]`` is read. The text
            after the last 'database schema :' marker supplies the schema
            tokens that get full weight in the token-overlap similarity.
        starting_batch_input_len: per component, index where the generated
            text starts. ``None`` selects string mode.
        similarity_func: 'jacc' (weighted token overlap), 'tree'
            (``sql_scoring.unparsed_query_similarity``), 'bleu' (sentence BLEU)
            or 'avg' (mean of the token-overlap and tree similarities). 'tree',
            'bleu' and 'avg' fall back to the token overlap when their own
            score is exactly 0, because log(0) is undefined.
        keep_all: when True every candidate is returned, ordered by score.
            When False, candidates with the same bag of tokens are collapsed
            to their first occurrence and ``num_beam`` results are returned.
        comp_prob: optional per-component log-probability added to every beam
            of that component; defaults to zeros.

    Returns:
        list of post-processed SQL strings ordered from highest to lowest score.

    Raises:
        ValueError: if ``similarity_func`` is not one of the supported names.
    """
    if similarity_func not in ('jacc', 'tree', 'bleu', 'avg'):
        raise ValueError(
            "similarity_func must be one of 'jacc', 'tree', 'bleu' or 'avg', got %r" % (similarity_func,)
        )

    if comp_prob is None:
        comp_prob = torch.tensor([0 for _ in range(len(inputs_ids))])
    num_beam = inputs_ids[0].size( dim=-1 ) if starting_batch_input_len is not None else len(inputs_ids[0])

    table_creation_part_prompt = batch_text[0].split('database schema :')[-1] # this is for BIRD
    # table_creation_part_prompt = batch_text[0].split('Given the following database schema:')[-1].split('Answer the following')[0]#This is for SPIDER

    # A set gives O(1) membership tests inside the pairwise similarity loop.
    in_table_keywords = set(custom_tokenize(table_creation_part_prompt.replace('.' , ' ')))

    if starting_batch_input_len is None:
        # String mode: candidates are already decoded, component after component.
        tmp_input_log_prob = inputs_log_prob
        decoded_text_list = [
            post_process_get_sql_from_gentext(beam)
            for comp_beams in inputs_ids
            for beam in comp_beams
        ]
        tokenized_responses = [custom_tokenize(text.replace('.' , ' ')) for text in decoded_text_list]
    else:
        tokenized_responses, decoded_text_list, tmp_input_log_prob = decode_tokenize_apply_lenpen(inputs_ids, inputs_log_prob, starting_batch_input_len)

    num_candidates = len(tokenized_responses)
    if num_candidates == 1:
        # Nothing to compare against; the pairwise matrices would be empty.
        return list(decoded_text_list)

    def _candidate_log_prob(flat_idx):
        # Candidates are stored component-major: flat_idx = component * num_beam + beam.
        comp = flat_idx // num_beam
        return tmp_input_log_prob[comp][flat_idx % num_beam] + comp_prob[comp]

    # Empty strings are replaced for the tree/BLEU scorers only; the returned
    # queries still come from decoded_text_list.
    scoring_queries = [text if text != '' else '***weirdanswer***' for text in decoded_text_list]
    bleu_smoothing = SmoothingFunction().method1

    similarity_matrix = []
    prob_matrix = []
    prob_vector = []
    #Calculating the score for each candidate
    for j in range(num_candidates):
        query = scoring_queries[j]
        tokenized_response = tokenized_responses[j]
        similarity_list = []
        prob_list = []
        for index in range(num_candidates):
            if index == j:
                continue
            other_query = scoring_queries[index]

            jacc_weighted_sim = similarity_meassure( tokenized_response, tokenized_responses[index], in_table_keywords )

            if similarity_func == 'jacc':
                similarity = math.log(jacc_weighted_sim)
            elif similarity_func == 'bleu':
                # sentence_bleu expects a LIST of reference token lists as its
                # first argument, hence the extra brackets.
                blue_score = sentence_bleu(
                    [ word_tokenize( query.replace('.' , ' ') ) ],
                    word_tokenize( other_query.replace('.' , ' ') ),
                    smoothing_function = bleu_smoothing,
                )
                if blue_score == 0:
                    similarity = math.log(jacc_weighted_sim)
                else:
                    similarity = math.log( blue_score )
            else:
                # 'tree' and 'avg' both need the tree-based similarity.
                tree_based_sim = sql_scoring.unparsed_query_similarity(query , other_query)
                if tree_based_sim == 0:
                    similarity = math.log(jacc_weighted_sim)
                elif similarity_func == 'tree':
                    similarity = math.log( tree_based_sim )
                else:
                    similarity = math.log( (jacc_weighted_sim + tree_based_sim)/2 )

            similarity_list.append(similarity)
            prob_list.append(_candidate_log_prob(index))

        prob_vector.append(_candidate_log_prob(j))
        similarity_matrix.append(similarity_list)
        prob_matrix.append(prob_list)

    scaled_similarity = scale_to_0_and_1( np.array(similarity_matrix) )
    sim_prob_matrix = np.array(prob_matrix) + scaled_similarity
    scaled_sim_prob = scale_to_0_and_1( sim_prob_matrix )
    scaled_prob_vector = scale_to_0_and_1( np.array(prob_vector) )

    # Running log-sum-exp over the columns: log(exp(a) + exp(b)) computed as
    # max + log1p(exp(min - max)) for numerical stability.
    selection_score = scaled_sim_prob[:,0:1]
    for i in range(1, scaled_sim_prob.shape[1]):
        new_comparison_vector = np.concatenate( (selection_score , scaled_sim_prob[:,i:i+1]) , axis=1 )
        alpha = torch.from_numpy( np.max( new_comparison_vector , axis=1 ) ).unsqueeze(1)
        beta = torch.from_numpy( np.min( new_comparison_vector , axis=1 ) ).unsqueeze(1)
        selection_score = alpha + torch.log1p( torch.exp(beta-alpha) )

    # With a single comparison column the loop never runs and the score is
    # still a numpy array, so only convert when it actually is a tensor.
    if torch.is_tensor(selection_score):
        selection_score = selection_score.detach().cpu().numpy()
    scaled_selection_score = scale_to_0_and_1( selection_score )

    final_score = ( scaled_selection_score + np.expand_dims( scaled_prob_vector, 1 ) )
    selection_score_list = final_score.flatten().tolist() #This scoring is used to select the best candidates.

    if keep_all==False:
        #Ignoring all identical candidates and only keeping one.
        # Candidates are identical when they share the same bag of tokens.
        groups = defaultdict(list)
        for idx, tokens in enumerate(tokenized_responses):
            groups[tuple(sorted(Counter(tokens).items()))].append(idx)
        for group in groups.values():
            for duplicate_idx in group[1:]:
                selection_score_list[duplicate_idx] = torch.finfo(torch.float32).min

    output_queries_in_order = []
    tmp_score_list = selection_score_list.copy()
    if keep_all == True:
        # Return every candidate; derived from the data instead of the global
        # NUMBER_of_components so a mis-set constant cannot truncate or
        # duplicate results.
        num_sel_candid = len(tmp_score_list)
    else:
        num_sel_candid = num_beam
    #Repeatedly take the best remaining candidate (first index wins ties).
    for beam in range(num_sel_candid):
        max_bleu_score_value = max( tmp_score_list )
        max_index_bleu_score = tmp_score_list.index(max_bleu_score_value)
        # Mark as used so that it is not picked again.
        tmp_score_list[ max_index_bleu_score ] = torch.finfo(torch.float32).min
        output_queries_in_order.append(  decoded_text_list[ max_index_bleu_score ]  )

    return output_queries_in_order

#To use the above algorithm we need the following things: input_ids, inputs_log_prob, starting_batch_input_len, batch_text


In [None]:
#Performing ensemble on the list of lists that are provided from the cell below
# Reads the per-sample parameter files '<prefix><post_fix>.pkl', runs ensemble() on every
# sample and pickles the flat list of ranked queries.
# NOTE(review): this cell consumes files written by the cell that follows it in the notebook,
# so it cannot succeed on a fresh "Restart & Run All" in the current cell order.
# NOTE(review): pickle.load executes arbitrary code; only load files you produced yourself.
import pickle as pkl
import math
prefix = './MBRoutput_sequences_total-4'

#This is for the time when we have input_ids and _starting_batch_input_len:
# post_fixes = ['_batch_text' , '_input_ids' , '_inputs_log_prob' , '_starting_batch_input_len' ]
# lists_of_parameter_list = [ [],[],[],[] ]

##This is for the time when we only have the strings and the probabilities are already with length penalty
# NOTE(review): the producer cell writes '_a1b2Joint6comp_beams_log_prob' while this cell reads
# '_a1b1Joint6comp_beams_log_prob' - confirm the intended file is being loaded.
post_fixes = ['_batch_text' , '_a1b1Joint6comp_beams_log_prob' , '']
# post_fixes = ['_batch_text' , '_input_ids' , '']
lists_of_parameter_list = [ [],[],[] ]
# One component-probability file per component.
# NOTE(review): the first two entries name the same file - confirm this is intended.
comp_prob_filenames = ['oracle_comp_prob_per_sample_2.pkl' , 'oracle_comp_prob_per_sample_2.pkl' , 'oracle_comp_prob_per_sample_divpen05_1.pkl',
                       'oracle_comp_prob_per_sample_divpen05_2.pkl' , 'oracle_comp_prob_per_sample_divpen1_1.pkl', 'oracle_comp_prob_per_sample_divpen1_2.pkl',
                      'oracle_comp_prob_per_sample_divpen2_1.pkl', 'oracle_comp_prob_per_sample_divpen2_2.pkl'] #This is for component probability
comp_prob_list = [  ] # This is for the component probability

#reading the output files to get the parameter to perform ensemble.
# lists_of_parameter_list[k] holds the content of the k-th post_fix file (indexed by sample).
for index, post_fix in enumerate(post_fixes):
    filename = prefix + post_fix + '.pkl'
    with open(filename , 'rb') as f:
        lists_of_parameter_list[index] = pkl.load(f)
        
for i , filename in enumerate(comp_prob_filenames): #This is for the component probability
    with open(filename , 'rb') as f:
        comp_prob_list.append(pkl.load(f))

ordered_final_answers_list = []
for sample_idx in range( len(lists_of_parameter_list[0]) ):
    print(sample_idx)
    if len(lists_of_parameter_list) ==3:
        # String mode: the third file (empty post_fix) holds the decoded candidate strings.
        batch_text = lists_of_parameter_list[0][sample_idx]
        inputs_log_prob = lists_of_parameter_list[1][sample_idx]
        output_text = lists_of_parameter_list[2][sample_idx]
        # math.log raises ValueError if a stored component probability is 0.
        comps_probs = torch.tensor([ math.log(comp_prob_list[i][sample_idx]) for i in range(len(comp_prob_list))]) #This is for the component probability
        # NOTE(review): comps_probs is computed but not passed here (comp_prob=None).
        ordered_final_answers = ensemble(output_text, inputs_log_prob, batch_text, similarity_func='tree' , keep_all=True, comp_prob=None)#comps_probs )
    else:
        # Token-id mode: four files (prompt text, token ids, log-probs, prompt lengths).
        batch_text = lists_of_parameter_list[0][sample_idx]
        inputs_ids = lists_of_parameter_list[1][sample_idx]
        inputs_log_prob = lists_of_parameter_list[2][sample_idx]
        starting_batch_input_len = lists_of_parameter_list[3][sample_idx]
        comps_probs = torch.tensor([ math.log(comp_prob_list[i][sample_idx]) for i in range(len(comp_prob_list))]) #This is for the component probability
        ordered_final_answers = ensemble(inputs_ids, inputs_log_prob, batch_text,
                                         starting_batch_input_len, similarity_func='tree' , keep_all=True, comp_prob=comps_probs )
    # The per-sample rankings are concatenated into one flat list.
    ordered_final_answers_list.extend(ordered_final_answers)
    # break

final_output_filenames = 'output_1shot_diffDivpen_8in0-2shotComps_beam4_seq_mine_scaled_pureSeqlevelTreebased_SFTCodeS-7b_a1b1jointMargin6compProb.pkl'
with open(final_output_filenames , 'wb')as f:
    pkl.dump(ordered_final_answers_list , f)
print('len(ordered_final_answers_list): ' , len(ordered_final_answers_list))
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [25]:
#Creating files containing the outputs of the components. Each file contains the one type of output: '_batch_text' , '_input_ids' , '_inputs_log_prob' , '_starting_batch_input_len'
#Each file is a list of lists: [ [output0_comp0, output0comp1, ..., output0comp5] , [output1_comp0, output1comp1, ..., output1comp5] ...  ]
# For every post_fix the per-component files '<prefix><post_fix>.pkl' are loaded and regrouped
# per sample, then written to './MBRoutput_sequences_total-4<post_fix>.pkl'.
# NOTE(review): pickle.load executes arbitrary code; only load files you produced yourself.
import pickle as pkl
#This is for the time when we have input_ids and _starting_batch_input_len:
# One prefix per ensemble component.
prefixes = ['./vulcan_output/output_CodeS_beam4_bird_wEvidence_1shot_SFTCodeS-7b_2ndOf0-5shot' , './vulcan_output/output_CodeS_beam4_bird_wEvidence_1shot_SFTCodeS-7b_3rdOf0-5shot', './vulcan_output/output_CodeS_beam4_divpen05_bird_wEvidence_1shot_SFTCodeS-7b_1stOf0-5shot',
           './vulcan_output/output_CodeS_beam4_divpen05_bird_wEvidence_1shot_SFTCodeS-7b_2ndOf0-5shot' , './vulcan_output/output_CodeS_beam4_divpen1_bird_wEvidence_1shot_SFTCodeS-7b_1stOf0-5shot', './vulcan_output/output_CodeS_beam4_divpen1_bird_wEvidence_1shot_SFTCodeS-7b_2ndOf0-5shot',
           './vulcan_output/output_CodeS_beam4_divpen2_bird_wEvidence_1shot_SFTCodeS-7b_1stOf0-5shot', './vulcan_output/output_CodeS_beam4_divpen2_bird_wEvidence_1shot_SFTCodeS-7b_2ndOf0-5shot']
# prefixes = ['./outputs2_0_3/output_sequences_total-4' , './outputs4_3_6/output_sequences_total-4', './outputs2_6_9/output_sequences_total-4',
#            './outputs4_9_12/output_sequences_total-4' , './outputs4_12_15/output_sequences_total-4']
# post_fixes = ['_batch_text' , '_input_ids' , '_inputs_log_prob' , '_starting_batch_input_len' ]
# post_fix_candidate_range = [1,1,1,1]

##This is for the time when we only have the strings and the probabilities are already with length penalty
# prefixes = ['./output_sequences_bird_lenpen05_0-5shot_Qwen7b' , './output_sequences_bird_lenpen05_5-10shot_Qwen7b', './output_sequences_bird_lenpen05_10-15shot_Qwen7b',
#            './output_sequences_bird_lenpen05_15-20shot_Qwen7b' , './output_sequences_bird_lenpen05_20-25shot_Qwen7b']
post_fixes = ['_batch_text' , '_a1b2Joint6comp_beams_log_prob' , '']
# post_fixes = ['_batch_text' , '_input_ids' , ''] # Its only for components in the same calss with "output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b"

# Number of consecutive entries that belong to one sample in each file type:
# 1 for the prompt text, 4 (one per beam) for the log-probability and string files.
post_fix_candidate_range = [1,4,4]

#What do we need for the ensemble?
#for every item, lets have a list of lists.
for post_fix, post_fix_range in zip(post_fixes, post_fix_candidate_range):
    components_list_of_outputs = [ [] for i in prefixes ]
    for index , prefix in enumerate(prefixes):
        if post_fix == '_batch_text':
            # The prompt text is taken from one fixed file for every component,
            # so the same file is loaded once per component.
            prefix = 'output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b'
        filename = prefix  + post_fix + '.pkl'
        with open(filename , 'rb') as f:
            list_of_outputs = pkl.load(f)
            print(f'{filename}: len={len(list_of_outputs)}')
        # #This might be the code to address the bug happend on handling _starting_batch_input_len for SPIDER dataset with llama.
        # if post_fix== '_starting_batch_input_len':
        #     print(len(list_of_outputs))
        #     new_list_of_outputs = [ele for ele in list_of_outputs[:200] for i in range( 5 )]
        #     new_list_of_outputs.extend([ele for ele in list_of_outputs[200:-2] for i in range( 3 )])
        #     new_list_of_outputs.extend([list_of_outputs[-2] for i in range( 5 )])
        #     new_list_of_outputs.extend([list_of_outputs[-1] for i in range( 2 )])
        #     list_of_outputs = new_list_of_outputs
        #     print(len(list_of_outputs))
            
        components_list_of_outputs[index] = list_of_outputs
    # Regroup: one entry per sample, each holding that sample's output from every component.
    # The sample count is taken from the first component's file.
    postfix_total_list = []
    for sample_idx in range(0, len(components_list_of_outputs[0]), post_fix_range ):
        if post_fix_range>1:
            # A slice of post_fix_range consecutive entries per component.
            sample_outputs = [ components_list_of_outputs[comp][sample_idx:sample_idx+post_fix_range] for comp in range(len(components_list_of_outputs)) ]
        else:
            sample_outputs = [ components_list_of_outputs[comp][sample_idx] for comp in range(len(components_list_of_outputs)) ]
        postfix_total_list.append(sample_outputs)
    # for a, b, c, d, e in zip (components_list_of_outputs[0], components_list_of_outputs[1], components_list_of_outputs[2], components_list_of_outputs[3], components_list_of_outputs[4]):
    #     postfix_total_list.append([a,b,c,d,e])
    new_prefix = './MBRoutput_sequences_total-4'
    output_filename = new_prefix + post_fix + '.pkl'
    print(f'{output_filename}: len={len(postfix_total_list)}')
    with open(output_filename , 'wb')as f:
        pkl.dump(postfix_total_list , f)
    
        

output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
output_codeS_beam5_bird_lenpen05_0-5shot_OneSQLQwen-32b_batch_text.pkl: len=1534
./MBRoutput_sequences_total-4_batch_text.pkl: len=1534
./vulcan_output/output_CodeS_beam4_bird_wEvidence_1shot_SFTCodeS-7b_2ndOf0-5shot_a1b2Joint6comp_beams_log_prob.pkl: len=6136
./vulcan_output/output_CodeS_beam4_bird_wEvidence_1shot_SFTCodeS-7b_3rdOf0-5shot_a1b2Joint6comp_beams_log_prob.pkl: len=6136
./vulcan_output/output_CodeS_beam4_divpen05_bir

In [2]:
#grouping the pieces of output_sequences
# Concatenates the shard files '<prefix><start>_<end>-4<post_fix>.pkl' (50 samples per
# shard, the last one named '1000_end') into a single '<prefix>total-4<post_fix>.pkl'.
import pickle as pkl
prefix = './outputs2_0_3/output_sequences_'
post_fixes = ['_batch_text' , '_input_ids' , '_inputs_log_prob' , '_starting_batch_input_len' ]
# post_fixes = ['_input_ids' , '_inputs_log_prob']
for post_fix in post_fixes:
    output_sequences = []
    for i in range(0 , 1050 , 50):
        # Shards below 1000 are named by their [start, start+50) range; the final one is open-ended.
        shard_end = str(i + 50) if i < 1000 else 'end'
        file_name = f'{prefix}{i}_{shard_end}-4{post_fix}.pkl'
        with open(file_name , 'rb') as f:
            part_of_output = pkl.load(f)
        output_sequences.extend(part_of_output)
    with open(f'{prefix}total-4{post_fix}.pkl' , 'wb') as h:
        pkl.dump(output_sequences , h)
            

  return torch.load(io.BytesIO(b))
