In [35]:
import torch
from transformers import AutoModelForCausalLM , AutoTokenizer
import torch.nn.functional as F
import os.path
import pickle as pkl
import json
import numpy as np
from threading import Thread

class LLM_Word_Level_Ensemble:

    def __init__(self, model_name , device , tokenizer ):
        # Initialize the model and the tokenizer.
        self.device = device
        # self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        # self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = model_name
        self.tokenizer = tokenizer
    
    def get_predictions(self, sentences):
        # Encode the sentence using the tokenizer and return the model predictions.
        self.tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer.padding_side="left"
        inputs = self.tokenizer(sentences, return_tensors="pt" , padding=True).input_ids.to(self.device)
        print('inputs: ' , inputs)
        print('type(inputs): ' , type(inputs))
        print('inputs.size(): ' , inputs.size())
        with torch.no_grad():
            outputs = self.model(inputs)
            predictions = outputs[0]
        return predictions

    def get_the_top_word_from_probability_vector(self , prob_vector , top_k = 1):
        #Gets the top-k words from a probability matrix and returns them.
        #Input:
            #prob_vector: vector with probability for each word. shape(batch_size , vocab_size)
        #Retrun:
            #numpy array of size (top_k , batch_size)
        
        topk_candidates_indexes = torch.topk( prob_vector, top_k ).indices #shape (batch_size , top_k)

        print('topk_candidates_indexes: ' , topk_candidates_indexes)
        # Decode the top k candidates back to words.
        topk_candidates_tokens = tokenizer.batch_decode(topk_candidates_indexes)
        if top_k>1:
            topk_candidates_seperated_tokens = [words.split() for words in topk_candidates_tokens] #this is for when top_k>1
        else:
            topk_candidates_seperated_tokens = topk_candidates_tokens
        return np.array(topk_candidates_seperated_tokens )
        
    def get_next_word_probabilities(self, sentences, top_k=-1):#operation_on_a_batch
        # Getting the word probabilities for the next word. It will have the probability for the top-k words, and it will zero out the rest. If the top_k argument is set to -1 then it will return the whole probability distribution.
        #Input:
            #sentences: list of strings. The batch that we want to process.
            #top_k: integer. Number of words with the highest probability that we want to have their probability back in the returning vecotr
        #Retrun:
            #torch tensor. The vector containing the probability of top_k words for the next word. The rest of the words are zeroed out. shape(batch_size , vocab_size)
        # Get the model predictions for the sentence.
        predictions = self.get_predictions(sentences) #shape [batch_size , sentence_length , vocab_size]
        
        # Get the next token candidates.
        next_token_candidates_tensor = predictions[:, -1, :]
        
        # Get the token probabilities for all candidates.
        all_candidates_probabilities = torch.nn.functional.softmax(
            next_token_candidates_tensor, dim=-1)
        if top_k<1:
            return all_candidates_probabilities
            
        # Get the top k next token candidates.
        topk_candidates_indexes = torch.topk(
            next_token_candidates_tensor, top_k).indices #shape( batch_size , top_k )
        # topk_candidates_indexes= torch.cat( [topk_candidates_indexes,tokenizer.eos_token_id ],dim=0 )
    
        batch_size = topk_candidates_indexes.size(dim=0) # This tensor specifies the index of the sentence in the batch for each top-k word
        batch_indice_tensor = torch.tensor( list(range( batch_size )) ) # shape (batch_size,)
        batch_indice_tensor = torch.reshape(batch_indice_tensor , (batch_size , 1)) #shape(batch_size , 1)
        batch_indice_tensor = batch_indice_tensor.expand( batch_size , top_k ) #shape ( batch_size , top_k )

        masking_matrix = torch.zeros( ( all_candidates_probabilities.size() ) )
        masking_matrix[ batch_indice_tensor , topk_candidates_indexes ] = 1
        
        # Filter the token probabilities for the top k candidates.
        topk_candidates_probabilities = all_candidates_probabilities * masking_matrix
        return topk_candidates_probabilities #shape (batch_size , vocab_size)

    def convert_flatten_index_to_2D_index(self , flatten_index , array_shape ):
        #converting the index in the flatten matrix: datasets_prompts_array.flatten('F')
        #input:
            #flatten_index: integer. index in the flatten matrix
            #array_shape: a tuple. indicating the shape of the 2D array.
        #return:
            #tuple. indicating the indice in the 2D array
        second_dim_index = int(flatten_index/array_shape[0])
        first_dim_index = flatten_index%array_shape[0]
        return first_dim_index , second_dim_index

    # import os.path
    # import pickle as pkl
    # import json
    # import numpy as np
    def load_prompts_from_datasets(self , directory , dataset_list , starting_index=0, ending_index=-1):
        #getting the prompts from the datasets keeping them in a array accordingly, storing the list of prompts in a file for later use. Getting a numpy array ready for LLM for inference in self.all_prompts_in_1D.
        #input:
            #dataset_list: list of string. a list of addresses for the json file datasets.
            #directory: address of the directory that datasets are in. It ends with '/'
            #starting_index: The starting index of prompts that we want to process
            #ending_index: The ending index of prompts that we want to process
        #return:
            #length of the list containing all the prompts
        datasets_prompts_list = []
        for dataset in dataset_list:
            prompt_list = []
            potential_cache_file_name = './ensemble_cache/' + dataset+'.pkl'
            if os.path.isfile(potential_cache_file_name):
                with open(potential_cache_file_name, 'rb') as f:
                    prompt_list = pkl.load(f)
            else:
                dataset_path = directory + dataset + '/' + 'questions.json'
                with open( dataset_path , 'r') as f:
                    dataset_file_byte = f.read()
                    dataset_json_format = json.loads(dataset_file_byte)
                question_units = dataset_json_format['questions']
                for question_unit in question_units:
                    prompt_list.append(question_unit['prompt'])
                with open(potential_cache_file_name, 'wb') as f:
                    pkl.dump( prompt_list , f )
            datasets_prompts_list.append( prompt_list[ starting_index : ending_index ] )
        self.datasets_prompts_array = np.array(datasets_prompts_list , dtype='<U10800' ) #shape (number_of_datasset , number_of_prompts)
        self.all_prompts_in_1D = self.datasets_prompts_array.flatten('F').tolist()
        
        self.number_of_components = self.datasets_prompts_array.shape[0]
        self.number_of_prompts = self.datasets_prompts_array.shape[1]
        self.vocab_size = self.tokenizer.vocab_size
        self.last_inferences = torch.zeros( (self.number_of_components , self.number_of_prompts , self.vocab_size ) ) # shape(number_of_components , number_of_prompts , vocab_size)
        return len(self.all_prompts_in_1D)

    def printPrompt(self , component_index , prompt_index ):
        print(self.datasets_prompts_array[component_index,prompt_index])
        
    def ensemble(self, output_array):
        #function for performing ensemble
        #input:
            #output_array: torch tensor representing the probability of each word in the matrix shape (number_of_components , number_of_ready_indexes_for_ensemble , vocab_size)
        #return:
            #numpy array containing the new probabilities per word for each prompt. shape( number_of_ready_indexes_for_ensemble , vocab_size )
        print('ensemble, output array shape: ' , output_array.size() )
        # return torch.mean(output_array , dim=0 )
        return output_array[-1,:,:]

    
        
    # import torch.nn.functional as F
    def output_handeling(self, batch_index , batch_output):
        # This function handels the output of each batch. It performs post process and keeps the data in self.last_inferences so that it updates self.datasets_prompts_array. In the end of each step of inference on the data: 
        # input:
        #     batch_index: index of the first element of the batch in the self.all_prompts_in_1D
        #     batch_output: the output of operation_on_a_batch returns. a torch tensor shape ( batch_size , vocab_size )
        # return:
        #     nothing (The function will affect self.last_inferences)
        batch_size = batch_output.size(dim=0)
        
        #Determining the location of the first element and the last element of the batch_output in the self.last_inference matrix
        first_elem_fis_dim_index , first_elem_sec_dim_index = self.convert_flatten_index_to_2D_index(batch_index , self.datasets_prompts_array.shape )
        last_elem_fis_dim_index , last_elem_sec_dim_index = self.convert_flatten_index_to_2D_index(batch_index+batch_size-1 , self.datasets_prompts_array.shape )
        
        #Determining how much padding should be done to the begging and end of the batch_output so that we can use vectorization to update self.last_inference
        padding_to_first = first_elem_fis_dim_index
        padding_to_last = self.number_of_components - 1 - last_elem_fis_dim_index

        #Performing padding on the batch_output and the creating the masking matrix to update self.last_inferences
        padded_batch_output = F.pad( input=batch_output, pad=(0, 0, padding_to_first , padding_to_last), mode='constant', value=0) #F.pad(input=source, pad=(0, 1, 1, 1), mode='constant', value=0)
        masking_matrix = torch.zeros( ( batch_output.size() ) )
        padded_masking_matrix = F.pad( input=masking_matrix, pad=(0, 0, padding_to_first , padding_to_last), mode='constant', value=1) #F.pad(input=source, pad=(0, 1, 1, 1), mode='constant', value=0)

        #reshaping and transposing the batch_output and the masking matrix so that they can be applied on self.last_inferences
        ready_output_matrix = torch.transpose( torch.reshape( padded_batch_output , (-1, self.number_of_components , self.vocab_size ) ) , dim0=0 , dim1=1).to('cpu')
        ready_masking_matrix = torch.transpose( torch.reshape( padded_masking_matrix , (-1, self.number_of_components , self.vocab_size ) )  , dim0=0 , dim1=1)
        
        #Updating self.last_inferences
        self.last_inferences[: , first_elem_sec_dim_index:last_elem_sec_dim_index+1 , :] = self.last_inferences[: , first_elem_sec_dim_index:last_elem_sec_dim_index+1 , :] * ready_masking_matrix
        self.last_inferences[: , first_elem_sec_dim_index:last_elem_sec_dim_index+1 , :] = self.last_inferences[: , first_elem_sec_dim_index:last_elem_sec_dim_index+1 , :] + ready_output_matrix

        #Determining which indexes in self.last_inferences(dim=1) the are ready for ensemble:
        start_of_ready_to_ensemble = first_elem_sec_dim_index
        if last_elem_fis_dim_index == self.number_of_components - 1:
            end_of_ready_to_ensemble = last_elem_sec_dim_index
        else:
            end_of_ready_to_ensemble = last_elem_sec_dim_index - 1
        
        #Now self.last_inferences[: , start_of_ready_to_ensemble:end_of_ready_to_ensemble+1 , :] will be passed so that the ensemble is performed on them
        ensembling_output = self.ensemble( self.last_inferences[: , start_of_ready_to_ensemble:end_of_ready_to_ensemble+1 , :] )
        new_word_vector = self.get_the_top_word_from_probability_vector( ensembling_output ) # Getting the top word from the ensemble output
        print( self.get_the_top_word_from_probability_vector( ensembling_output ,top_k=5 ))
        space_vector = np.array( [' '] )
        print('new_word_vector:\n' , new_word_vector)
        new_word_vector = np.char.add( space_vector , new_word_vector )
        #Making changes to the self.datasets_prompts_array to update the prompts we have
        self.new_prompts_array = np.char.add(self.datasets_prompts_array[:,start_of_ready_to_ensemble:end_of_ready_to_ensemble+1], new_word_vector )
        self.datasets_prompts_array[:,start_of_ready_to_ensemble:end_of_ready_to_ensemble+1] = self.new_prompts_array
            

    #from threading import Thread
    def pipeline_process(self, batch_size ):
        # works on self.all_prompts_in_1D as data. devides the data into batch_sizes and performs the operation on a batch. It handels the output of each operation on a batch
        #inputs:
            #data: list of prompts
            #batch_size: integer indicating the size of a batch.
            #operation_on_a_batch: It is a function that is applied on each batch.
            #output_handling: a function responsible for handling the output of each operation on a batch.
        # Thread_list = []
        number_of_iterations = 1
        for i in range(number_of_iterations):
            data = self.all_prompts_in_1D
            data_size = len(data)
            number_of_batches = int(data_size/batch_size)
            print('iteration_number: ' , i)
            for index in range( number_of_batches ):
                print('batch_number: ' , index)
                batch_index = index*batch_size
                if ( batch_index + 2*batch_size > data_size ):
                    batch = data[ batch_index : ]
                else:
                    batch = data[ batch_index : batch_index + batch_size ]
                # if Thread_list:
                #     Thread_list.pop(0).join()
                batch_output = self.get_next_word_probabilities( batch )
                self.output_handeling(batch_index , batch_output)
                # output_handeling_thread = Thread(target=self.output_handeling , args=(batch_index , batch_output) )
                # Thread_list.append(output_handeling_thread)
                # output_handeling_thread.start()
            self.all_prompts_in_1D = self.datasets_prompts_array.flatten('F').tolist()
            
        # for i in Thread_list:
        #     i.join()


# sentence = "I enjoy walking in the"
# model = LMHeadModel("huggyllama/llama-7b")
# model.get_next_word_probabilities(sentence, top_k=500)

In [1]:
import torch
from transformers import AutoModelForCausalLM , AutoTokenizer

# device = 'cpu'
device = 'cuda:2'
model_name = 'huggyllama/llama-7b'
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.45s/it]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [36]:
# sentences = ["I enjoy walking in the" , 'What is the whole' , 'Jacke is the worst' , 'Who wants to be a']
# sentence = "I enjoy walking in the dark, and"
# get_next_word_probabilities(sentences, top_k=5)
directory = './DAIL-SQL/dataset/process/'
dataset1 = 'BIRD-TEST_SQL_0-SHOT_CTX-200_ANS-2048'
dataset2 = 'BIRD-TEST_SQL_1-SHOT_EUCDISMASKPRESKLSIMTHR_QA-EXAMPLE_CTX-200_ANS-2048'
dataset3 = 'BIRD-TEST_SQL_3-SHOT_EUCDISMASKPRESKLSIMTHR_QA-EXAMPLE_CTX-200_ANS-2048'
dataset_list = [dataset1 , dataset2 , dataset3]
batch_size = 3 #batch_size should be bigger that the number of componenets in this setting.

word_ensemble = LLM_Word_Level_Ensemble(model, device , tokenizer)
word_ensemble.load_prompts_from_datasets(directory , dataset_list , starting_index=0, ending_index=1)
word_ensemble.pipeline_process( batch_size )

iteration_number:  0
batch_number:  0
inputs:  tensor([[    2,     2,     2,  ...,    13,  6404, 29871],
        [    2,     2,     2,  ...,    13,  6404, 29871],
        [    1,  4949,  3834,  ...,    13,  6404, 29871]], device='cuda:2')
type(inputs):  <class 'torch.Tensor'>
inputs.size():  torch.Size([3, 1511])
ensemble, output array shape:  torch.Size([3, 1, 32000])
topk_candidates_indexes:  tensor([[323]])
topk_candidates_indexes:  tensor([[  323, 18134, 29896,  7307,   421]])
[['T' 'MAX1' 'CD' '`']]
new_word_vector:
 ['T']


In [33]:
word_ensemble.pipeline_process( batch_size )

iteration_number:  0
batch_number:  0
ensemble, output array shape:  torch.Size([3, 1, 32000])
topk_candidates_indexes:  tensor([[323]])
topk_candidates_indexes:  tensor([[  323, 29871,    13,   313,  3895]])
[['T' '(' 'FROM']]
new_word_vector:
 ['T']


In [34]:
word_ensemble.printPrompt(2 , 0)

/* Some SQL examples are provided based on similar problems: */
/* Answer the following: What is the highest infant mortality rate per thousand of the countries whose inflation is under 3? */
SELECT MAX(T2.Infant_Mortality) FROM economy AS T1 INNER JOIN population AS T2 ON T1.Country = T2.Country WHERE T1.Inflation < 3

/* Answer the following: List at least 5 students who has the longest absense from schoool? longest absense refers to MAX(month) */
SELECT name FROM longest_absense_from_school ORDER BY month DESC LIMIT 5

/* Answer the following: What is the number of unemployed and bankrupt students? */
SELECT COUNT(T1.name) FROM unemployed AS T1 INNER JOIN filed_for_bankrupcy AS T2 ON T1.name = T2.name

/* Given the following database schema: */
CREATE TABLE frpm
(
    CDSCode                                       TEXT not null
        primary key,
    `Academic Year`                               TEXT  null,
    `County Code`                                 TEXT  null,
    `District

In [6]:
word_ensemble.all_prompts_in_1D[0]

'/* Given the following database schema: */\nCREATE TABLE frpm\n(\n    CDSCode                                       TEXT not null\n        primary key,\n    `Academic Year`                               TEXT  null,\n    `County Code`                                 TEXT  null,\n    `District Code`                               INTEGER         null,\n    `School Code`                                 TEXT  null,\n    `County Name`                                 TEXT null,\n    `District Name`                               TEXT null,\n    `School Name`                                 TEXT null,\n    `District Type`                               TEXT null,\n    `School Type`                                 TEXT null,\n    `Educational Option Type`                     TEXT null,\n    `NSLP Provision Status`                       TEXT null,\n    `Charter School (Y/N)`                        INTEGER    null,\n    `Charter School Number`                       TEXT  null,\n    `Charter Fundin