In [1]:
# Function to fill empty answers on the dataset
def fill_empty_answers(example):
    """Give an example without reference answers a default "I don't know" answer.

    Args:
        example: mapping with an 'answers' entry that itself has a 'text' list.

    Returns:
        The same `example` object; its answers' 'text' entry is replaced in
        place only when it was empty.
    """
    answers = example['answers']
    # Examples that already carry at least one answer are returned untouched
    if answers['text']:
        return example
    answers['text'] = ["I don't know"]
    return example

In [2]:
# Function to convert a duration given in seconds into a days:hours:minutes:seconds string
def time_conversion(seconds):
    """Format a duration in seconds as a "days:hours:minutes:seconds" string.

    Args:
        seconds: duration in seconds (int or float).

    Returns:
        A string "D:H:M:S" where D, H and M are integers and S is the
        remaining seconds rounded to 2 decimals (an int when `seconds` is
        an int, a float otherwise).
    """
    # Split off days, then hours, then minutes; the remainder is the seconds part
    days, remainder = divmod(seconds, 24 * 3600)
    hours, remainder = divmod(remainder, 3600)
    minutes, remainder = divmod(remainder, 60)
    days, hours, minutes = int(days), int(hours), int(minutes)

    # Round seconds if necessary
    seconds = round(remainder, 2)

    # Rounding can push the seconds part up to 60 (e.g. 59.999 -> 60.0);
    # carry that into the larger units instead of printing "60.0" seconds.
    if seconds >= 60:
        seconds -= 60
        minutes += 1
        if minutes == 60:
            minutes = 0
            hours += 1
            if hours == 24:
                hours = 0
                days += 1

    return f"{days}:{hours}:{minutes}:{seconds}"

In [4]:
# Single-turn user prompt; the two placeholders are filled in per example.
_QA_PROMPT_TEMPLATE = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and no prior knowledge, answer the Query as concisely as possible. Be brief and to the point.\n"
    "Instructions: \n"
    "- If you're unsure of your answer, simply say 'I don't know'. \n"
    "- If the answer is hardly found in the provided context information, simply say 'I don't know'. \n"
    "Query: {query_str}\n"
    "Your answer: "
)


def _left_pad_1d(tensor, target_len, pad_value):
    """Left-pad a 1-D tensor with `pad_value` until it holds `target_len` elements."""
    padding = torch.full((target_len - tensor.size(0),), pad_value, dtype=tensor.dtype, device=tensor.device)
    return torch.cat([padding, tensor], dim=0)


def get_qa_from_squad_v2_textGen(model,
                                 model_name,
                                 tokenizer,
                                 dataset,
                                 dataset_name,
                                 device,
                                 deployment_name,
                                 client,
                                 n_answers=10,
                                 max_tokens=2048,
                                 temperature=0,
                                 shuffle_dataset=False,
                                 batch_size=10,
                                ):
    """
    Generate answers from a model using a given dataset and stream the results to JSONL files.

    For every batch, the model answers are extracted from the decoded outputs,
    embedded together with the dataset answers, compared by cosine similarity
    and appended to two JSONL files under
    './streamed_results/<simplified model name>/'. Both files are renamed at the
    end so that their names carry the start and the end timestamps, and a run
    summary is recorded through write_test_experience_specifics.

    Args:
        model: the text generation model used to produce the answers.
        model_name: the name of the model being used; only the part after the
            last '/' is kept for folder names.
        tokenizer: the tokenizer associated with the model. Its pad token is set
            to the EOS token when it has none.
        dataset: the dataset containing 'context', 'question' and 'answers'.
        dataset_name: the name of the dataset being used (recorded only).
        device: the device (GPU or CPU) on which the model runs.
        deployment_name: name of the deployed model (currently not used here).
        client: Azure client credentials (currently not used here:
            generate_embeddings reads a module-level `client`).
        n_answers: the number of answers to generate.
        max_tokens: the maximum number of new tokens generated per answer (default 2048).
        temperature: the sampling temperature (default 0). A value <= 0 selects
            greedy decoding, since sampling requires a strictly positive temperature.
        shuffle_dataset: a flag indicating whether to shuffle the dataset before processing.
        batch_size: the size of each batch for answer generation (default 10, capped at 16).

    Returns:
        A list [start_day, start_hour, end_day, end_hour] of strings
        ("%Y-%m-%d" and "%H:%M:%S") describing when the run started and ended.

    Raises:
        AssertionError: if n_answers is not a multiple of the (capped) batch size.

    Side effects:
        Sets the module-level globals start_timestamp, simplified_model_name
        and end_timestamp.
    """
    global start_timestamp, simplified_model_name, end_timestamp

    # Record start time for the entire cycle (written to 'tests_specifics.txt' at the end)
    paris_tz = pytz.timezone('Europe/Paris')
    start_timestamp = datetime.now(paris_tz)
    formatted_start_timestamp = start_timestamp.strftime("%Y-%m-%d_%H:%M:%S")

    # Record simplified version of the model name: the part after the last '/',
    # or the whole name when it contains no '/'
    simplified_model_name = model_name.rsplit('/', 1)[-1]

    # Retrieve the padding token ID
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    padding_value = tokenizer.pad_token_id

    # Record parameters values (written to 'tests_specifics.txt' at the end)
    # NOTE(review): batch_size is recorded as requested, before the cap below is applied.
    params = {
        'n_answers': n_answers,
        'max_tokens': max_tokens,
        'temperature': temperature,
        'shuffle_dataset': shuffle_dataset,
        'batch_size': batch_size
    }

    # Ensure batch_size is at most 16
    if batch_size > 16:
        print("Batch size reduced to 16 (max allowed by Azure Open AI)")
    batch_size = min(batch_size, 16)

    # Ensure that n_answers divided by batch_size has a remainder of 0
    assert n_answers % batch_size == 0, "The remainder of n_answers divided by batch_size should be 0."

    # Shuffle dataset if specified
    if shuffle_dataset:
        dataset = dataset.shuffle()

    # Resolve (and create) the output folders and files once, before generation starts
    result_folders = {}
    result_files = {}
    for folder in ['model_answers', 'cosine_similarity_model_vs_dataset_answers']:
        results_folder_path = f"./streamed_results/{simplified_model_name}/{folder}/"
        os.makedirs(results_folder_path, exist_ok=True)
        result_folders[folder] = results_folder_path
        result_files[folder] = os.path.join(results_folder_path, f"{folder}_{formatted_start_timestamp}_experience.jsonl")

    # Everything after the prompt's trailing "Your answer:" marker is the model answer
    answer_pattern = re.compile(r"Your answer:\s*(.*)", re.DOTALL)

    # Only max_new_tokens is passed: it takes precedence over max_length when both are set.
    # Sampling is enabled only for a strictly positive temperature.
    generation_kwargs = {'max_new_tokens': max_tokens}
    if temperature is not None and temperature > 0:
        generation_kwargs['do_sample'] = True
        generation_kwargs['temperature'] = float(temperature)
    else:
        generation_kwargs['do_sample'] = False

    # Record generation start time
    start_time = time.time()
    for batch_start in range(0, n_answers, batch_size):

        # Store data into a df
        batch = dataset[batch_start:batch_start + batch_size]
        df = pd.DataFrame(batch)
        n_examples = len(df)
        if n_examples == 0:
            # The dataset holds fewer examples than n_answers
            print(f"No more examples in the dataset at index {batch_start}; stopping generation.")
            break

        contexts = df['context'].tolist()
        questions = df['question'].tolist()

        # One single-turn chat per example
        prompts = [
            [{"role": "user", "content": _QA_PROMPT_TEMPLATE.format(context_str=context, query_str=question)}]
            for context, question in zip(contexts, questions)
        ]

        # Tokenize every prompt on its own (no padding yet)
        input_ids_list = []
        attention_masks_list = []
        for prompt in prompts:
            # Apply chat template
            tokens = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True)
            # NOTE(review): prepare_for_model may add special tokens on top of those
            # produced by the chat template - confirm for the tokenizer in use.
            inputs = tokenizer.prepare_for_model(tokens, padding=False, return_tensors="pt")  # Padding set to False
            inputs = inputs.to(device)
            # Flatten so that the length is always on dim 0
            input_ids_list.append(inputs['input_ids'].reshape(-1))
            attention_masks_list.append(inputs['attention_mask'].reshape(-1))

        # Left-pad the input_ids and attention_masks to the longest prompt of the batch
        max_input_length = max(tensor.size(0) for tensor in input_ids_list)
        tokenized_inputs = {
            'input_ids': torch.stack([_left_pad_1d(tensor, max_input_length, padding_value) for tensor in input_ids_list]),
            'attention_mask': torch.stack([_left_pad_1d(tensor, max_input_length, 0) for tensor in attention_masks_list])
        }

        # Record start time for batch generation
        batch_start_time = time.time()

        # Inference in no_grad context to save memory
        with torch.no_grad():
            outputs = model.generate(**tokenized_inputs, **generation_kwargs)

        # Record end time for batch generation
        batch_generation_time = time.time() - batch_start_time

        # Decode model outputs
        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

        # Extract the model answers; an output without the marker yields an empty
        # answer so that answers stay aligned with their questions
        batch_model_answer_texts = []
        for decoded_output in decoded_outputs:
            found = answer_pattern.search(decoded_output)
            batch_model_answer_texts.append(found.group(1) if found else "")

        # First reference answer of each example; same default as fill_empty_answers when there is none
        batch_dataset_answer_texts = [
            dataset_answer['text'][0] if len(dataset_answer['text']) > 0 else "I don't know"
            for dataset_answer in df['answers']
        ]

        # Obtaining embeddings for both the model answers and the dataset answers (normalized)
        normalized_batch_model_answer_texts = [normalize_text(model_answer) for model_answer in batch_model_answer_texts]
        normalized_batch_dataset_answer_texts = [normalize_text(dataset_answer) for dataset_answer in batch_dataset_answer_texts]
        batch_model_answers_embeddings = generate_embeddings(normalized_batch_model_answer_texts)
        batch_dataset_answers_embeddings = generate_embeddings(normalized_batch_dataset_answer_texts)

        # Obtaining the cosine similarity between the model and the dataset answer
        answers_cosine_similarity = [
            cosine_similarity(batch_model_answers_embeddings.data[idx].embedding, batch_dataset_answers_embeddings.data[idx].embedding)
            for idx in range(n_examples)
        ]

        # Counting the input and output tokens
        n_input_tokens = [len(tiktoken_tokenizer.encode(str(prompt))) for prompt in prompts]
        n_output_tokens = [len(tiktoken_tokenizer.encode(normalized_model_answer)) for normalized_model_answer in normalized_batch_model_answer_texts]

        # Getting the average generation time per example in the batch
        avg_generation_time = time_conversion(batch_generation_time / n_examples)

        # Append one line per example to each JSONL file
        with open(result_files['model_answers'], "a") as model_answers_file, \
                open(result_files['cosine_similarity_model_vs_dataset_answers'], "a") as cosine_similarity_file:
            for idx in range(n_examples):
                model_answer = batch_model_answer_texts[idx]
                print(model_answer)

                # Obtaining the timestamp
                formatted_timestamp = datetime.now(paris_tz).strftime("%Y-%m-%d %H:%M:%S")

                # Collecting data for a single line in the jsonl
                model_line_jsonl = {
                    "timestamp": formatted_timestamp,
                    "generation_time": avg_generation_time,
                    "n_input_token": n_input_tokens[idx],
                    "n_output_token": n_output_tokens[idx],
                    "dataset_context": contexts[idx],
                    "dataset_question": questions[idx],
                    "dataset_answer": batch_dataset_answer_texts[idx],
                    "model_answer": model_answer
                }
                # Same record plus the similarity score
                cosine_similarity_line_jsonl = dict(model_line_jsonl)
                cosine_similarity_line_jsonl["cosine_similarity_between_answers"] = float(answers_cosine_similarity[idx])

                model_answers_file.write(json.dumps(model_line_jsonl) + '\n')
                cosine_similarity_file.write(json.dumps(cosine_similarity_line_jsonl) + '\n')

        # Print batch generation time
        print(f"Batch {batch_start} - {batch_start + batch_size} generation time: {time_conversion(batch_generation_time)}")

        # Clean GPU memory
        del prompts
        del batch
        del tokenized_inputs
        del outputs
        del decoded_outputs
        torch.cuda.empty_cache()

    # Record end time
    end_time = time.time()
    generation_time = end_time - start_time
    print(f"Total generation time: {time_conversion(generation_time)}")

    # Record end time for the entire cycle (written to 'tests_specifics.txt' at the end)
    end_timestamp = datetime.now(paris_tz)
    formatted_end_timestamp = end_timestamp.strftime("%Y-%m-%d_%H:%M:%S")

    # Rename every jsonl in order to have start daytime and end daytime for every experience.
    for folder, file_path in result_files.items():
        new_file_path = os.path.join(result_folders[folder], f"{folder}_{formatted_start_timestamp}_{formatted_end_timestamp}_experience.jsonl")
        # Nothing to rename when no line was written (e.g. n_answers == 0)
        if os.path.exists(file_path):
            os.rename(file_path, new_file_path)

    # Recording specifics for the test
    write_test_experience_specifics(model_name=simplified_model_name,
                                    dataset_name=dataset_name,
                                    params=params,
                                    generation_time=generation_time,
                                    start_timestamp=start_timestamp,
                                    end_timestamp=end_timestamp)

    start_timestamp_day, start_timestamp_hour = start_timestamp.strftime("%Y-%m-%d %H:%M:%S").split(' ')
    end_timestamp_day, end_timestamp_hour = end_timestamp.strftime("%Y-%m-%d %H:%M:%S").split(' ')

    return [start_timestamp_day, start_timestamp_hour, end_timestamp_day, end_timestamp_hour]

In [5]:
# Function to normalize text by removing redundant spaces and specific punctuation patterns
def normalize_text(s, sep_token = " \n "):
    """Normalize a text before it is embedded.

    Collapses every run of whitespace (newlines included) into a single space,
    removes literal ". ," sequences and reduces ".." and ". ." to a single ".".

    Args:
        s: the text to normalize.
        sep_token: not used; kept so existing callers keep working.

    Returns:
        The normalized text, or a single space " " when nothing is left.
    """
    # Collapse all whitespace runs, which also removes every newline
    s = re.sub(r'\s+',  ' ', s).strip()
    # The dot is escaped: unescaped it matches any character and deletes the
    # letter preceding " ," (e.g. "Paris , France" became "Pari France")
    s = re.sub(r"\. ,", "", s)
    # Reduce doubled periods
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    s = s.strip()

    return s if s else " "

In [6]:
# Function to generate embeddings for a batch of texts using a specified model
def generate_embeddings(batch_texts, model="text-embedding-ada-002_RD_PROJECTS_DEV"):
    """Request embeddings for a batch of texts.

    Args:
        batch_texts: the texts to embed, passed as `input` to the embeddings endpoint.
        model: name of the embedding model/deployment to call.

    Returns:
        The response of `client.embeddings.create`, returned unchanged.
    """
    # `client` is not a parameter: it is looked up in module scope at call time.
    # The `model` argument is forwarded instead of a hard-coded name.
    embeddings = client.embeddings.create(
        input=batch_texts,
        model=model
    )
    return embeddings

In [7]:
# Function to calculate cosine similarity between two embeddings.
def cosine_similarity(a, b):
    """Return the cosine of the angle between the vectors `a` and `b`.

    Computed as the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    dot_product = np.dot(a, b)
    norms_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norms_product

In [8]:
# Function for recording the parameters and timings of a model generation test
def write_test_experience_specifics(model_name=None,
                                    dataset_name=None,
                                    params=None,
                                    generation_time=None,
                                    start_timestamp=None,
                                    end_timestamp=None,
                                    add_score=False):
    """Append the specifics of a test run to the model's 'tests_specifics.txt' file.

    Args:
        model_name: model name; also the sub-folder holding the specifics file.
        dataset_name: name of the dataset used for the test.
        params: mapping of generation parameter names to values (one line each).
        generation_time: total generation time in seconds.
        start_timestamp: datetime at which the test started.
        end_timestamp: datetime at which the test ended.
        add_score: when false, write the test description (timestamps, model,
            dataset, parameters, total time); when true, write only the model
            score line followed by a closing separator.
    """
    # Local import so that this function does not depend on the file's import cell
    import os

    specifics_path = f'../home/drossini/streamed_results/{model_name}/tests_specifics.txt'
    # Opening in append mode creates the file but not its folder
    os.makedirs(os.path.dirname(specifics_path), exist_ok=True)

    if add_score:
        with open(specifics_path, "a", encoding='utf-8') as f:
            # get_model_performance and df_final are expected in module scope
            f.write(f"Model score: {get_model_performance(df_final)}\n")
            f.write("_________________________________________________________________________________________________________________________\n")
        return

    # Format timestamps
    start_formatted_timestamp = start_timestamp.strftime("%Y-%m-%d %H:%M:%S")
    end_formatted_timestamp = end_timestamp.strftime("%Y-%m-%d %H:%M:%S")

    # Write test experience specifics to a text file
    with open(specifics_path, "a", encoding='utf-8') as f:
        # Write test details
        f.write("__________________________________________________________________________________________________________________________\n")
        f.write(f"Test on = {start_formatted_timestamp}\n")
        f.write(f"Until = {end_formatted_timestamp}\n")
        f.write(f"Model = {model_name}\n")
        f.write(f"Dataset = {dataset_name}\n")
        f.write("Parameters:\n")
        # Write test parameters (none when params is not provided)
        for param_name, param_value in (params or {}).items():
            f.write(f'\t- {param_name} = {param_value}\n')
        f.write(f"Total generation time: {time_conversion(generation_time)}\n")