In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
import os
import csv
import shutil
from xml.etree import ElementTree as ET
from sentence_transformers import SentenceTransformer
import subprocess
import numpy as np
import faiss
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the knowledge DB
KNOWLEDGE_DB_ALL_TYPE = '../knowledge_db_all_type.json'
KNOWLEDGE_DB_SELECTED_TYPE = '../knowledge_db_selected_type.json'

# Load the knowledge DB
with open(KNOWLEDGE_DB_ALL_TYPE, 'r') as file:
    data_all_type = json.load(file)

with open(KNOWLEDGE_DB_SELECTED_TYPE, 'r') as file:
    data_selected_type = json.load(file)

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

# Load the model and tokenizer from the specified pre-trained model
model_name = "ise-uiuc/Magicoder-S-DS-6.7B"
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_cache=False,
    device_map="auto",
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 6/6 [00:30<00:00,  5.05s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Initialize embedding model and create FAISS index
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Function to retrieve relevant APIs based on the query
def get_retrieved_apis(jar_name, query, k=50):
    api_names = [fqn.split('.')[-1] for fqn in data_selected_type.get(jar_name, {}).get('fqns', [])]
    embeddings = embedding_model.encode(api_names)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings).astype('float32'))

    query_embedding = embedding_model.encode([query])
    if len(api_names) < 50:
        k = len(api_names)
    distances, indices = index.search(query_embedding, k)
    retrieved_apis = []
    seen_apis = set()

    for idx in indices.flatten():
        api_name = api_names[idx]
        normalized_api_name = api_name.replace('_', '').lower()  # Normalize the API names

        if normalized_api_name not in seen_apis:
            retrieved_apis.append(api_name)
            seen_apis.add(normalized_api_name)
            if len(retrieved_apis) == k:
                break

    return retrieved_apis

In [6]:
def generate_api_and_average_confidence(prompt):
    
    MAGICODER_PROMPT = f"""You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
{prompt}.

@@ Response
"""
    # Encode the prompt to tensor and move it to the device
    input_ids = tokenizer.encode(MAGICODER_PROMPT, return_tensors='pt')

    if torch.cuda.is_available():
        torch.cuda.synchronize()
        input_ids = input_ids.to('cuda')
    
    # Set the model to evaluation mode
    model.eval()
    with torch.no_grad():  # Disable gradient computation for inference
        # Generate response tokens from the model
        generated_ids = model.generate(
            input_ids,
            max_length=1024,
            num_return_sequences=1
        )
        # Decode the generated tokens to a string
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        
        # Perform a forward pass to get logits for the generated sequence
        outputs = model(generated_ids)
        logits = outputs.logits
        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)
        # Extract the probabilities for the actual generated tokens
        generated_probs = probs[:, :-1, :].gather(2, generated_ids[:, 1:].unsqueeze(-1)).squeeze(-1)
        # Calculate the average probability (confidence) across all generated tokens
        average_confidence = generated_probs.mean().item()
    
    # Clear unused memory
    torch.cuda.empty_cache()
    # Return the generated text and its average confidence score
    return generated_text, average_confidence

In [7]:
def generate_code_fqn_and_average_confidence(prompt):
    MAGICODER_PROMPT = f"""You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
{prompt}

@@ Response
"""
    input_ids = tokenizer.encode(MAGICODER_PROMPT, return_tensors='pt')

    if torch.cuda.is_available():
        torch.cuda.synchronize()
        input_ids = input_ids.to('cuda')
    
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            max_length=2048,
            num_return_sequences=5,
            temperature=1.0,  # Using a higher temperature for more diversity
            top_k=50,  # Using top-k sampling
            top_p=0.95,  # Using top-p sampling
            do_sample=True,
            diversity_penalty=0.0,  # No diversity penalty needed in sampling mode
        )
        
        responses = []
        confidences = []

        for i in range(generated_ids.shape[0]):
            generated_text = tokenizer.decode(generated_ids[i], skip_special_tokens=True)
            outputs = model(generated_ids[i:i+1])
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)
            selected_indices = generated_ids[i:i+1, 1:].unsqueeze(-1)
            generated_probs = probs.gather(2, selected_indices).squeeze(-1)
            average_confidence = generated_probs.mean().item()

            responses.append(generated_text)
            confidences.append(average_confidence)

    torch.cuda.empty_cache()

    return responses, confidences

In [8]:
def extract_unique_items(full_response_for_api):
    # This pattern looks for class names after "@@ Response" section, allowing for the class name to be optionally enclosed in backticks or quotes.
    item_pattern = r'@@ Response\n([\s\S]*)$'
    class_pattern = r'\d+\.\s*[`"]?([^\n`]+)[`"]?'

    # Finding the content after "@@ Response" section
    response_section = re.search(item_pattern, full_response_for_api, re.MULTILINE)
    if response_section:
        response_content = response_section.group(1)

        # Finding all class names in the response content
        found_items = re.findall(class_pattern, response_content)

        # Removing duplicates while preserving order
        unique_items = list(dict.fromkeys(found_items))

        # Filtering out items that resemble sentences or are just numbers
        filtered_items = [item for item in unique_items if ' ' not in item and not re.fullmatch(r'[`"\']*[\d.]+[`"\']*', item)]

        return filtered_items
    else:
        return []

In [9]:
def extract_java_code_and_fqns(full_response_for_code_and_fqn):
    # Adjusted pattern to extract Java code block, considering it might not end with ```
    java_code_pattern = r'```java\n(.*?)(?:\n```|$)'
    # Pattern to find import statements
    import_pattern = r'import\s+(.*?);'

    # Extract the Java code block
    java_code_match = re.search(java_code_pattern, full_response_for_code_and_fqn, re.DOTALL)
    java_code = java_code_match.group(1).strip() if java_code_match else ""

    # Find all import statements within the Java code block
    import_statements = re.findall(import_pattern, java_code)

    # Extract import statements without the word "import"
    imports_without_keyword = [stmt for stmt in import_statements]

    # Return the Java code and import statements without the word "import"
    return java_code, imports_without_keyword

In [10]:
def use_wrapper_and_add_dependency(source_path, destination_dir, new_dependency):
    # Adjust the destination path to include the Wrapper directory
    destination_wrapper_path = os.path.join(destination_dir, 'Wrapper')
    
    # Copy the Wrapper folder to the new location inside the destination directory
    shutil.copytree(source_path, destination_wrapper_path, dirs_exist_ok=True)
    
    # Path to the pom.xml file inside the newly copied Wrapper folder
    pom_path = os.path.join(destination_wrapper_path, 'pom.xml')
    
    # Parse the pom.xml file
    tree = ET.parse(pom_path)
    root = tree.getroot()
    
    # Define the namespace to find elements properly
    ns = {'maven': 'http://maven.apache.org/POM/4.0.0'}
    ET.register_namespace('', ns['maven'])  # Register namespace for correct output
    
    # Find the dependencies element
    dependencies = root.find('maven:dependencies', ns)
    if dependencies is None:
        dependencies = ET.SubElement(root, 'dependencies')
    
    # Parse the new dependency XML string and add it
    new_dependency_element = ET.fromstring(new_dependency)
    dependencies.append(new_dependency_element)
    
    xmlstr = ET.tostring(root, encoding='unicode')
    
    # Ensure the closing </dependencies> tag is on a new line
    xmlstr = xmlstr.replace('</dependencies>', '\n</dependencies>')
    
    # Write the modified XML back to the file, with proper formatting
    with open(pom_path, 'w', encoding='utf-8') as file:
        file.write(xmlstr)

In [11]:
def read_package_name_from_main(main_file_path):
    with open(main_file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    package_match = re.search(r'^package\s+([\w\.]+);', content, re.MULTILINE)
    if package_match:
        return package_match.group(0)  # Returns the full package declaration
    return ""

def extract_imports_and_classes(code):
    import_pattern = r'(import\s+.+?;)'
    class_pattern = r'(\bpublic\s+)?\bclass\s+(\w+).*?\{([\s\S]*?)\n\}'

    imports = re.findall(import_pattern, code)
    classes = []

    for match in re.finditer(class_pattern, code, re.MULTILINE | re.DOTALL):
        full_declaration = match.group(0)
        contains_main = re.search(r'\bpublic\s+static\s+void\s+main\s*\(\s*String\s*\[\s*\]\s*args\s*\)', match.group(3))
        classes.append((match.group(2), full_declaration, bool(contains_main)))

    return imports, classes

def save_classes_with_imports(main_file_path, package_declaration, imports, classes):
    main_java_path = main_file_path
    base_dir = os.path.dirname(main_java_path)
    import_section = '\n'.join(imports) + '\n\n' if imports else ''
    
    for class_name, class_code, contains_main in classes:
        final_code = f"{import_section}{class_code}"
        if contains_main:
            # Append to Main.java without adding the package declaration
            with open(main_java_path, 'a', encoding='utf-8') as file:
                file.write('\n' + final_code)
        else:
            # For other classes, prepend package declaration (if it exists) and create new .java files
            final_code = f"{package_declaration}\n\n{final_code}" if package_declaration else final_code
            class_file_path = os.path.join(base_dir, f"{class_name}.java")
            with open(class_file_path, 'w', encoding='utf-8') as file:
                file.write(final_code)

In [12]:
def strip_ansi_escape_sequences(text):
    """
    Removes ANSI escape sequences from the given text.
    """
    ansi_escape_pattern = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    return ansi_escape_pattern.sub('', text)

def process_errors(output):
    error_pattern = re.compile(r'\[ERROR\] (.*?\.java):\[(\d+),(\d+)\] (.+)')
    
    matches = error_pattern.findall(output)
    
    errors = [{
        'file': match[0],
        'line': int(match[1]),
        'column': int(match[2]),
        'message': match[3].strip()
    } for match in matches]

    # Remove duplicates by converting the list of dictionaries to a set of tuples
    unique_errors = set(tuple(error.items()) for error in errors)

    # Convert back to a list of dictionaries
    error_list = [dict(tup) for tup in unique_errors]
    
    return error_list

def run_maven_build(command, working_directory):
    # Run the Maven command and capture combined output
    result = subprocess.run(command, cwd=working_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, shell=True)
    full_output = strip_ansi_escape_sequences(result.stdout)
    
    # Check if the build was successful
    if "BUILD SUCCESS" in full_output:
        hallucinations_for_fqns = False
        return {"status": "success", "message": "Build completed successfully.", "hallucinations_for_FQNs": hallucinations_for_fqns}
    
    # If not successful, attempt to parse errors
    error_list = process_errors(full_output)
    
    # Return structured errors if any are found
    if error_list:
        return {"status": "failure", "hallucinations_for_FQNs": "Need manual review", "message": "Build failed. Full error log present.", "full_error_log": full_output, "error_list": error_list}
    else:
        # Handle case where no specific errors are parsed
        return {"status": "failure", "hallucinations_for_FQNs": "Need manual review", "message": "Build failed with unspecified errors.", "error_log": full_output}

In [14]:
# Helper function to extract the base name of a jar, removing version and extension
def get_jar_base_name(jar_name):
    # Typically, versions in jar names are delimited by '-' and end with '.jar'
    # This splits on '-' and excludes the last part assuming it's the version
    parts = jar_name.split('-')
    if len(parts) > 1:
        return '-'.join(parts[:-1])
    return jar_name.rsplit('.', 1)[0]

In [15]:
# Remove the existing directories and create fresh directories
benchmarks_dir = 'Benchmarks'
results_dir = 'Results'
project_dir = 'Projects'

# Create a list of jars that have non-empty 'fqns' and non-null 'meta_data'
eligible_jars = {jar: get_jar_base_name(jar) for jar, info in data_selected_type.items() if info['fqns'] and info['meta_data']}

for directory in [benchmarks_dir, results_dir, project_dir]:
    try:
        shutil.rmtree(directory)
    except FileNotFoundError:
        pass
    os.makedirs(directory)

# Initialize a list to keep track of all results for CSV generation
all_results = []

all_type_fqns_from_jars = []
selected_type_fqns_from_jars = []
    
for key in data_all_type:
    all_type_fqns_from_jars.extend(data_all_type[key]["fqns"])

for key in data_selected_type:
    selected_type_fqns_from_jars.extend(data_selected_type[key]["fqns"])

# Iterate through each jar item in the JSON that has non-empty FQNs and metadata
for jar, info in data_selected_type.items():
    if info["fqns"] and info["meta_data"]:
        jar_name_without_extension = jar.rsplit('.', 1)[0]

        # Get the base name of the current jar
        current_base_name = get_jar_base_name(jar)
        # Create a list of other jars excluding any versions of the current base name
        other_jars = [j for j, base_name in eligible_jars.items() if base_name != current_base_name]
        # Randomly select two different jars from the remaining eligible jars, if possible
        selected_jars = []
        if len(other_jars) >= 2:
            selected_jars = random.sample(other_jars, 2)
        else:
            selected_jars = other_jars  # In case there are less than that the value

        # Prepare the prompt by including the instruction within a predefined context
        context = ""
        for item in selected_jars:
            item_without_extension = item.rsplit('.', 1)[0]
            query = f"Some valid class names for {item_without_extension}."
            retrieved_apis = get_retrieved_apis(item, query)
            formatted_retrieved_apis = "\n".join(f"{index + 1}. {api}" for index, api in enumerate(retrieved_apis))
            context = context + f"List correct API's class names from {item_without_extension} project. Not more than 50 and classes are must be non-abstract. If you do not know the answer then reply with a No.\n\n{formatted_retrieved_apis}\n\n"
        
        prompt_for_api = f"{context}List correct API's class names from {jar_name_without_extension} project. Not more than 50 and classes are must be non-abstract. If you do not know the answer then reply with a No."
        response_for_api, confidence_for_api = generate_api_and_average_confidence(prompt=prompt_for_api)
        full_response_for_api = response_for_api
        extracted_response_for_api = extract_unique_items(full_response_for_api)

        fqns = info["fqns"]
        apis = [fqn.split('.')[-1] for fqn in fqns]

        correct_predictions_for_api = [api for api in extracted_response_for_api if api in apis]
        wrong_predictions_for_api = [api for api in extracted_response_for_api if api not in apis]

        # Construct the JSON data structure
        jar_result = {
            "jar_name": jar,
            "API_prediction": {
                "prompt": prompt_for_api,
                "full_response": full_response_for_api,
                "extracted_APIs_from_prediction": extracted_response_for_api,
                "correct_API_predictions": correct_predictions_for_api,
                "wrong_API_predictions": wrong_predictions_for_api,
                "number_of_correct_APIs": len(correct_predictions_for_api),
                "number_of_wrong_APIs": len(wrong_predictions_for_api)
            }
        }

        not_applicable_value = "N/A"

        # Assigning default values to "N/A"
        prompt_for_code_and_fqn = not_applicable_value
        full_response_for_code_and_fqn = not_applicable_value
        failure_rate_for_apis = not_applicable_value
        average_confidence_for_api = not_applicable_value
        average_confidence_for_code_and_fqn = not_applicable_value
        predicted_java_code = not_applicable_value
        extracted_fqns = not_applicable_value
        correct_predictions_for_fqn = not_applicable_value
        count_correct_prediction_fqn = not_applicable_value
        wrong_predictions_for_fqn = not_applicable_value
        count_wrong_prediction_fqn = not_applicable_value
        failure_rate_for_fqns = not_applicable_value
        hallucinations_for_apis = not_applicable_value

        api = not_applicable_value
        sample = not_applicable_value

        project_path = not_applicable_value

        build_status = not_applicable_value
        build_message = not_applicable_value
        hallucinations_for_fqns = not_applicable_value


        if len(correct_predictions_for_api) == 0:
            if len(wrong_predictions_for_api) == 0:
                failure_rate_for_apis = 1
            else:
                failure_rate_for_apis = len(wrong_predictions_for_api) / (len(correct_predictions_for_api) + len(wrong_predictions_for_api))
            
            average_confidence_for_api = confidence_for_api
            jar_result["API_prediction"]["model_confidence"] = average_confidence_for_api
            jar_result["API_prediction"]["failure_rate_for_APIs"] = failure_rate_for_apis
            
            hallucinations_for_apis = True
            jar_result["API_prediction"]["hallucinations_for_APIs"] = hallucinations_for_apis
            jar_result["Code_and_FQN_predictions"] = "The model could not give any valid API."

            # Add the result to the all_results list for later CSV generation
            all_results.append({
                "JAR": jar,
                "Number of Correct APIs": len(correct_predictions_for_api),
                "Number of Wrong APIs": len(wrong_predictions_for_api),
                "Failure Rate for APIs": failure_rate_for_apis,
                "Model Confidence for APIs": average_confidence_for_api,
                "Hallucinations for APIs": hallucinations_for_apis,
                "API": api,
                "Sample": sample,
                "Number of Correct FQNs": count_correct_prediction_fqn,
                "Number of Wrong FQNs": count_wrong_prediction_fqn,
                "Failure Rate for FQNs": failure_rate_for_fqns,
                "Model Confidence for Code and FQNs": average_confidence_for_code_and_fqn,
                "Project Path": project_path,
                "Build Status": build_status,
                "Build Message": build_message,
                "Hallucinations for FQNs": hallucinations_for_fqns,
            })
        else:
            if len(wrong_predictions_for_api) == 0:
                hallucinations_for_apis = False
            else:
                hallucinations_for_apis = "Partially Present. Need manual review."
            
            failure_rate_for_apis = len(wrong_predictions_for_api) / (len(correct_predictions_for_api) + len(wrong_predictions_for_api))
            average_confidence_for_api = confidence_for_api
            jar_result["API_prediction"]["failure_rate_for_APIs"] = failure_rate_for_apis
            jar_result["API_prediction"]["model_confidence"] = average_confidence_for_api
            jar_result["API_prediction"]["hallucinations_for_APIs"] = hallucinations_for_apis
            
            if "Code_and_FQN_predictions" not in jar_result:
                jar_result["Code_and_FQN_predictions"] = {}
            
            for api_name in correct_predictions_for_api:
                api = api_name

                prompt_for_code_and_fqn = f"Give me a complete, correct and compilable Java code. This code must use \"{api_name}\" API from \"{jar_name_without_extension}\" project along with other necessary APIs. The dependency is already added for \"{jar_name_without_extension}\" project. So, if you need to use other APIs, then do not use APIs whose import statements require adding new dependencies. The name of the class that has the main method should be Main."
                predicted_java_code_all_samples, extracted_fqns_all_samples = generate_code_fqn_and_average_confidence(prompt=prompt_for_code_and_fqn)
                
                if api_name not in jar_result["Code_and_FQN_predictions"]:
                    jar_result["Code_and_FQN_predictions"][api_name] = {
                        "prompt": prompt_for_code_and_fqn
                    }

                for number_of_sample, (code_and_fqn, confidence_for_code_and_fqn) in enumerate(zip(predicted_java_code_all_samples, extracted_fqns_all_samples)):
                    full_response_for_code_and_fqn = code_and_fqn
                    sample = number_of_sample+1

                    predicted_java_code, extracted_fqns = extract_java_code_and_fqns(full_response_for_code_and_fqn)
                    
                    correct_predictions_for_fqn = [fqn for fqn in extracted_fqns if fqn in all_type_fqns_from_jars]
                    count_correct_prediction_fqn = len(correct_predictions_for_fqn)

                    wrong_predictions_for_fqn = [fqn for fqn in extracted_fqns if fqn not in all_type_fqns_from_jars]
                    count_wrong_prediction_fqn = len(wrong_predictions_for_fqn)

                    average_confidence_for_code_and_fqn = confidence_for_code_and_fqn

                    if count_correct_prediction_fqn == 0 and count_wrong_prediction_fqn == 0:
                        failure_rate_for_fqns = 1
                    else:
                        failure_rate_for_fqns = count_wrong_prediction_fqn / (count_correct_prediction_fqn + count_wrong_prediction_fqn)
                        
                    jar_result["Code_and_FQN_predictions"][api_name][f"sample_{sample}"] = {
                        "full_response": full_response_for_code_and_fqn,
                        "extracted_java_code_from_prediction": predicted_java_code,
                        "extracted_FQNs_from_prediction": extracted_fqns,
                        "correct_FQN_predictions": correct_predictions_for_fqn,
                        "wrong_FQN_predictions": wrong_predictions_for_fqn,
                        "number_of_correct_FQNs": count_correct_prediction_fqn,
                        "number_of_wrong_FQNs": count_wrong_prediction_fqn,
                        "failure_rate_for_FQNs": failure_rate_for_fqns,
                        "model_confidence": average_confidence_for_code_and_fqn
                    }

                    result = {}

                    if count_correct_prediction_fqn == 0:
                        build_status = "failure"
                        build_message = "Model could not predict any correct FQNs."
                        hallucinations_for_fqns = True
                        result = {
                            "status": build_status,
                            "message": build_message,
                            "hallucinations_for_FQNs": hallucinations_for_fqns
                        }
                    else:
                        imports, classes = extract_imports_and_classes(predicted_java_code)
                        if classes == []:
                            build_status = "failure"
                            build_message = "No valid Java code found from the model's output."
                            hallucinations_for_fqns = True
                            result = {
                                "status": build_status,
                                "message": build_message,
                                "hallucinations_for_FQNs": hallucinations_for_fqns
                            }
                        else:
                            # Build project with maven command
                            jar_projects_dir = f'Projects/{jar_name_without_extension}/{api_name}/sample_{sample}'
                            os.makedirs(jar_projects_dir, exist_ok=True)

                            main_file_path = f'{jar_projects_dir}/Wrapper/src/main/java/org/example/Main.java'
                            use_wrapper_and_add_dependency(source_path='Wrapper', destination_dir=jar_projects_dir, new_dependency=info["meta_data"])
                            
                            package_name = read_package_name_from_main(main_file_path)
                            
                            save_classes_with_imports(main_file_path, package_name, imports, classes)
                            
                            project_path = main_file_path.rsplit('.', 1)[0].rsplit('/', 1)[0]
                            # Define the Maven command and the directory where to run it
                            maven_command = "mvn clean compile assembly:single"
                            working_directory = f'{jar_projects_dir}/Wrapper'

                            # Run the Maven build
                            result = run_maven_build(maven_command, working_directory)

                            build_status = result["status"]

                            if build_status == "success":
                                correct_predictions_for_fqn = extracted_fqns
                                wrong_predictions_for_fqn = []
                                count_correct_prediction_fqn = len(correct_predictions_for_fqn)
                                count_wrong_prediction_fqn = len(wrong_predictions_for_fqn)
                                failure_rate_for_fqns = count_wrong_prediction_fqn / (count_correct_prediction_fqn + count_wrong_prediction_fqn)
                                jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["correct_FQN_predictions"] = correct_predictions_for_fqn
                                jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["wrong_FQN_predictions"] = wrong_predictions_for_fqn
                                jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["number_of_correct_FQNs"] = count_correct_prediction_fqn
                                jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["number_of_wrong_FQNs"] = count_wrong_prediction_fqn
                                jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["failure_rate_for_fqns"] = failure_rate_for_fqns
                            
                            build_message = result["message"]
                            hallucinations_for_fqns = result["hallucinations_for_FQNs"]

                            jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["code_build"] = {
                                "project_path": project_path
                            }

                    # add the build_result
                    jar_result["Code_and_FQN_predictions"][f"{api_name}"][f"sample_{sample}"]["code_build"] = {
                        "build_result": result
                    }

                    # For each sample, adding the result to the all_results list for later CSV generation
                    all_results.append({
                        "JAR": jar,
                        "Number of Correct APIs": len(correct_predictions_for_api),
                        "Number of Wrong APIs": len(wrong_predictions_for_api),
                        "Model Confidence for APIs": average_confidence_for_api,
                        "Failure Rate for APIs": failure_rate_for_apis,
                        "Hallucinations for APIs": hallucinations_for_apis,
                        "API": api,
                        "Sample": sample,
                        "Number of Correct FQNs": count_correct_prediction_fqn,
                        "Number of Wrong FQNs": count_wrong_prediction_fqn,
                        "Failure Rate for FQNs": failure_rate_for_fqns,
                        "Model Confidence for Code and FQNs": average_confidence_for_code_and_fqn,
                        "Project Path": project_path,
                        "Build Status": build_status,
                        "Build Message": build_message,
                        "Hallucinations for FQNs": hallucinations_for_fqns,
                    })

        # Save the JSON file
        with open(os.path.join(benchmarks_dir, f"{jar_name_without_extension}.json"), 'w') as outfile:
            json.dump(jar_result, outfile, indent=4)


# Generate the CSV file
csv_file_name = "magicoder_summary_rag"
csv_file_path = os.path.join(results_dir, f'{csv_file_name}.csv')
with open(csv_file_path, 'w', newline='') as csvfile:
    fieldnames = [
        "JAR",
        "Number of Correct APIs",
        "Number of Wrong APIs",
        "Failure Rate for APIs",
        "Model Confidence for APIs",
        "Hallucinations for APIs",
        "API",
        "Sample",
        "Number of Correct FQNs",
        "Number of Wrong FQNs",
        "Failure Rate for FQNs",
        "Model Confidence for Code and FQNs",
        "Project Path",
        "Build Status",
        "Build Message",
        "Hallucinations for FQNs"
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for result in all_results:
        writer.writerow(result)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generati