# Libraries

In [1]:
import os
import langchain 
import langchain_community
import langchain_huggingface
import langchain_pinecone 
import pinecone
import dotenv
import openai
import textract
import pandas as pd
import numpy as np
import json
from openai import OpenAI
import subprocess

  from tqdm.autonotebook import tqdm


# Datasets

In [2]:
# BuggyCode = pd.read_pickle('../pytracebugs_dataset_v1/buggy_dataset/bugfixes_train.pickle')
# StableCode = pd.read_pickle('../pytracebugs_dataset_v1/stable_dataset/stable_code_train.pickle')
BuggyCode = []
CorrectCode = []
for i in range(1, 16):
    BuggyFile = f'./Data/Buggy/Code{i}Buggy.py'
    CorrectFile = f'./Data/Correct/Code{i}Correct.py'
    with open(BuggyFile, 'r') as f:
        BuggyCode.append(f.read())  
    with open(CorrectFile, 'r') as f:
        CorrectCode.append(f.read())
        
CodeMappings = {}

In [3]:
LLM = langchain_huggingface.HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    temperature=0.8,
    top_k=50,
    huggingfacehub_api_token=dotenv.get_key('.env', 'HUGGINGFACE_API_KEY')
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/aamil_khaan/.cache/huggingface/token
Login successful


##### Create the mappings for the Identifier LLM

In [4]:
from langchain import PromptTemplate
from langchain.schema.output_parser import StrOutputParser


MAPPING_CHAIN = (
    PromptTemplate.from_template(
        """
        I have attached the following code snippet below, please give me a one line description of what it does and a list of methods that it has.
        Do NOT include the code in your response.
        Do NOT move from the given format.
        Do NOT include any other information in your response.
        Do NOT create your own context or prompt.
        
        <Code>
        {code}
        </Code>
        
        please provide your answer in the following format:
        Description: <Description>
        Methods: <Method1>, <Method2>, <Method3>, ..."""
    )
    | LLM
    | StrOutputParser()
)

In [5]:
for i in range(15):
    CodeMappings[f'Code{i+1}.py'] = MAPPING_CHAIN.invoke({'code': BuggyCode[i]}).strip()
    
CodeMappings

{'Code1.py': 'Description: This is a class for performing mathematical operations.\n        Methods: __init__, add, subtract, multiply, divide, power',
 'Code2.py': 'Description: This class is used to track flights.\n        Methods: get\\_flight\\_numbers, get\\_flight\\_origins, get\\_flight\\_destinations, get\\_flight\\_durations, get\\_flight\\_prices, get\\_flight\\_dates, get\\_flight\\_times, get\\_flight\\_airlines, get\\_flight\\_planes, get\\_flight\\_seats, get\\_flight\\_classes, get\\_flight\\_passengers, get\\_flight\\_status, get\\_flight\\_captain, get\\_total\\_flights, get\\_average\\_price, get\\_longest\\_flight\\_duration, get\\_shortest\\_flight\\_duration, get\\_total\\_passengers, get\\_flights\\_by\\_airline, get\\_flights\\_by\\_origin, get\\_flights\\_by\\_destination, get\\_flights\\_by\\_date, get\\_flights\\_by\\_status, get\\_flights\\_by\\_class, get\\_flights\\_by\\_captain, get\\_flights\\_by\\_plane, get\\_flights\\_by\\_time',
 'Code3.py': 'Descript

#### Save codemappings in a readme file

In [6]:
import json
content = json.dumps(CodeMappings, indent=4)
readme_file = "CodeMappings.md"
with open(readme_file, "w") as file:
    file.write("```json\n")
    file.write(content)
    file.write("\n```")
print(f"Code mappings saved to {readme_file}")

Code mappings saved to CodeMappings.md


##### Load readme file

In [7]:
readme_file = "CodeMappings.md"
output_json_file = "CodeMappings.json"
with open(readme_file, "r") as file:
    lines = file.readlines()
start, end = None, None
for i, line in enumerate(lines):
    if line.strip() == "```json":
        start = i
    elif line.strip() == "```" and start is not None:
        end = i
        break

CodeMappings = None

if start is not None and end is not None:
    json_content = "".join(lines[start + 1:end])
    try:
        CodeMappings = json.loads(json_content)
        with open(output_json_file, "w") as json_file:
            json.dump(CodeMappings, json_file, indent=4)
        print(f"JSON saved to {output_json_file}")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
else:
    print("Error: JSON block not found in the README.md file.")

JSON saved to CodeMappings.json


##### Giving the LLM a bug report and the mappings to retrive the code file name

In [8]:
import time
# Define the prompt
prompt = """
I have attached the following BUG REPORT below, please provide the file name that corresponds to the bug report.
Mappings include the following key: Code File Name. Value: Code Description and Methods.
Do NOT include the bug report in your response.
Do NOT move from the given format.
Do NOT include any other information in your response.
Do NOT create your own context or prompt.
Always Choose one file name do not give multiple file names.

<bug_report>
{bug_report}
</bug_report>

<mappings>
{mappings}
</mappings>

please provide your answer in the following format:
File: <File>
""".strip()

# Define the input
mappings = f"Key: Code File Name. Value: Code Description and Methods.{CodeMappings}"
bug_report = {
    1: "The numbers are not being added correctly",
    2: "The flight was delayed but it was showing on time",
    3: "The students were not correctly stored in the database",
    4: "The image was not being classified correctly",
    5: "I was not able to correctly change my password please check",
    6: "The grades were not distrubuted correctly between the students",
    7: "I returned the book but it was still showing in my account",
    8: "The sorted array that i received was not sorted",
    9: "We always ended up with the same roll on the dice",
    10: "I deposited the money but it was not showing in my account",
    11: "It had showed me correct area for the circle, but not the triangle",
    12: "It never shows the task as complete when i complete the task",
    13: "It showed me 1 as the factorial for all numbers",
    14: "The AI keeps choosing sissors when i choose rock",
    15: "The weights in the backpropagation were not being updated correctly",
}
BuggedFile = {}
TimeTaken = {}
for i in range(15):
    start_time = time.time()
    # Generate response using OpenAI API
    try:
        client = OpenAI(api_key=dotenv.get_key('.env', 'OPENAI_API_KEY'))
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful AI."},
                {"role": "user", "content": prompt.format(bug_report=bug_report[i+1], mappings=mappings)}
                ]
            )
        TimeTaken[i+1] = time.time() - start_time
        BuggedFile[i+1] = response.choices[0].message.content.split(': ')[1].strip()
    except Exception as e:
        TimeTaken[i+1] = time.time() - start_time
        BuggedFile[i+1] = "Error"
        print("Error")
        

In [10]:
print(BuggedFile)
print(TimeTaken)

{1: 'Code1.py', 2: 'Code2.py', 3: 'Code3.py', 4: 'Code4.py', 5: 'Code5.py', 6: 'Code6.py', 7: 'Code7.py', 8: 'Code8.py', 9: 'Code9.py', 10: 'Code10.py', 11: 'Code11.py', 12: 'Code12.py', 13: 'Code13.py', 14: 'Code14.py', 15: 'Code15.py'}
{1: 2.0219202041625977, 2: 1.0921621322631836, 3: 1.3203132152557373, 4: 2.1449077129364014, 5: 1.1518170833587646, 6: 1.150162935256958, 7: 1.6759402751922607, 8: 1.1586229801177979, 9: 2.984006881713867, 10: 8.506228923797607, 11: 7.5790839195251465, 12: 8.244676113128662, 13: 8.292564868927002, 14: 7.727383136749268, 15: 7.814390182495117}


In [48]:
# PINECONE_API_KEY = dotenv.get_key('.env', 'PINECONE_API_KEY')
# PINECONE = pinecone.Pinecone(api_key=PINECONE_API_KEY)
# index_name = "pytracebugs-llm-1"
# indexes = PINECONE.list_indexes()
# if index_name not in indexes:   
#     PINECONE.create_index(
#         name=index_name,
#         dimension=768,
#         metric="cosine",
#         spec=pinecone.ServerlessSpec(
#             cloud="aws",
#             region="us-east-1",
#         ),
#     )   
# INDEX = PINECONE.Index(index_name)

In [49]:
# def get_code_embeddings(code):
#     return LLM.encode(code)

# BuggyCodeEmbeddings = BuggyCode['full_file_code_before_merge']
    

#### Open the Code file and fix the bug

In [11]:
bugged_file_name = {}
content = {}
for i in range(15):
    file_path = "./Data/Buggy"
    if BuggedFile[i+1] == "Error":
        continue
    bugged_file_name[i+1] = BuggedFile[i+1].split(".")[0] + "Buggy.py"
    file = os.path.join(file_path , bugged_file_name[i+1])
    try:
        with open(file, "r") as file:
            content[i+1] = file.read()
    except FileNotFoundError:
        print(f"Error: The file '{file}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [12]:
prompt = """
I have attached a BUG REPORT below, and the content from the corresponding python file CODE. Please fix the bug and give the code.
Do NOT include the bug report in your response.
Do NOT move from the given format.
Do NOT include any other information in your response.
Do NOT create your own context or prompt.

<bug_report>
{bug_report}
</bug_report>

<code>
{code}
</code>

Only give the fixed code and nothing else. Do not include comments as well.
"""

# Define the input
fixed_code = {}
TimeTaken2 = {}
for i in range(15):
    start_time = time.time()
    if BuggedFile[i+1] == "Error":
        continue
    python_file = content[i+1]
    report = f"Bug Report: {bug_report[i+1]}"
    # Generate response using OpenAI API
    try:
        client = OpenAI(api_key=dotenv.get_key('.env', 'OPENAI_API_KEY'))
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful AI."},
                {"role": "user", "content": prompt.format(bug_report=report, code=python_file)}
                ]
            )
        TimeTaken2[i+1] = time.time() - start_time
        fixed_code[i+1] = response.choices[0].message.content
    except Exception as e:
        TimeTaken2[i+1] = time.time() - start_time
        fixed_code[i+1] = "Error"
        print("Error")
        

### Re-write the code back to the file with the original one commented

In [13]:
for i in range(15):
    if BuggedFile[i+1] == "Error":
        continue
    output_file_path = "./Data/Buggy" 
    output_file_name = BuggedFile[i+1].split(".")[0] + "Buggy.py"
    output_file_name = os.path.join(output_file_path, output_file_name)
    existing_file_path = output_file_name
    try:
        with open(existing_file_path, "r") as file:
            existing_code = file.read()

        commented_code = "\n".join([f"# {line}" for line in existing_code.splitlines()])

        write_code = fixed_code[i+1]

        with open(output_file_name, "w") as file:
            file.write(commented_code)
            file.write("\n\n")  
            file.write(write_code)  
        
        print(f"Content successfully written to '{output_file_name}'")

    except FileNotFoundError:
        print(f"Error: The file '{BuggedFile}' was not found in the specified directory.")
    except Exception as e:
        print(f"An error occurred: {e}")

Content successfully written to './Data/Buggy/Code1Buggy.py'
Content successfully written to './Data/Buggy/Code2Buggy.py'
Content successfully written to './Data/Buggy/Code3Buggy.py'
Content successfully written to './Data/Buggy/Code4Buggy.py'
Content successfully written to './Data/Buggy/Code5Buggy.py'
Content successfully written to './Data/Buggy/Code6Buggy.py'
Content successfully written to './Data/Buggy/Code7Buggy.py'
Content successfully written to './Data/Buggy/Code8Buggy.py'
Content successfully written to './Data/Buggy/Code9Buggy.py'
Content successfully written to './Data/Buggy/Code10Buggy.py'
Content successfully written to './Data/Buggy/Code11Buggy.py'
Content successfully written to './Data/Buggy/Code12Buggy.py'
Content successfully written to './Data/Buggy/Code13Buggy.py'
Content successfully written to './Data/Buggy/Code14Buggy.py'
Content successfully written to './Data/Buggy/Code15Buggy.py'


#### Run test script

In [18]:
result_for_us = {
    "TimeTaken": {},
    "Test": {}
}
for i in range(15):
    start_time = time.time()
    test_path = "./Data/Test"
    test_name = BuggedFile[i+1].split(".")[0] + "Test.py"
    test = os.path.join(test_path,test_name)
    # file_to_run = "example.py" 

    try:
        result = subprocess.run(
            ["python", test],
            capture_output=True,  
            text=True
        )
        if "true" in result.stdout.lower():
            result_for_us["Test"][i+1] = "Passed"
            result_for_us["TimeTaken"][i+1] = time.time() - start_time + TimeTaken[i+1] + TimeTaken2[i+1]
            print("Test case passed")
        elif "false" in result.stdout.lower():
            result_for_us["Test"][i+1] = "Failed"
            result_for_us["TimeTaken"][i+1] = time.time() - start_time + TimeTaken[i+1] + TimeTaken2[i+1]
            print("Test case failed")
        if result.stderr:
            print("Script Errors.")
            print(result.stderr)
        
    except FileNotFoundError:
        print(f"Error: The file '{test}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

Test case passed
Test case failed
Test case failed
Test case failed
Test case failed
Test case failed
Test case failed
Test case failed
Test case failed
Test case failed
Test case passed
Test case failed
Test case passed
Test case failed
Test case failed


### Failing test cases can mean two things:
1. The bug was not fixed
2. The bug was fixed but the test script is not correct
3. The bug was fixed in a way that the test script is not able to detect it

We aim to work more on robust debugging techniques and test scripts that can detect the bug in the code

In [19]:
Results = pd.DataFrame(result_for_us)
Results

Unnamed: 0,TimeTaken,Test
1,7.892163,Passed
2,32.940559,Failed
3,29.255905,Failed
4,15.533212,Failed
5,22.114391,Failed
6,27.52598,Failed
7,10.944022,Failed
8,10.844371,Failed
9,8.490622,Failed
10,16.286774,Failed


### Saving the bugged code back to the file

In [73]:
for i in range(15):
    if BuggedFile[i+1] == "Error":
        continue
    output_file_path = "./Data/Buggy" 
    output_file_name = BuggedFile[i+1].split(".")[0] + "Buggy.py"
    output_file_name = os.path.join(output_file_path, output_file_name)
    existing_file_path = output_file_name
    try:
        write_code = BuggyCode[i]
        print(write_code)
        
        with open(output_file_name, "w") as file:
            file.write(write_code)

    except FileNotFoundError:
        print(f"Error: The file '{BuggedFile}' was not found in the specified directory.")
    except Exception as e:
        print(f"An error occurred: {e}")

# # Buggy Code
# 
# class Math:
#     def __init__(self):
#         pass
#     def add(a, b):
#         return a - b
# 
#     def subtract(a, b):
#         return a * b
# 
#     def multiply(aa, b):
#         return a * b
# 
#     def divide(a, b):
#         return a / bc
# 
#     def power(a, b):
#         return a ** ba

class Math:
    def __init__(self):
        pass
    def add(self, a, b):
        return a + b

    def subtract(self, a, b):
        return a - b

    def multiply(self, a, b):
        return a * b

    def divide(self, a, b):
        return a / b

    def power(self, a, b):
        return a ** b
# Buggy code

class FlightTracker():
    def __init__(self, flights):
        self.flights_numbers = flights['numbers']
        self.flights_origins = flights['origins']
        self.flights_destinations = flights['destinations']
        self.flights_durations = flights['durations']
        self.flights_prices = flights['prices']
        self.flights_dates = flights['dates'