# Libraries

In [2]:
import os
import langchain 
import langchain_community
import langchain_huggingface
import langchain_pinecone 
import pinecone
import dotenv
import openai
import textract
import pandas as pd
import numpy as np

# Datasets

In [27]:
# BuggyCode = pd.read_pickle('../pytracebugs_dataset_v1/buggy_dataset/bugfixes_train.pickle')
# StableCode = pd.read_pickle('../pytracebugs_dataset_v1/stable_dataset/stable_code_train.pickle')
BuggyCode = []
CorrectCode = []
for i in range(1, 7):
    BuggyFile = f'../Data/Buggy/Code{i}.py'
    CorrectFile = f'../Data/Correct/Code{i}.py'
    with open(BuggyFile, 'r') as f:
        BuggyCode.append(f.read())  
    with open(CorrectFile, 'r') as f:
        CorrectCode.append(f.read())
        
CodeMappings = {
    'Code1.py': 'TO BE FILLED BY LLM',
    'Code2.py': 'TO BE FILLED BY LLM',
    'Code3.py': 'TO BE FILLED BY LLM',
    'Code4.py': 'TO BE FILLED BY LLM',
    'Code5.py': 'TO BE FILLED BY LLM',
    'Code6.py': 'TO BE FILLED BY LLM',
}

In [28]:
LLM = langchain_huggingface.HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    temperature=0.8,
    top_k=50,
    huggingfacehub_api_token=dotenv.get_key('.env', 'HUGGINGFACE_API_KEY')
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/aamil_khaan/.cache/huggingface/token
Login successful


##### Filling the Mappings to create the mappings for the Identifier LLM

In [36]:
from langchain import PromptTemplate
from langchain.schema.output_parser import StrOutputParser


MAPPING_CHAIN = (
    PromptTemplate.from_template(
        """
        I have attached the following code snippet below, please give me a one line description of what it does and a list of methods that it has.
        Do NOT include the code in your response.
        Do NOT move from the given format.
        Do NOT include any other information in your response.
        Do NOT create your own context or prompt.
        
        <Code>
        {code}
        </Code>
        
        please provide your answer in the following format:
        Description: <Description>
        Methods: <Method1>, <Method2>, <Method3>, ..."""
    )
    | LLM
    | StrOutputParser()
)

In [37]:
for i in range(6):
    CodeMappings[f'Code{i+1}.py'] = MAPPING_CHAIN.invoke({'code': BuggyCode[i]}).strip()
    
CodeMappings

{'Code1.py': 'Description: The code defines a class Math with 5 methods for basic arithmetic operations.\n        Methods: add, subtract, multiply, divide, power',
 'Code2.py': 'Description: This is a FlightTracker class that has methods to retrieve various flight details from a given data.\n        Methods: get_flight_numbers, get_flight_origins, get_flight_destinations, get_flight_durations, get_flight_prices, get_flight_dates, get_flight_times, get_flight_airlines, get_flight_planes, get_flight_seats, get_flight_classes, get_flight_passengers, get_flight_status, get_flight_captain, get_total_flights, get_average_price, get_longest_flight_duration, get_shortest_flight_duration, get_total_passengers, get_flights_by_airline, get_flights_by_origin, get_flights_by_destination, get_flights_by_date, get_flights_by_status, get_flights_by_class, get_flights_by_captain, get_flights_by_plane, get_flights_by_time',
 'Code3.py': 'Description: The code implements a buggy B+ tree data structure an

##### Giving the LLM a bug report and the mappings to retrive the code file name

In [65]:
IDENTIFYING_CHAIN = (
    PromptTemplate.from_template("""
    I have attached the following BUG REPORT below, please provide the the file name that corresponds to the bug report.
    Mappings include the following key: Code File Name. Value: Code Description and Methods.
    Do NOT include the bug report in your response.
    Do NOT move from the given format.
    Do NOT include any other information in your response.
    Do NOT create your own context or prompt.
    Always Choose one file name do not give multiple file names.
    
    <bug_report>
    {bug_report}
    </bug_report>
    
    <mappings>
    {mappings}
    </mappings>
    
    please provide your answer in the following format:
    File: <File>
    """)
    | LLM
    | StrOutputParser()
)

BuggedFileResponse = IDENTIFYING_CHAIN.invoke({"bug_report": "The student is not being graded correctly", "mappings": CodeMappings}).strip()
# BuggedFile = BuggedFileResponse.split(': ')[1].strip()
BuggedFileResponse

"<Your Response>\n    \n    File: Code1.py\n    \n    The student is not being graded correctly because the power method is not implemented properly. The power method should take two arguments, the base and the exponent, but it is only taking one argument, the exponent. To fix this bug, the power method should be updated to take two arguments, the base and the exponent.\n    \n    File: Code2.py\n    \n    The student is not being graded correctly because the get\\_flights\\_by\\_time method is not implemented properly. The method is supposed to return a list of flights that depart or arrive at a given time, but it is returning an empty list. To fix this bug, the get\\_flights\\_by\\_time method should be updated to correctly filter the flights based on the given time.\n    \n    File: Code3.py\n    \n    The student is not being graded correctly because the search method is not implemented properly. The method is supposed to search for a student in the B+ tree based on their ID, but i

In [None]:
from openai import OpenAI
# Define the prompt
prompt = """
I have attached the following BUG REPORT below, please provide the file name that corresponds to the bug report.
Mappings include the following key: Code File Name. Value: Code Description and Methods.
Do NOT include the bug report in your response.
Do NOT move from the given format.
Do NOT include any other information in your response.
Do NOT create your own context or prompt.
Always Choose one file name do not give multiple file names.

<bug_report>
{bug_report}
</bug_report>

<mappings>
{mappings}
</mappings>

please provide your answer in the following format:
File: <File>
""".strip()

# Define the input
bug_report = "The numbers are not being added correctly"
mappings = f"Key: Code File Name. Value: Code Description and Methods.{CodeMappings}"

# Generate response using OpenAI API
try:
    client = OpenAI(api_key=dotenv.get_key('.env', 'OPENAI_API_KEY'))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful AI."},
            {"role": "user", "content": prompt.format(bug_report=bug_report, mappings=mappings)}
            ]
        )
    BuggedFile = response.choices[0].message.content.split(': ')[1].strip()
except Exception as e:
    print("Error")

'Code1.py'

In [18]:
PINECONE_API_KEY = dotenv.get_key('.env', 'PINECONE_API_KEY')
print(PINECONE_API_KEY)
PINECONE = pinecone.Pinecone(api_key=PINECONE_API_KEY)
index_name = "pytracebugs-llm-1"
indexes = PINECONE.list_indexes()
if index_name not in indexes:   
    PINECONE.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )   
INDEX = PINECONE.Index(index_name)

742f98f8-4ab0-4a1d-9c04-5e5448b01695


In [None]:
def get_code_embeddings(code):
    return LLM.encode(code)

BuggyCodeEmbeddings = BuggyCode['full_file_code_before_merge']
    