In [1]:
# !pip install --upgrade openai



In [2]:
import glob
import html
import os
import re
# import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm

In [None]:
def clean_html(raw_html):
    """Strip HTML tags from ``raw_html`` and decode its character references.

    Tags are removed first and entities decoded afterwards, so escaped markup
    such as ``&lt;b&gt;`` survives as the literal text ``<b>``.

    Args:
        raw_html: HTML fragment as a string.

    Returns:
        The fragment as plain text. ``&nbsp;`` becomes an ordinary space; every
        other named or numeric character reference is decoded once by
        ``html.unescape``.
    """
    # DOTALL lets a tag whose attributes wrap onto the next line still match.
    text = re.sub(r'<.*?>', '', raw_html, flags=re.DOTALL)
    # html.unescape would turn &nbsp; into U+00A0; keep mapping it to a plain space.
    text = text.replace('&nbsp;', ' ')
    return html.unescape(text)

def read_file_content(file_path):
    """Return the entire text of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text

def extract_problem_statement(readme_content):
    """Pull the problem statement out of a README's HTML.

    The statement is taken to be the text between the first ``</h3>`` and the
    next ``<p>&nbsp;</p>`` separator; it is trimmed and passed through
    ``clean_html``.

    Args:
        readme_content: Full README text.

    Returns:
        The cleaned statement, or the string "Problem statement not found."
        when the delimiters are absent.
    """
    found = re.search(r'</h3>(.+?)<p>&nbsp;</p>', readme_content, re.DOTALL)
    if found is None:
        return "Problem statement not found."
    statement_html = found.group(1).strip()
    return clean_html(statement_html)

def find_readme_file(java_file_name, problems_directory):
    """Locate the README.md that belongs to a Java solution file.

    The expected layout is
    ``problems_directory/<index folder>/<file stem>/README.md``, where the
    index folder is ``0-9`` for stems starting with a digit and otherwise the
    upper-cased first character of the stem (the same rule
    ``find_corresponding_readme`` applies).

    Args:
        java_file_name: File name such as ``"Foo.java"`` (not a full path).
        problems_directory: Root directory holding the index folders.

    Returns:
        The README path if that file exists, otherwise ``None``. ``None`` is
        also returned when the file name has an empty stem.
    """
    folder_name = java_file_name.rsplit('.', 1)[0]
    if not folder_name:
        # Nothing to index on (e.g. "" or ".java").
        return None

    first_char = folder_name[0]
    index_folder_name = "0-9" if first_char.isdigit() else first_char.upper()
    possible_readme_path = os.path.join(
        problems_directory, index_folder_name, folder_name, 'README.md'
    )
    return possible_readme_path if os.path.isfile(possible_readme_path) else None

def find_corresponding_readme(java_file_path, problems_directory='solutions_correct/algorithms'):
    """Return the problem statement for the README matching a Java file.

    The README is looked up at
    ``problems_directory/<bucket>/<file stem>/README.md``, where the bucket is
    ``0-9`` when the file name starts with a digit and the upper-cased first
    character otherwise.

    Args:
        java_file_path: Path to the Java file; only its base name is used.
        problems_directory: Root directory containing the bucket folders.

    Returns:
        The result of ``extract_problem_statement`` on the README's contents,
        or ``None`` when no README exists at the expected location.
    """
    file_name = os.path.basename(java_file_path)
    leading = file_name[0]
    bucket = "0-9" if leading.isdigit() else leading.upper()

    # Stem = file name with its last extension removed.
    problem_folder = file_name.rsplit('.', 1)[0]
    readme_path = os.path.join(problems_directory, bucket, problem_folder, 'README.md')

    if not os.path.isfile(readme_path):
        return None
    return extract_problem_statement(read_file_content(readme_path))

In [3]:
def debug_code_with_gpt(problem_discription, code_content, API_KEY, few_shot, CoT):
    """Ask gpt-3.5-turbo to repair a possibly buggy Java source file.

    One of four prompt templates is selected by ``few_shot`` and ``CoT``, the
    Java code is appended in a fenced markdown block, and the prompt is sent as
    a single user message.

    Args:
        problem_discription: Problem statement text. Accepted for interface
            compatibility; none of the prompt templates currently include it.
        code_content: Java source code to be repaired.
        API_KEY: OpenAI API key used to build the client.
        few_shot: '0' for zero-shot, 'f' to include the five worked examples.
        CoT: 'n' for a fix-only prompt, 'c' to request step-by-step reasoning.

    Returns:
        The model's reply text, or ``None`` if the API call raised (the error
        is printed).

    Raises:
        ValueError: If ``few_shot``/``CoT`` is not one of the four supported
            combinations.
        ImportError: If the ``openai`` package is not installed.
    """
    # problem discription is already trimmed to text only (no format, insert it to where needed)
    if few_shot == '0' and CoT == 'n':
        prompt = "The provided Java code may be buggy. Fix the bug if one exists, " +\
        "using minimal changes. Do not reorganize. Do not optimize. " +\
        "Do not provide explanation or justification. Format your code in markdown.\n" +\
        "```java\n" + code_content + "\n```"
    elif few_shot == '0' and CoT == 'c':
        prompt = "The provided Java code may be buggy. Review the Java code and identify the bug " +\
        "type if one exists. Explain the reasoning process, thinking step-by-step, for identifying " +\
        "and fixing the bug. Apply the fix using minimal changes. Do not reorganize or optimize the code. " +\
        "Format your code in markdown.\n```java\n" + code_content + "\n```"
    elif few_shot == 'f' and CoT == 'n': #IMPORTANT! DO NOT CHANGE THE INDENTATION BELOW!
        prompt = "The provided Java code may be buggy. Review the examples followed by the buggy code. " +\
        "Fix the bug, using minimal changes. Do not reorganize. Do not optimize. Do not provide " +\
        "explanation or justification. Format your code in markdown.\n\
Example #1:\n\
```java\n\
class Solution {\n\
    public int findMax(int[] nums) {\n\
        int max = nums[0];\n\
        for (int i = 1; i <= nums.length; i++) {\n\
            if (nums[i] > max) {\n\
                max = nums[i];\n\
            }\n\
        }\n\
        return max;\n\
    }\n\
}\n\
```\n\n\
Example Fix #1:\n\
```java\n\
class Solution {\n\
    public int findMax(int[] nums) {\n\
        int max = nums[0];\n\
        for (int i = 1; i < nums.length; i++) {\n\
            if (nums[i] > max) {\n\
                max = nums[i];\n\
            }\n\
        }\n\
        return max;\n\
    }\n\
}\n\
```\n\n\
Example #2:\n\
```java\n\
class Solution {\n\
    public String repeatChar(char c, int times) {\n\
        String result = \"\";\n\
        for (int i = 0; i < times; i++)\n\
            result += c\n\
        return result;\n\
    }\n\
}\n\
```\n\n\
Example Fix #2:\n\
```java\n\
class Solution {\n\
    public String repeatChar(char c, int times) {\n\
        String result = \"\";\n\
        for (int i = 0; i < times; i++)\n\
            result += c;\n\
        return result;\n\
    }\n\
}\n\
```\n\n\
Example #3:\n\
```java\n\
class Solution {\n\
    public int multiply(int a, int b) {\n\
        return a + b;\n\
    }\n\
}\n\
```\n\n\
Example Fix #3:\n\
```java\n\
class Solution {\n\
    public int multiply(int a, int b) {\n\
        return a * b;\n\
    }\n\
}\n\
```\n\n\
Example #4:\n\
```java\n\
class Solution {\n\
    public boolean isEven(int num) {\n\
        return num % 2 == 1;\n\
    }\n\
}\n\
```\n\n\
Example Fix #4:\n\
```java\n\
class Solution {\n\
    public boolean isEven(int num) {\n\
        return num % 2 == 0;\n\
    }\n\
}\n\
```\n\n\
Example #5:\n\
```java\n\
class Solution {\n\
    public boolean isLeapYear(int year) {\n\
        if ((year % 4 == 0) && (year % 100 != 0) || (year % 400 != 0)) {\n\
            return true;\n\
        }\n\
        return false;\n\
    }\n\
}\n\
```\n\n\
Example Fix #5:\n\
```java\n\
class Solution {\n\
    public boolean isLeapYear(int year) {\n\
        if ((year % 4 == 0) && (year % 100 != 0) || (year % 400 == 0)) {\n\
            return true;\n\
        }\n\
        return false;\n\
    }\n\
}\n\
```\n\n\
Buggy Code:\n\
```java\n" + code_content + "\n```"
    elif few_shot == 'f' and CoT == 'c':
        prompt = "The provided Java code may be buggy. Review the examples followed " +\
        "by the provided buggy Java code, then identify the bug type. " +\
        "Explain the reasoning process for identifying and fixing the bug. "+\
        "Apply the fix using minimal changes. " +\
        "Do not reorganize or optimize the code. Format the final fixed code in markdown in its entirety.\n\
Example #1:\n\
```java\n\
class Solution {\n\
    public int findMax(int[] nums) {\n\
        int max = nums[0];\n\
        for (int i = 1; i <= nums.length; i++) {\n\
            if (nums[i] > max) {\n\
                max = nums[i];\n\
            }\n\
        }\n\
        return max;\n\
    }\n\
}\n\
```\n\n\
Example Explanation #1: \n\
The original code causes an `ArrayIndexOutOfBoundsException` due to the loop condition `i <= nums.length`, \
which attempts to access an index out of the array's bounds. In Java, array indices range from 0 to `length - 1`. \
The fix is changing the loop condition to `i < nums.length`, ensuring the loop iterates only \
within the array's valid range.\n\n\
Example Fix #1:\n\
```java\n\
class Solution {\n\
    public int findMax(int[] nums) {\n\
        int max = nums[0];\n\
        for (int i = 1; i < nums.length; i++) {\n\
            if (nums[i] > max) {\n\
                max = nums[i];\n\
            }\n\
        }\n\
        return max;\n\
    }\n\
}\n\
```\n\n\
Example #2:\n\
```java\n\
class Solution {\n\
    public String repeatChar(char c, int times) {\n\
        String result = \"\";\n\
        for (int i = 0; i < times; i++)\n\
            result += c\n\
        return result;\n\
    }\n\
}\n\
```\n\n\
Example Explanation #2:\n\
The error here is a missing semicolon (`;`) at the end of the statement inside the loop. \
Java requires each statement to end with a semicolon. Adding a semicolon at the end \
of `result += c` corrects this syntax error.\n\n\
Example Fix #2:\n\
```java\n\
class Solution {\n\
    public String repeatChar(char c, int times) {\n\
        String result = \"\";\n\
        for (int i = 0; i < times; i++)\n\
            result += c;\n\
        return result;\n\
    }\n\
}\n\
```\n\n\
Example #3:\n\
```java\n\
class Solution {\n\
    public int multiply(int a, int b) {\n\
        return a + b;\n\
    }\n\
}\n\
```\n\n\
Example Explanation #3:\n\
The method named `multiply` incorrectly performs addition (`return a + b;`), a logical error. \
The fix is to replace the addition operation with multiplication (`return a * b;`), aligning the \
operation with the method's purpose.\n\n\
Example Fix #3:\n\
```java\n\
class Solution {\n\
    public int multiply(int a, int b) {\n\
        return a * b;\n\
    }\n\
}\n\
```\n\n\
Example #4:\n\
```java\n\
class Solution {\n\
    public boolean isEven(int num) {\n\
        return num % 2 == 1;\n\
    }\n\
}\n\
```\n\n\
Example Explanation #4:\n\
The `isEven` method incorrectly checks for odd numbers (`num % 2 == 1`). \
The fix is changing the condition to `num % 2 == 0`, correctly identifying even numbers.\n\n\
Example Fix #4:\n\
```java\n\
class Solution {\n\
    public boolean isEven(int num) {\n\
        return num % 2 == 0;\n\
    }\n\
}\n\
```\n\n\
Example #5:\n\
```java\n\
class Solution {\n\
    public boolean isLeapYear(int year) {\n\
        if ((year % 4 == 0) && (year % 100 != 0) || (year % 400 != 0)) {\n\
            return true;\n\
        }\n\
        return false;\n\
    }\n\
}\n\
```\n\n\
Example Explanation #5:\n\
The `isLeapYear` method has an incorrect implementation of the leap year condition. \
The original condition erroneously includes non-leap years. \
The correct leap year condition is a year being divisible by 4 and not by 100, \
unless it's also divisible by 400. The fix adjusts the condition to \
`(year % 4 == 0) && (year % 100 != 0) || (year % 400 == 0)`, correctly identifying leap years.\n\n\
Example Fix #5:\n\
```java\n\
class Solution {\n\
    public boolean isLeapYear(int year) {\n\
        if ((year % 4 == 0) && (year % 100 != 0) || (year % 400 == 0)) {\n\
            return true;\n\
        }\n\
        return false;\n\
    }\n\
}\n\
```\n\n\
Buggy Code:\n\
```java\n" + code_content + "\n```"
    else:
        raise ValueError("few_shot should be '0' or 'f', CoT should be 'n' or 'c'.")

    # The notebook's top-level `import openai` is commented out, so without this
    # every call on a fresh kernel hit a NameError that the handler below turned
    # into a silent None. Importing outside the try makes a missing package fail
    # loudly instead of being reported as a per-file API error.
    import openai

    try:
        client = openai.OpenAI(api_key=API_KEY)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=1024  # Adjust as needed
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best effort per file: report the failure and let the caller skip it.
        print("Error during API call:", e)
        return None

def process_file(file_path, target_directory, api_key, few_shot, CoT):
    """Send one Java file to the model and save the reply under the same name.

    Args:
        file_path: Path of the Java source file to read.
        target_directory: Existing directory that receives the output file.
        api_key: OpenAI API key forwarded to ``debug_code_with_gpt``.
        few_shot: Prompt selector forwarded to ``debug_code_with_gpt``.
        CoT: Prompt selector forwarded to ``debug_code_with_gpt``.

    Nothing is written when ``debug_code_with_gpt`` returns ``None`` or an
    empty string.
    """
    # Explicit UTF-8 (matching read_file_content) so reading the Java source and
    # writing the model's reply do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Looked up and passed along; may be None when no README is found.
    problem_discription = find_corresponding_readme(file_path)

    debugged_content = debug_code_with_gpt(problem_discription, file_content, api_key, few_shot, CoT)

    if debugged_content:
        target_file_path = os.path.join(target_directory, os.path.basename(file_path))
        with open(target_file_path, 'w', encoding='utf-8') as file:
            file.write(debugged_content)

def process_files_in_parallel(source_directory, target_directory, api_key, few_shot, CoT, max_workers=30):
    """Run ``process_file`` over every ``*.java`` file in a directory using a thread pool.

    Args:
        source_directory: Directory searched (non-recursively) for ``*.java`` files.
        target_directory: Output directory; created if it does not exist.
        api_key: OpenAI API key forwarded to ``process_file``.
        few_shot: Prompt selector forwarded to ``process_file``.
        CoT: Prompt selector forwarded to ``process_file``.
        max_workers: Maximum number of worker threads.

    Any exception raised inside ``process_file`` is re-raised here when its
    future is collected.
    """
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    java_sources = glob.glob(os.path.join(source_directory, '*.java'))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for source_path in java_sources:
            job = pool.submit(process_file, source_path, target_directory, api_key, few_shot, CoT)
            pending.append(job)

        # Progress bar advances as each job finishes, in completion order.
        progress = tqdm(as_completed(pending), total=len(java_sources))
        for finished in progress:
            finished.result()  # Surfaces any exception raised by process_file

In [4]:
def main(few_shot, CoT):
    """Run one debugging pass over the buggy-code directory for a prompt setting.

    The API key is read from ``API_KEY.txt`` in the working directory, and the
    output directory name encodes the ``few_shot`` and ``CoT`` values.

    Args:
        few_shot: Prompt selector forwarded to ``process_files_in_parallel``.
        CoT: Prompt selector forwarded to ``process_files_in_parallel``.
    """
    source_dir = 'data/formatted/buggy_codes_GPT'  # Source directory
    target_dir = f'data/formatted/debugged_GPTBuggyCodes_{few_shot}_{CoT}'  # Target directory

    with open('API_KEY.txt', 'r') as key_file:
        api_key = key_file.read().strip()

    process_files_in_parallel(source_dir, target_dir, api_key, few_shot, CoT)

In [5]:
if __name__ == "__main__":
    # few_shot: '0' (zero-shot) or 'f' (few-shot)
    # CoT: 'n' (no Chain-of-Thought) or 'c' (with Chain-of-Thought)

    # RUN ALL COMBINATIONS, in the order ('0','n'), ('0','c'), ('f','n'), ('f','c')
    for few_shot in ('0', 'f'):
        for CoT in ('n', 'c'):
            main(few_shot, CoT)

100%|█████████████████████████████████████████| 199/199 [13:19<00:00,  4.02s/it]
100%|█████████████████████████████████████████| 199/199 [14:24<00:00,  4.34s/it]
100%|█████████████████████████████████████████| 199/199 [13:47<00:00,  4.16s/it]
100%|█████████████████████████████████████████| 199/199 [38:49<00:00, 11.71s/it]
