In [None]:
import openai
import pandas as pd
import os
import re
import time

# Load environment variables
OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")
if not OPEN_AI_API_KEY:
    # Fail fast: without a key every request would fail, but only after the
    # retry/backoff loop has run, which hides the real cause.
    raise RuntimeError("OPEN_AI_API_KEY environment variable is not set")

# Set OpenAI API Key
openai.api_key = OPEN_AI_API_KEY

# Directories
input_dir = '../Data/new_1000_record_chunks/'  # Input folder with chunk files
output_dir = '../Data/new_100k_dataset/gpt/'  # Output folder to save explanations
os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

# sample_size = 1  # Sample size for each chunk

def retry_request(func, max_retries=5, delay=2):
    """
    Call ``func`` until it succeeds or ``max_retries`` attempts have failed.

    After each failed attempt the error is printed and the function sleeps
    before trying again; the wait starts at ``delay`` seconds and doubles
    every time. When the final attempt fails, a generic
    ``Exception("All retries failed")`` is raised, chained to the last error.
    Returns whatever ``func`` returns (or ``None`` if ``max_retries`` is 0,
    because ``func`` is then never called).
    """
    attempts_made = 0
    wait = delay
    while attempts_made < max_retries:
        try:
            return func()
        except Exception as err:
            attempts_made += 1
            print(f"Attempt {attempts_made} failed: {err}")
            if attempts_made >= max_retries:
                raise Exception("All retries failed") from err
            time.sleep(wait)
            wait *= 2  # Exponential backoff

def get_gpt_explanation(buggy_code, fixed_code):
    """
    Function to get a concise explanation using OpenAI's GPT model with retry logic.

    Builds a few-shot prompt (guidelines, constraints and two worked Java
    examples) around the given code pair, sends it as a single user message via
    ``openai.ChatCompletion.create`` and returns the text of the first choice
    with surrounding whitespace stripped.

    Args:
        buggy_code: The buggy snippet; interpolated verbatim into the prompt.
        fixed_code: The fixed snippet; interpolated verbatim into the prompt.

    Returns:
        The model's reply text, stripped.

    Raises:
        Exception: "All retries failed" (raised by ``retry_request``) once every
            attempt has raised.
    """
    def make_request():
        # The doubled braces in the Java examples are f-string escapes and reach
        # the model as single braces. The literal is not dedented, so the prompt
        # text keeps the leading spaces seen here.
        prompt = f"""You are a senior software engineer explaining a bug fix to a junior developer. 
        Your task is to provide a concise explanation of the changes made to fix the bug in the provided code snippets. Follow these guidelines:

        ### Guidelines:
        1. **Bug Identification**: Describe the error in the original code, including its type (e.g., logic error, runtime error) and its impact.
        2. **Problem Analysis**: Explain why the bug is problematic and under what conditions it causes issues.
        3. **Fix Explanation**: Describe the changes in the fixed code and how they address the issue.
        4. **Justification**: Justify why the fix is necessary and how it resolves the problem.
        5. **Improvement Highlight**: Summarize how the fix improves code reliability, functionality, or performance.

        ### Constraints:
        - The explanation must be **concise**, using no more than **100 words**.
        - The explanation should consist of **exactly three sentences**:
        1. Why the buggy code is incorrect.
        2. What changes were made in the fixed code and why they are correct.
        3. How the fixed code improves upon the buggy code.
        - Do not include meta-descriptions such as "This is a three-sentence explanation" or mention word counts in the response.
        - Ensure your explanation aligns with the code context and focuses solely on technical details relevant to the fix.

        ### Example Explanations:

        #### Example 1:
        **Buggy Code:**
        @Override protected void afterTests(){{
            try {{
                context.shutdown();
            }}
            catch (Exception e) {{
                throw new RuntimeException("String_Node_Str", e);
            }}
            super.afterTests();
        }}

        **Fixed Code:**
        @Override protected void afterTests(){{
            try {{
                context.shutdown();
            }}
            catch (Exception e) {{
                throw new RuntimeException("String_Node_Str", e);
            }}
        }}

        **Explanation:**
        The bug in the original code is the unconditional call to `super.afterTests()`, which executes even if `context.shutdown()` fails, risking inconsistent state. 
        The fixed code removes this call, ensuring `super.afterTests()` is not invoked when an exception occurs, preventing potential errors. 
        This fix ensures predictable cleanup behavior, improving code reliability.

        #### Example 2:
        **Buggy Code:**
        private void updateTreeView(Tree tree){{
            Iterator it = tree.getDepthFirstIterator(false);
            while (it.hasNext()) {{
                ((Tree<JsonTreeNode>) it.next()).setExpanded(true);
            }}
            editorTreeView.setModel(tree.copy());
        }}

        **Fixed Code:**
        private void updateTreeView(JsonTree tree){{
            JsonTree fixedTree = JsonTreeConverter.serialize(JsonTreeConverter.deserialize(tree));
            Iterator it = fixedTree.getDepthFirstIterator(false);
            while (it.hasNext()) {{
                ((JsonTree) it.next()).setExpanded(true);
            }}
            editorTreeView.setModel(fixedTree.copy());
        }}

        **Explanation:**
        The original code has a bug where it improperly casts a generic Tree to `Tree<JsonTreeNode>`, which can cause runtime errors if the types don’t match. 
        The fix uses a `JsonTree` with serialization and deserialization to ensure the tree structure is correct and safe to work with. 
        This makes the code more reliable and prevents runtime type errors.

        ---

        ### Your Task:

        **Buggy Code:**
        {buggy_code}

        **Fixed Code:**
        {fixed_code}

        **Why the fixed code is correct:**"""

        # Single-turn chat request; the reply is capped at 200 tokens.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that explains bug fixes clearly."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=200,
            temperature=0.7
        )
        # Only the first choice is used.
        return response['choices'][0]['message']['content'].strip()

    # retry_request re-invokes make_request (rebuilding the prompt) whenever it raises.
    return retry_request(make_request)

def natural_sort_key(file_name):
    """
    Build a sort key that orders embedded numbers by value, not by text.

    The name is split on runs of digits; digit runs become ints and the text
    between them stays as strings.
    Example: '1000_chunk_10.csv' -> ['', 1000, '_chunk_', 10, '.csv']
    """
    key = []
    for piece in re.split(r'(\d+)', file_name):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece)
    return key

# Process the chunk files 1000_chunk_1.csv ... 1000_chunk_99.csv in numeric order.
files = [f"1000_chunk_{i}.csv" for i in range(1, 100)]

for file in files:
    input_path = os.path.join(input_dir, file)
    output_file_name = file.replace(".csv", "_only_gpt.csv")
    output_path = os.path.join(output_dir, output_file_name)

    # Resume support: a chunk whose output already exists was completed by an
    # earlier run, so it is not sent to the (paid) API again after a crash.
    if os.path.exists(output_path):
        print(f"Skipping {file}: {output_path} already exists")
        continue

    print(f"Processing {file}...")

    # Load the data
    df = pd.read_csv(input_path)

    # Generate one explanation per row. A plain comprehension (rather than
    # DataFrame.apply with axis=1) also behaves correctly for an empty chunk.
    df['gpt_explanation'] = [
        get_gpt_explanation(buggy_code, fixed_code)
        for buggy_code, fixed_code in zip(df['buggy_code'], df['fixed_code'])
    ]

    # Write to a temporary name and rename, so an interrupted write can never
    # leave a partial file that the skip check above would treat as complete.
    tmp_path = output_path + ".tmp"
    df.to_csv(tmp_path, index=False)
    os.replace(tmp_path, output_path)
    print(f"Explanations saved to {output_path}")

print("Processing complete for all chunk files!")


Processing 1000_chunk_1.csv...
Explanations saved to ../Data/new_100k_dataset/gpt/1000_chunk_1_only_gpt.csv
Processing 1000_chunk_2.csv...
Explanations saved to ../Data/new_100k_dataset/gpt/1000_chunk_2_only_gpt.csv
Processing 1000_chunk_3.csv...
Explanations saved to ../Data/new_100k_dataset/gpt/1000_chunk_3_only_gpt.csv
Processing 1000_chunk_4.csv...
Explanations saved to ../Data/new_100k_dataset/gpt/1000_chunk_4_only_gpt.csv
Processing 1000_chunk_5.csv...
Explanations saved to ../Data/new_100k_dataset/gpt/1000_chunk_5_only_gpt.csv
Processing 1000_chunk_6.csv...
Attempt 1 failed: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Attempt 2 failed: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002477F7B8D40>: Failed to resolve 'api.openai

In [9]:
import os
import pandas as pd
import anthropic
import time

# Load API key from environment variables
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")  # None when the variable is unset
client = anthropic.Client(api_key=ANTHROPIC_API_KEY)  # Shared client used by the request helpers in this cell

# Directories
input_dir = '../Data/new_1000_record_chunks/'  # Input folder the chunk CSV files are read from
output_dir = '../Data/new_100k_dataset/anthropic/'  # Output folder the per-chunk results are written to
os.makedirs(output_dir, exist_ok=True)  # Create the output folder up front; no error if it already exists

# Static instructions and worked examples shared by all explanation requests.
# This is a plain (non-f) string, so braces in the Java examples are written
# singly; doubled braces would reach the model literally as "{{" and "}}".
STATIC_PROMPT = """You are a senior software engineer explaining a bug fix to a junior developer. 
        Your task is to provide a concise explanation of the changes made to fix the bug in the provided code snippets. Follow these guidelines:

        ### Guidelines:
        1. **Bug Identification**: Describe the error in the original code, including its type (e.g., logic error, runtime error) and its impact.
        2. **Problem Analysis**: Explain why the bug is problematic and under what conditions it causes issues.
        3. **Fix Explanation**: Describe the changes in the fixed code and how they address the issue.
        4. **Justification**: Justify why the fix is necessary and how it resolves the problem.
        5. **Improvement Highlight**: Summarize how the fix improves code reliability, functionality, or performance.

        ### Constraints:
        - The explanation must be **concise**, using no more than **100 words**.
        - The explanation should consist of **exactly three sentences**:
        1. Why the buggy code is incorrect.
        2. What changes were made in the fixed code and why they are correct.
        3. How the fixed code improves upon the buggy code.
        - Do not include meta-descriptions such as "This is a three-sentence explanation" or mention word counts in the response.
        - Ensure your explanation aligns with the code context and focuses solely on technical details relevant to the fix.

        ### Example Explanations:

        #### Example 1:
        **Buggy Code:**
        @Override protected void afterTests(){
            try {
                context.shutdown();
            }
            catch (Exception e) {
                throw new RuntimeException("String_Node_Str", e);
            }
            super.afterTests();
        }

        **Fixed Code:**
        @Override protected void afterTests(){
            try {
                context.shutdown();
            }
            catch (Exception e) {
                throw new RuntimeException("String_Node_Str", e);
            }
        }

        **Explanation:**
        The bug in the original code is the unconditional call to `super.afterTests()`, which executes even if `context.shutdown()` fails, risking inconsistent state. 
        The fixed code removes this call, ensuring `super.afterTests()` is not invoked when an exception occurs, preventing potential errors. 
        This fix ensures predictable cleanup behavior, improving code reliability.

        #### Example 2:
        **Buggy Code:**
        private void updateTreeView(Tree tree){
            Iterator it = tree.getDepthFirstIterator(false);
            while (it.hasNext()) {
                ((Tree<JsonTreeNode>) it.next()).setExpanded(true);
            }
            editorTreeView.setModel(tree.copy());
        }

        **Fixed Code:**
        private void updateTreeView(JsonTree tree){
            JsonTree fixedTree = JsonTreeConverter.serialize(JsonTreeConverter.deserialize(tree));
            Iterator it = fixedTree.getDepthFirstIterator(false);
            while (it.hasNext()) {
                ((JsonTree) it.next()).setExpanded(true);
            }
            editorTreeView.setModel(fixedTree.copy());
        }

        **Explanation:**
        The original code has a bug where it improperly casts a generic Tree to `Tree<JsonTreeNode>`, which can cause runtime errors if the types don’t match. 
        The fix uses a `JsonTree` with serialization and deserialization to ensure the tree structure is correct and safe to work with. 
        This makes the code more reliable and prevents runtime type errors."""

# Preflight request that exercises the static prompt
def initialize_conversation():
    """
    Send one request carrying STATIC_PROMPT before the batch starts.

    The Anthropic Messages API only accepts "user" and "assistant" roles inside
    ``messages``; system text must go in the top-level ``system`` parameter and
    at least one user turn is required. The API is also stateless, so this call
    does not carry the instructions over to later requests. It serves as an
    early check that the credentials, model name and prompt are accepted.

    Raises:
        Whatever ``client.messages.create`` raises (e.g. authentication or
        billing errors); the call is deliberately not retried.
    """
    print("Initializing conversation with the static prompt...")
    client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=200,  # max_tokens is required by the Messages API
        system=STATIC_PROMPT,
        messages=[{"role": "user", "content": "Reply with OK if these instructions are clear."}]
    )
    print("Conversation initialized.")


# Generate explanation for a specific buggy and fixed code pair
def get_explanation_dynamic(buggy_code, fixed_code):
    """
    Ask the Anthropic model to explain one buggy/fixed code pair.

    The Messages API is stateless, so the shared instructions and examples in
    STATIC_PROMPT are passed with every request through the top-level
    ``system`` parameter; the per-row task goes in the user message.

    Args:
        buggy_code: The buggy snippet; interpolated verbatim into the prompt.
        fixed_code: The fixed snippet; interpolated verbatim into the prompt.

    Returns:
        The text of the first content block of the reply, stripped.
    """
    dynamic_prompt = f"""
### Your Task:

**Buggy Code:**
{buggy_code}

**Fixed Code:**
{fixed_code}

**Why the fixed code is correct:**"""

    response = client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=200,
        temperature=0.5,
        system=STATIC_PROMPT,  # Instructions must accompany every call
        messages=[{"role": "user", "content": dynamic_prompt}]
    )
    return response.content[0].text.strip()

# Retry helper used around each API call
def retry_request(func, max_retries=5, delay=2):
    """
    Run ``func`` with up to ``max_retries`` attempts and exponential backoff.

    Each failure is printed; between attempts the function sleeps, starting at
    ``delay`` seconds and doubling the wait each time. If the last attempt also
    fails, ``Exception("All retries failed")`` is raised with the last error as
    its cause. Returns the value produced by ``func``.
    """
    final_index = max_retries - 1
    backoff = delay
    for attempt_index in range(max_retries):
        try:
            result = func()
        except Exception as error:
            print(f"Attempt {attempt_index + 1} failed: {error}")
            if attempt_index == final_index:
                raise Exception("All retries failed") from error
            time.sleep(backoff)
            backoff = backoff * 2  # Exponential backoff
        else:
            return result

# Process all evaluation files
def process_files(start=25, stop=100):
    """
    Generate Anthropic explanations for a range of chunk files.

    Processes ``1000_chunk_{i}.csv`` for ``i`` in ``range(start, stop)``; the
    defaults reproduce the previously hard-coded range. For each chunk, every
    row's ``buggy_code``/``fixed_code`` pair is sent through
    ``get_explanation_dynamic`` (wrapped in ``retry_request``) and the result is
    stored in an ``anthropic_explanation`` column of the output CSV.

    Chunks whose output file already exists are skipped, so an interrupted run
    can simply be restarted without repeating finished (paid) work.

    Args:
        start: First chunk number to process (inclusive).
        stop: Chunk number to stop before (exclusive).
    """
    files = [f"1000_chunk_{i}.csv" for i in range(start, stop)]

    initialize_conversation()  # One preliminary request before the batch

    for file in files:
        input_path = os.path.join(input_dir, file)
        output_file_name = file.replace(".csv", "_only_anthropic.csv")
        output_path = os.path.join(output_dir, output_file_name)

        if os.path.exists(output_path):
            print(f"Skipping {file}: {output_path} already exists")
            continue

        print(f"Processing {file}...")

        # Load the input data
        df = pd.read_csv(input_path)

        # Generate explanations for each row. The lambda is invoked inside
        # retry_request during the same iteration, so it always sees the
        # current row's values.
        df['anthropic_explanation'] = [
            retry_request(lambda: get_explanation_dynamic(buggy_code, fixed_code))
            for buggy_code, fixed_code in zip(df['buggy_code'], df['fixed_code'])
        ]

        # Write to a temporary name and rename, so an interrupted write cannot
        # leave a partial file that the skip check would treat as complete.
        tmp_path = output_path + ".tmp"
        df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, output_path)
        print(f"Explanations saved to {output_path}")

    print("Processing complete for all evaluation files!")

# Main execution
if __name__ == "__main__":
    process_files()


Initializing conversation with the static prompt...


BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}}

In [15]:
import pandas as pd
import os
import re

# Directories
# NOTE(review): this reads from '../Data/100k_dataset/', whereas the generation
# cells in this notebook write to '../Data/new_100k_dataset/' — confirm this is
# the intended folder.
input_dir = '../Data/100k_dataset/anthropic/'  # Folder with individual CSV files
output_file = '../Data/100k_dataset/combined_gpt_anthropic_explanations_without_similarity.csv'  # Final combined output


def natural_sort_key(file_name):
    """
    Return a key that sorts file names by the numeric value of their digit runs.

    The name is split on runs of digits; each run is converted to an int and
    the surrounding text is kept as strings.
    Example: '1000_chunk_10_with_gpt_and_anthropic.csv'
             -> ['', 1000, '_chunk_', 10, '_with_gpt_and_anthropic.csv']
    """
    tokens = re.split(r'(\d+)', file_name)
    return [int(token) if token.isdigit() else token for token in tokens]

# Find and sort all CSV files
files = [file for file in os.listdir(input_dir) if file.endswith('_anthropic.csv')]
files.sort(key=natural_sort_key)  # Sort files in natural order

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
frames = []

for file in files:
    input_path = os.path.join(input_dir, file)
    print(f"Reading {file}...")
    df = pd.read_csv(input_path)
    frames.append(df)

# pd.concat raises on an empty list; keep the previous behaviour of producing
# an empty frame when no matching files are found.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the combined DataFrame to a single CSV
combined_df.to_csv(output_file, index=False)
print(f"Combined CSV saved to {output_file}")

Reading 1000_chunk_1_only_gpt_4-0_mini_with_gpt_and_anthropic.csv...
Reading 1000_chunk_2_only_gpt_4-0_mini_with_gpt_and_anthropic.csv...
Reading 1000_chunk_3_only_gpt_4-0_mini_with_gpt_and_anthropic.csv...
Reading 1000_chunk_4_only_gpt_4-0_mini_with_gpt_and_anthropic.csv...
Reading 1000_chunk_5_only_gpt_4-0_mini_with_gpt_and_anthropic.csv...
Combined CSV saved to ../Data/100k_dataset/combined_gpt_anthropic_explanations_without_similarity.csv


In [16]:
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# sample_size = "100"

# Load the combined explanations. The scoring step below reads the
# 'gpt_explanation' and 'anthropic_explanation' columns from this frame.
df = pd.read_csv(f'../Data/100k_dataset/combined_gpt_anthropic_explanations_without_similarity.csv')  # Replace with the actual path to your dataframe
# Helper that scores how similar two explanation texts are

# Loaded models keyed by name, so the weights are read once per kernel session
# instead of once per call.
_SIMILARITY_MODELS = {}


def _get_similarity_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _SIMILARITY_MODELS:
        _SIMILARITY_MODELS[model_name] = SentenceTransformer(model_name)
    return _SIMILARITY_MODELS[model_name]


def calculate_similarity(text1, text2):
    """
    Calculate cosine similarity between two pieces of text using SentenceTransformer.

    Both texts are embedded with the 'all-MiniLM-L6-v2' model and the cosine
    similarity of the two embedding vectors is returned. The model is loaded
    once and reused, because this function is called for every row.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The cosine similarity score of the two embeddings (a scalar).
    """
    model = _get_similarity_model()

    # Encode the two texts in a single batch
    embeddings = model.encode([text1, text2])

    # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    return similarity_score

# Score every row by comparing its GPT explanation with its Anthropic explanation
def _row_similarity(row):
    """Return the similarity between the two explanation columns of one row."""
    return calculate_similarity(row['gpt_explanation'], row['anthropic_explanation'])


df['explanation_similarity_score'] = df.apply(_row_similarity, axis=1)

# Persist the frame, now including the similarity column
df.to_csv(
    '../Data/100k_dataset/combined_gpt_anthropic_explanations_with_similarity.csv',
    index=False,
)

  from tqdm.autonotebook import tqdm, trange
