In [None]:
!pip install --upgrade openai anthropic google-generativeai ipywidgets
import pandas as pd
import matplotlib.pyplot as plt
import time
import psutil
import os
import pickle
import tracemalloc
from collections import Counter
from difflib import SequenceMatcher
import difflib
import re
import ast
from tqdm import tqdm
from openai import OpenAI
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import openai
import anthropic
import google.generativeai as genai
import ipywidgets as widgets
from IPython.display import display, Markdown

from spider_utils_py import load_csv_database

In [None]:
# API credentials for the model endpoints.
# Fill in the keys before running the cells below; they are read back through
# os.getenv / os.environ when the clients are configured.
# NOTE: these assignments overwrite any value already present in this
# process's environment for the same variable names.
os.environ["OPENAI_API_KEY"] = ""
os.environ["ANTHROPIC_API_KEY"] = ""
os.environ["GEMINI_API_KEY"] = ""

In [None]:
def process_first_n_rows(input_data):
    """
    Build a DataFrame from the 'first_n_rows' JSON string held in ``input_data``.

    Before parsing, the raw string is normalised: escaped forward slashes are
    unescaped, every remaining backslash is doubled, and the text ``None`` is
    replaced with JSON ``null``. The ``None`` replacement is a plain text
    substitution, so it also rewrites ``None`` appearing inside string values.

    Args:
        input_data (dict): A dictionary containing the 'first_n_rows' key with a JSON string.
            A missing key is treated as an empty string.

    Returns:
        pd.DataFrame: A pandas DataFrame created from the 'first_n_rows' JSON string,
        or an empty DataFrame when the string cannot be parsed or converted.
    """
    # Imported locally because the module never imports json at the top level;
    # without this both json.loads and the except clause raise NameError.
    import json

    try:
        # Extract the 'first_n_rows' JSON string
        first_n_rows_json = input_data.get('first_n_rows', '')

        # Fix improperly escaped backslashes
        first_n_rows_json = first_n_rows_json.replace("\\/", "/")  # Fix escaped forward slashes
        first_n_rows_json = first_n_rows_json.replace("\\", "\\\\")  # Properly escape backslashes

        # Replace Python None with JSON null so json.loads accepts the text
        first_n_rows_json = first_n_rows_json.replace('None', 'null')

        # Parse the JSON string into a Python object
        first_n_rows_data = json.loads(first_n_rows_json)

        # Convert the parsed data into a pandas DataFrame
        return pd.DataFrame(first_n_rows_data)

    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error processing 'first_n_rows': {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of an error
def clean_code_markers(code_string):
    """
    Strip Markdown code-fence markers from a code string.

    Every occurrence of ```python is removed first, then every remaining ```;
    surrounding whitespace is trimmed after each removal.

    Args:
        code_string (str): The input string containing code with markers

    Returns:
        str: Cleaned code without the markers
    """
    cleaned = code_string
    # Order matters: the longer marker must go first so that the bare fence
    # removal does not leave a stray "python" behind.
    for marker in ('```python', '```'):
        cleaned = cleaned.replace(marker, '').strip()
    return cleaned

def execute_intent_code(exec_state, code, num_output_rows=10, verbose=False):
    """
    Executes the given code in the provided execution state.
    Returns the captured outputs and the (mutated) execution state. Only primitive
    values (int, float, str, bool), tuples, DataFrames and Series are captured;
    DataFrames and Series are stored as JSON strings of their first
    ``num_output_rows`` rows. Names starting with "__" are skipped.

    Any exception raised while executing or capturing is printed and reported as
    ``{"error": "<message>"}`` instead of being propagated.

    NOTE(security): ``code`` is run with ``exec``; only pass code you trust.

    Args:
        exec_state (dict): python exec namespace; it is modified in place by the code.
        code (str): source code to execute.
        num_output_rows (int): number of leading rows kept for DataFrame/Series outputs.
        verbose (bool): when True, print the state before/after and the code.

    Returns:
        tuple: (outputs dict, exec_state)

    examples:

        for executing notebook header:

            first_n_rows = pd.DataFrame(eval(eval(intents.iloc[0][INPUT_DATA_COL].replace('null', 'None'))['first_n_rows']))
            exec_state = {"pd": pd, "first_n_rows": first_n_rows}  # Initialize execution state
            try:
                outputs, exec_state = execute_intent_code(exec_state, nb_header, verbose=False)
                inputs = outputs  # Initialize inputs with the header execution outputs
            except Exception as e:
                print(f"Error executing notebook header for {nb_name}: {e}")
                continue  # Skip this notebook if the header fails

        for executing intent code (note exec_state would have been previously modified from previous intent code execution)
            # Execute original code
            try:
                print("Executing original code...")
                original_outputs, exec_state = execute_intent_code(exec_state, actual_code, verbose=False)
            except Exception as e:
                print(f"Error executing original code: {e}")
                original_outputs = {}

    """
    try:
        # Use a non-interactive backend for matplotlib to suppress plots
        plt.switch_backend('Agg')

        if verbose:
            print("IN STATE")
            print(exec_state)
            print("CODE")
            print(code)

        try:
            # Execute the code in the provided execution state
            exec(code, exec_state)
        finally:
            # Always release figures, even when the executed code raised;
            # otherwise failed runs accumulate open figures.
            plt.close('all')

        # Capture the outputs (all supported variables in the execution state)
        outputs = {}
        for key, value in exec_state.items():
            if key.startswith("__"):
                continue
            if isinstance(value, (int, float, str, bool, tuple)):
                outputs[key] = value
            elif isinstance(value, pd.DataFrame):
                # Convert DataFrame to JSON format
                outputs[key] = str(value.head(num_output_rows).to_json(orient="records"))
            elif isinstance(value, pd.Series):
                # Convert Series to JSON format
                outputs[key] = str(value.head(num_output_rows).to_json())

        if verbose:
            print("OUT STATE")
            print(exec_state)
            print("OUTPUTS")
            print(outputs)

    except Exception as e:
        print("Error in executing code: ", e)
        outputs = {"error": str(e)}

    return outputs, exec_state

def load_from_pkl(file_path):
    """
    Load pickled records from ``file_path`` and return them as a DataFrame.

    A short summary (shape and column names) is printed, followed by the
    'code' value of the first row when the frame is non-empty and has a
    'code' column.

    Args:
        file_path (str): path of the pickle file to read.

    Returns:
        pd.DataFrame: DataFrame built from the unpickled data.
    """
    # NOTE(security): pickle.load can run arbitrary code embedded in the file;
    # only load pickles from a trusted source.
    with open(file_path, 'rb') as f:
        extracted_data = pickle.load(f)

    # Convert the unpickled data (expected: list of dictionaries) to a DataFrame
    df_loaded = pd.DataFrame(extracted_data)

    # Verify the data loaded correctly
    print(f"Loaded DataFrame with shape: {df_loaded.shape}")
    print(f"Columns: {df_loaded.columns.tolist()}")

    # Show a sample of the code to check that formatting is preserved. The
    # column check keeps this purely informational print from raising
    # KeyError on data without a 'code' column.
    if len(df_loaded) > 0 and 'code' in df_loaded.columns:
        print("\nSample code from first row:")
        print("-" * 50)
        print(df_loaded.iloc[0]['code'])
        print("-" * 50)
    return df_loaded

In [None]:
# Configure the provider clients from the API keys stored in the environment.
# OpenAI: set the module-level key (None when the variable is unset).
openai.api_key = os.getenv("OPENAI_API_KEY")

# Anthropic: module-level client object, used by call_llm for the "anthropic" provider.
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Gemini: unlike the os.getenv lookups above, os.environ[...] raises KeyError
# when GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [None]:
def call_llm(provider, prompt, model=None, temperature=0.0, max_tokens=512):
    """
    Send a single-turn prompt to one of the supported LLM providers.

    Args:
        provider (str): "openai", "anthropic" or "google" (case-insensitive).
        prompt (str): text sent as the only user message.
        model (str | None): model name; when falsy a per-provider default is used
            ("gpt-4o-mini", "claude-3-5-haiku-20241022", "gemini-2.0-flash").
        temperature (float): sampling temperature forwarded to the provider.
        max_tokens (int): output token limit forwarded to the provider.

    Returns:
        str: the text of the model reply, "Unknown provider." for an
        unrecognised provider, or "Error: <message>" when anything raises
        (exceptions are never propagated to the caller).
    """
    try:
        backend = provider.lower()

        if backend == "openai":
            openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
            completion = openai_client.chat.completions.create(
                model=model or "gpt-4o-mini",
                store=False,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            return completion.choices[0].message.content

        if backend == "anthropic":
            # Uses the module-level anthropic_client.
            message = anthropic_client.messages.create(
                model=model or "claude-3-5-haiku-20241022",
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            return message.content[0].text

        if backend == "google":
            generation_settings = {
                "temperature": temperature,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": max_tokens,
                "response_mime_type": "text/plain",
            }
            gemini_model = genai.GenerativeModel(
                model_name=model or "gemini-2.0-flash",
                generation_config=generation_settings,
            )
            reply = gemini_model.generate_content(prompt)
            # Fixed two-second pause after each Gemini request
            # (presumably to stay under a rate limit -- TODO confirm).
            time.sleep(2)
            return reply.text

        return "Unknown provider."

    except Exception as e:
        return f"Error: {e}"


def chat_gpt_completion(model = "gpt-4o-mini",
             role = "developer",
            prompt = "Transform Input to get Output. Input: (A, 1) (A, 2) Output: (A, 1, 2)",
            temp=None):
    """
    Request a single chat completion from OpenAI, printing the prompt and the reply.

    Args:
        model (str): OpenAI model name.
        role (str): role assigned to the single message that carries ``prompt``.
        prompt (str): message content.
        temp (float | None): sampling temperature; when None the parameter is
            omitted from the request so the API default applies.

    Returns:
        str: content of the first choice in the completion.
    """
    # Take the key from the OPENAI_API_KEY environment variable, like the rest
    # of this file; the original placeholder literal is kept only as a fallback.
    api_key = os.getenv("OPENAI_API_KEY") or "API_KEY"
    print("Prompt:")
    print(prompt)
    client = OpenAI(api_key=api_key)

    # Build the request once; temperature is only sent when explicitly given.
    request = {
        "model": model,
        "store": False,
        "messages": [{"role": role, "content": prompt}],
    }
    if temp is not None:
        request["temperature"] = temp
    completion = client.chat.completions.create(**request)

    response = completion.choices[0].message.content
    print("Response:")
    print(response)
    print()
    return response

def deepseek_completion(
    model = "deepseek-chat",
    role = "system",
    prompt = "Transform Input to get Output. Input: (A, 1) (A, 2) Output: (A, 1, 2)",
    temp=None):
    """
    Request a single chat completion from the DeepSeek endpoint, printing the reply.

    The request goes through the OpenAI client pointed at
    https://api.deepseek.com/v1.

    Args:
        model (str): model name sent to the endpoint.
        role (str): role assigned to the single message that carries ``prompt``.
        prompt (str): message content.
        temp (float | None): sampling temperature; when None the parameter is
            omitted from the request.

    Returns:
        str: content of the first choice in the completion.
    """
    # Prefer a key from the environment; the original placeholder literal is
    # kept as the fallback so existing setups behave as before.
    api_key = os.getenv("DEEPSEEK_API_KEY") or 'api_key'
    deepseek_client = OpenAI(
        api_key=api_key,
        base_url="https://api.deepseek.com/v1"
    )

    # Build the request once. Previously the branch taken when ``temp`` was
    # set hard-coded model="deepseek-chat" and ignored the ``model`` argument.
    request = {
        "model": model,
        "store": False,
        "messages": [{"role": role, "content": prompt}],
    }
    if temp is not None:
        request["temperature"] = temp
    completion = deepseek_client.chat.completions.create(**request)

    response = completion.choices[0].message.content
    print("Response:")
    print(response)
    print()
    return response

def code_llama_completion():
    """Placeholder with no implementation yet: takes no arguments and returns None."""
    return None

def github_copilot_completion():
    """Placeholder with no implementation yet: takes no arguments and returns None."""
    return None

def code_llama_tuned_completion():
    """Placeholder with no implementation yet: takes no arguments and returns None."""
    return None


def code_llama_tuned_with_cot_iterative_completion():
    """Placeholder with no implementation yet: takes no arguments and returns None."""
    return None

In [None]:
# Maps a model label to the completion function that queries it.
# run_models uses this as its default set of models and writes the label
# into the MODEL_COL column.
BASELINE_MODELS = {'chat_gpt': chat_gpt_completion, 
                   'deepseek_chat': deepseek_completion}

# Regular expressions matching the start of pandas-style method calls and
# indexers (e.g. ".merge(", ".loc["). extract_transformations_from_code
# searches code for these to list the transformations it performs.
TRANSFORMATION_PATTERNS = [
        r'\.merge\(', r'\.join\(', r'\.filter\(', r'\.groupby\(',
        r'\.concat\(', r'\.drop\(', r'\.loc\[',
        r'\.iloc\[', r'\.assign\(', r'\.rename\(',
        r'\.pivot\(', r'\.melt\(', r'\.sort_values\(',
        r'\.reset_index\(', r'\.set_index\(', r'\.query\(',
        r'\.drop_duplicates\(', r'\.fillna\(', r'\.replace\('
    ]


# Column names of the test/evaluation DataFrame.
MODEL_COL = 'model'  # label of the model that produced the row (set by run_models)
ORIGINAL_NOTEBOOK_COL = 'nb_source'  # not referenced by the functions in this section
ACTUAL_CODE_COL = 'code'  # reference code the generated code is compared against
INPUT_DATA_COL = 'inputs'  # input data for the row
OUTPUT_DATA_COL = 'outputs'  # expected output; evaluate_completions parses it with ast.literal_eval
TRANSFORMATION_DESCRIPTION_COL = 'intent'  # description passed to the LLM by run_models
GENERATED_CODE_COL = 'gen_code'  # LLM-generated code (written by run_models)
GENERATED_INPUT_DATA_COL = 'gen_inputs'  # not referenced by the functions in this section
GENERATED_OUTPUT_DATA_COL = 'gen_outputs'  # not referenced by the functions in this section

In [None]:
# Thin wrapper that asks an LLM for code (to be replaced with actual model call)
def invoke_llm(model_completion_fn, description):
    """
    Ask a model for Python code that implements ``description``.

    The description is extended with fixed instructions (output only Python
    code; store the first 10 rows of the result in a variable named 'output')
    and the combined text is passed to ``model_completion_fn`` as the
    ``prompt`` keyword argument. Whatever the callable returns is returned.
    """
    full_prompt = f"{description} \nOutput only Python code. Write the first 10 rows of the final resulting dataframe to a variable named 'output' using to_dict(orient='records')."
    return model_completion_fn(prompt=full_prompt)


def filter_comments(code):
    """Drop every line whose first non-blank character is '#'.

    Only whole-line comments are removed; a trailing comment on a code line
    is left in place.
    """
    kept_lines = []
    for source_line in code.split('\n'):
        if source_line.strip().startswith('#'):
            continue
        kept_lines.append(source_line)
    return '\n'.join(kept_lines)

def compare_code(generated_code, actual_code):
    '''Print a coloured unified diff from the generated code to the actual code.

    Whole-line comments are removed from both inputs first (via
    filter_comments). File headers are printed in blue, removed lines in red,
    added lines in green, everything else unchanged. Nothing is returned.
    '''
    generated_lines = filter_comments(generated_code).split('\n')
    actual_lines = filter_comments(actual_code).split('\n')

    diff_lines = difflib.unified_diff(
        generated_lines,
        actual_lines,
        fromfile='Generated Code',
        tofile='Actual Code',
        lineterm='',
    )

    for line in diff_lines:
        # Headers must be tested before the single-character prefixes,
        # since '---' / '+++' also start with '-' / '+'.
        if line.startswith('---') or line.startswith('+++'):
            rendered = f'\033[1;34m{line}\033[0m'  # Blue header
        elif line.startswith('-'):
            rendered = f'\033[1;31m{line}\033[0m'  # Red for deletions
        elif line.startswith('+'):
            rendered = f'\033[1;32m{line}\033[0m'  # Green for additions
        else:
            rendered = line  # Normal text
        print(rendered)

def clean_up_llm_gen_code(generated_code):
    '''Strip Markdown code fences from LLM-generated code.

    Args:
        generated_code (pd.Series): string Series with one generated snippet per row.

    Returns:
        pd.Series: the snippets with a leading ```python fence and a trailing
        ``` fence removed and surrounding whitespace trimmed.
    '''
    # Trim first: replies often end with a newline after the closing fence (or
    # start with whitespace), which would stop the prefix/suffix from matching.
    generated_code = generated_code.str.strip()
    generated_code = generated_code.str.removeprefix("```python").str.lstrip()
    generated_code = generated_code.str.removesuffix("```").str.rstrip()
    return generated_code

# Simple trigram-overlap score between two texts (BLEU-3-like, not true BLEU)
def calculate_bleu_3(reference, candidate):
    """
    Return the fraction of the reference's distinct word trigrams that also
    occur in the candidate.

    Both texts are split on whitespace. The result is 0.0 when either text
    has fewer than three tokens (no trigrams).
    """
    def trigram_set(text):
        words = text.split()
        return {" ".join(words[start:start + 3]) for start in range(len(words) - 2)}

    reference_trigrams = trigram_set(reference)
    candidate_trigrams = trigram_set(candidate)

    if not (reference_trigrams and candidate_trigrams):
        return 0.0

    shared = reference_trigrams & candidate_trigrams
    return len(shared) / len(reference_trigrams)

def calculate_bleu_3_trans(trans1, trans2):
    """
    BLEU score (up to trigrams) between two transformation sequences.

    ``trans1`` is used as the single reference and ``trans2`` as the
    candidate. Returns 0.0 when either sequence is empty, which avoids
    calling the BLEU routine on empty input.
    """
    if not (trans1 and trans2):
        return 0.0

    # method1 smoothing keeps the score from collapsing to zero when some
    # n-gram order has no matches.
    smoothing = SmoothingFunction().method1
    references = [trans1]  # sentence_bleu expects a list of reference sequences

    return sentence_bleu(
        references,
        trans2,
        weights=(0.33, 0.33, 0.33),  # weight on 1-, 2- and 3-grams
        smoothing_function=smoothing,
    )


# Score how closely a generated DataFrame matches the expected one
def compare_dataframes(df_actual, df_gen):
    """
    Compare a generated DataFrame with the expected one.

    Args:
        df_actual (pd.DataFrame | None): expected data.
        df_gen (pd.DataFrame | None): generated data.

    Returns:
        tuple: ``(structure_score, content_score)``.
            structure_score is the number of shared column names divided by
            the larger column count. content_score is, over the shared
            columns, the number of positions where both values are non-null
            and equal, divided by the sum over those columns of the larger
            non-null count. Both are 0.0 when either frame is None or when
            neither frame has any column.
    """
    if (df_gen is None) or (df_actual is None):
        return (0.0, 0.0)

    # Structural similarity: share of matching column names.
    widest = max(len(df_gen.columns), len(df_actual.columns))
    if widest == 0:
        # Two column-less frames: nothing to compare (and avoids dividing by zero).
        return (0.0, 0.0)
    common_columns = set(df_gen.columns).intersection(set(df_actual.columns))
    structure_score = len(common_columns) / widest

    # Align rows by position. The index is reset first so that frames with a
    # non-default index are not turned into all-NaN rows by the reindex below.
    max_rows = max(len(df_gen), len(df_actual))
    df_gen = df_gen.reset_index(drop=True).reindex(range(max_rows))
    df_actual = df_actual.reset_index(drop=True).reindex(range(max_rows))

    # Content correctness: count exact matches; missing values never count as a match.
    exact_matches = 0
    total_values = 0

    for col in common_columns:
        matches = (df_gen[col] == df_actual[col]) & df_gen[col].notna() & df_actual[col].notna()
        exact_matches += matches.sum()
        total_values += max(len(df_gen[col].dropna()), len(df_actual[col].dropna()))

    content_score = exact_matches / total_values if total_values > 0 else 0.0

    return (structure_score, content_score)

# Run a callable while recording its execution time and memory usage
def measure_performance(func, *args, **kwargs):
    """
    Measures execution time and memory usage of a function.

    Args:
        func (callable): function to run.
        *args, **kwargs: forwarded unchanged to ``func``.

    Returns:
        tuple: ``(result, elapsed_time, peak_memory, avg_memory)`` where
            result is the return value of ``func``, elapsed_time is wall-clock
            seconds, peak_memory is the peak traced by tracemalloc (bytes) and
            avg_memory is the mean of the process RSS before and after the call.

    Raises:
        Any exception raised by ``func`` is propagated; memory tracing is
        stopped before it leaves this function.
    """
    tracemalloc.start()
    try:
        start_time = time.time()
        process = psutil.Process(os.getpid())
        start_memory = process.memory_info().rss

        result = func(*args, **kwargs)

        end_time = time.time()
        peak_memory = tracemalloc.get_traced_memory()[1]  # Peak memory usage
        end_memory = process.memory_info().rss
    finally:
        # Stop tracing even when func raises; otherwise tracemalloc stays
        # enabled and keeps adding overhead to everything that runs afterwards.
        tracemalloc.stop()

    elapsed_time = end_time - start_time
    avg_memory = (start_memory + end_memory) / 2

    return result, elapsed_time, peak_memory, avg_memory

def extract_transformations_from_code(code):
    """
    Extracts pandas transformation operations from a Python script string.

    Every match of a pattern in TRANSFORMATION_PATTERNS is collected and the
    matched texts (e.g. ".merge(") are returned in the order they appear in
    ``code``. Source order matters because callers compare the result as a
    sequence (sequence similarity and BLEU); previously matches were grouped
    by pattern, which discarded the order of operations in the script.

    Args:
        code (str): Python source text to scan.

    Returns:
        list[str]: matched transformation texts in order of appearance.
    """
    located = []
    for pattern_rank, pattern in enumerate(TRANSFORMATION_PATTERNS):
        for match in re.finditer(pattern, code):
            # pattern_rank only breaks ties between matches at the same offset.
            located.append((match.start(), pattern_rank, match.group()))

    located.sort()
    return [matched_text for _, _, matched_text in located]

def count_unique_matches(trans1, trans2):
    """
    Count how many transformations the two lists have in common.

    Each occurrence can be paired at most once: a transformation present in
    both lists contributes the smaller of its two occurrence counts.
    """
    left_counts = Counter(trans1)
    right_counts = Counter(trans2)

    return sum(
        min(occurrences, right_counts[name])
        for name, occurrences in left_counts.items()
        if name in right_counts
    )

def compare_python_scripts(code1, code2):
    """
    Compare two Python scripts by the data transformations they contain.

    Transformations are extracted from each script, then compared by count of
    shared occurrences, by SequenceMatcher ratio and by BLEU score.

    Returns:
        dict: the shared-occurrence count, the number of transformations found
        in each script, the sequence similarity (between 0 and 1), the two
        extracted lists and the BLEU score of the lists.
    """
    first_ops = extract_transformations_from_code(code1)
    second_ops = extract_transformations_from_code(code2)

    # Occurrences that can be paired one-to-one across both scripts
    shared_count = count_unique_matches(first_ops, second_ops)

    # Similarity of the two transformation sequences
    order_ratio = SequenceMatcher(None, first_ops, second_ops).ratio()

    # BLEU with the first script's transformations as the reference
    bleu = calculate_bleu_3_trans(first_ops, second_ops)

    return {
        "unique_common_transformation_count": shared_count,
        "total_trans_code1": len(first_ops),
        "total_trans_code2": len(second_ops),
        "sequence_similarity": order_ratio,
        "trans_code1": first_ops,
        "trans_code2": second_ops,
        "trans_bleu3": bleu,
    }

# Main evaluation function
def evaluate_completions(test_df):
    """
    Evaluates the LLM-generated code against expected outputs.

    For each row the generated code is executed, its ``output`` variable is
    compared with the expected output, and code-level similarity metrics are
    computed against the reference code.

    NOTE(security): the generated code is run with ``exec`` in this process;
    only evaluate code from sources you are prepared to execute.

    Args:
        test_df (pd.DataFrame): rows providing the MODEL_COL, ACTUAL_CODE_COL,
            OUTPUT_DATA_COL and GENERATED_CODE_COL columns.

    Returns:
        pd.DataFrame: one row of metrics per input row.

    Raises:
        ValueError / SyntaxError: when an expected-output string cannot be
        parsed by ``ast.literal_eval``.
    """
    results = []
    for _, row in test_df.iterrows():
        model = row[MODEL_COL]
        actual_code = filter_comments(row[ACTUAL_CODE_COL])
        output_data = row[OUTPUT_DATA_COL]
        generated_code = filter_comments(row[GENERATED_CODE_COL])

        # Compute BLEU-3 score for overall code correctness
        bleu_score = calculate_bleu_3(actual_code, generated_code)

        # Execute generated code and capture output
        exec_globals = {}
        # Reset per row: if execution fails below, the name would otherwise be
        # unbound (first row) or still hold the previous row's output.
        generated_output = None
        try:
            print("Executing: ", generated_code)
            _, elapsed_time, peak_memory, avg_memory = measure_performance(
                exec, generated_code, exec_globals)
            generated_output = exec_globals.get("output", None) # Assuming "output" variable holds the result
            print("Generated output: ", generated_output)
            generated_output_df = pd.DataFrame(generated_output)
        except Exception as e:
            print(e)
            generated_output_df = None
            elapsed_time, peak_memory, avg_memory = None, None, None

        # TODO: bind the original code to input and execute

        # Check output correctness
        print("Actual output:")
        print(output_data)
        # Convert string to list of dictionaries safely
        output_data_list = ast.literal_eval(output_data)
        output_df = pd.DataFrame(output_data_list)
        output_structure_score, output_data_score = compare_dataframes(output_df, generated_output_df)

        # Compare transformations (on the unfiltered code, generated first)
        trans_metrics = compare_python_scripts(row[GENERATED_CODE_COL],
                               row[ACTUAL_CODE_COL])

        results.append({
            "model": model,
            "actual_code": actual_code,
            "generated_code": generated_code,
            "output_data": output_data,
            "generated_output": generated_output,
            "bleu_3_exact_code": bleu_score,
            "count_common_trans": trans_metrics["unique_common_transformation_count"],
            "count_trans_generated": trans_metrics["total_trans_code1"],
            "count_trans_actual": trans_metrics["total_trans_code2"],
            "bleu_3_trans": trans_metrics["trans_bleu3"],
            "trans_sequence_similarity": trans_metrics["sequence_similarity"],
            "trans_generated": trans_metrics["trans_code1"],
            "trans_actual": trans_metrics["trans_code2"],
            "output_structure_score": output_structure_score,
            'output_data_score': output_data_score,
            "elapsed_time": elapsed_time,
            "peak_memory": peak_memory,
            "avg_memory": avg_memory
        })

    return pd.DataFrame(results)


def run_models(test_df, models=BASELINE_MODELS):
    '''Generates the transformations for a test dataset for each given model.

    Args:
        test_df (pd.DataFrame): rows with a TRANSFORMATION_DESCRIPTION_COL column.
        models (dict): maps a model label to its completion function.

    Returns:
        pd.DataFrame: one copy of ``test_df`` per model, stacked with a fresh
        index, with the generated code in GENERATED_CODE_COL and the model
        label in MODEL_COL. An empty DataFrame when ``models`` is empty.
    '''
    # progress_apply only exists on DataFrames after tqdm registers it;
    # registering again is harmless.
    tqdm.pandas()

    per_model_frames = []

    # Generate predictions using each model endpoint
    for m, ep in tqdm(models.items()):
        # Work on a copy so the caller's DataFrame is left untouched
        temp_df = test_df.copy()
        temp_df[GENERATED_CODE_COL] = temp_df.progress_apply(
            lambda x: invoke_llm(ep, x[TRANSFORMATION_DESCRIPTION_COL]), axis=1)

        # clean up the gen code
        temp_df.loc[:, GENERATED_CODE_COL] = clean_up_llm_gen_code(temp_df[GENERATED_CODE_COL])

        # record model used for these predictions
        temp_df[MODEL_COL] = m

        per_model_frames.append(temp_df)

    if not per_model_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a DataFrame per iteration
    return pd.concat(per_model_frames, axis=0).reset_index(drop=True)

In [None]:
# # Temporary utility functions
# # can be removed after we fully integrate the pipeline
# import json
# import ast

# def extract_markdown_cells(notebook_json):
#     """
#     Extracts and combines all markdown cells from a Jupyter Notebook JSON into a single string.
    
#     Parameters:
#         notebook_json (dict or str): The JSON content of the notebook as a dictionary or string.
    
#     Returns:
#         str: A single string containing all markdown cells combined.
#     """
#     # Load JSON if it's a string
#     if isinstance(notebook_json, str):
#         notebook_json = json.loads(notebook_json)
    
#     # Extract markdown cells
#     markdown_cells = [
#         "\n".join(cell["source"]) for cell in notebook_json.get("cells", []) if cell.get("cell_type") == "markdown"
#     ]
    
#     # Combine all markdown content into a single string
#     return "\n\n".join(markdown_cells)

# def process_notebook_source(nb_source: str) -> str:
#     """Extract and merge code cells from nb_source"""
#     try:
#         parsed = json.loads(nb_source)
#     except (json.JSONDecodeError, TypeError):
#         try:
#             parsed = ast.literal_eval(nb_source)
#         except (ValueError, SyntaxError) as e:
#             print(f"Error parsing nb_source: {e}")
#             return []

#     if isinstance(parsed, dict) and 'cells' in parsed:
#         cells = parsed['cells']
#     elif isinstance(parsed, list):
#         cells = parsed
#     else:
#         print(f"nb_source is neither a list nor a dict with 'cells': {parsed}")
#         return []

#     if not isinstance(cells, list):
#         print(f"cells is not a list: {cells}")
#         return []

#     # Merge all code cells into one string, preserving original formatting
#     merged_code = []
#     for cell in cells:
#         if isinstance(cell, dict) and cell.get('cell_type') == 'code':
#             code = cell.get('source', '')
#             if isinstance(code, list):
#                 code = '\n'.join(code)  # Join lines from source, preserving indentation
#             elif not isinstance(code, str):
#                 code = str(code)
#             merged_code.append(code)
    
#     full_code = '\n'.join(merged_code)
#     return full_code