In [None]:
import numpy as np
import pandas as pd 
import sys

#from model_eval import *  #this is causing library conflicts with arcade

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# NOTE(review): both credentials are assigned empty strings here — presumably
# placeholders to be filled in before running (TODO confirm). These assignments
# overwrite any KAGGLE_USERNAME / KAGGLE_KEY values already present in the
# environment. Avoid saving real credentials in the notebook.
os.environ['KAGGLE_USERNAME'] = ''
os.environ['KAGGLE_KEY'] = ''

from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate with Kaggle API
api = KaggleApi()
api.authenticate()

# Install ARCADE

In [None]:
!git clone https://github.com/google-research/arcade-nl2code.git

In [None]:
# create a package out of arcade so it can be installed
setup_content = """
from setuptools import setup, find_packages

setup(
    name='arcade_nl2code',
    version='0.1',
    packages=find_packages(),
    install_requires=[
        'tensorflow',  # Add other dependencies here
    ],
)
"""

with open('/kaggle/working/arcade-nl2code/setup.py', 'w') as file:
    file.write(setup_content)

def create_init_files(directory):
    """Add an ``__init__.py`` to every non-hidden sub-directory of ``directory``.

    The tree is walked recursively. A sub-directory that already has an
    ``__init__.py`` is left untouched; otherwise a one-line placeholder file is
    written and its path is printed. ``directory`` itself does not receive an
    ``__init__.py`` (only its descendants do).

    Hidden directories (names starting with ``.``, e.g. ``.git``) are skipped
    together with everything below them, so version-control metadata is not
    polluted with stray Python files.

    Parameters:
    - directory (str): Root of the tree to process. A non-existent path is a no-op.
    """
    for root, dirs, files in os.walk(directory):
        # Prune in place: os.walk only descends into the names left in `dirs`.
        dirs[:] = [name for name in dirs if not name.startswith('.')]
        for dir_name in dirs:
            init_file_path = os.path.join(root, dir_name, '__init__.py')
            if not os.path.exists(init_file_path):
                with open(init_file_path, 'w') as f:
                    f.write("# This file makes the directory a Python package\n")
                print(f"Created: {init_file_path}")

# Root of the cloned repository (single leading slash, matching the path used
# when writing setup.py); every sub-directory below it gets an __init__.py.
directory = '/kaggle/working/arcade-nl2code'
create_init_files(directory)

In [None]:
!pip install /kaggle/working/arcade-nl2code

In [None]:
# add the package to python path
sys.path.append('/kaggle/working/arcade-nl2code')
sys.path.append('/kaggle/working/arcade-nl2code/arcade_nl2code')
sys.path.append('/kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset')
print(sys.path)

In [None]:
# create a requirements file for possible versions 
reqs_2022 = """
tensorflow-cpu==2.10.0
absl-py==1.3.0
pandas==1.5.2
dacite==1.7.0
nbformat==5.7.0
dill==0.3.6
sacrebleu==2.3.1
astor==0.8.1
folium==0.12.1
seaborn==0.12.2
vega==3.5.0
bokeh==2.4.3
plotly==5.10.0
matplotlib==3.6.2
chart_studio==1.1.0
"""

with open('/kaggle/working/arcade-nl2code/requirements_2022.txt', 'w') as file:
    file.write(reqs_2022)

In [None]:
#!pip install -r /kaggle/working/arcade-nl2code/arcade_nl2code/evaluation/requirements.txt
!pip install -r /kaggle/working/arcade-nl2code/requirements_2022.txt
!pip install seqio
!pip install diff_match_patch  # was missing in requirements

In [None]:
%%bash

pip show tensorflow
pip show tensorflow-text

# Download ARCADE

In [None]:
!kaggle datasets download -d googleai/arcade-nl2code-dataset -p arcade_nl2code/annotated_dataset/dataset/

In [None]:
%cd /kaggle/working/arcade_nl2code/annotated_dataset/dataset
!unzip -o arcade-nl2code-dataset.zip

# Build Dataset

## Existing Tasks

In [None]:
%%bash

cd /kaggle/working/arcade_nl2code/annotated_dataset
# The assignment has to prefix the command on the same line: written on a line
# of its own it only sets a shell variable and is not guaranteed to reach the
# environment of the python process.
PYTHONPATH=../../ python /kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset/build_existing_tasks_split.py

## Create a zip of source datasets

In [None]:
import shutil
import os

def create_zip_of_folder(folder_path, output_zip_path):
    """
    Pack the contents of a folder into a zip archive.

    Parameters:
    - folder_path (str): Folder whose contents are archived.
    - output_zip_path (str): Destination path of the archive, given without
      the ".zip" suffix (the suffix is appended by the archiver).

    Returns:
    - None. A message is printed both on success and when the folder is missing;
      in the latter case no archive is written.
    """
    folder_is_present = os.path.exists(folder_path)

    if folder_is_present:
        # Archive everything under folder_path into <output_zip_path>.zip
        shutil.make_archive(output_zip_path, 'zip', folder_path)
        print(f"Zip file created at: {output_zip_path}.zip")
    else:
        print(f"Error: Folder {folder_path} does not exist.")

# Example usage
folder_to_zip = "/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/artifacts"
output_zip = "/kaggle/working/arcade_existing_datasets"

create_zip_of_folder(folder_to_zip, output_zip)

## Transform Data Set

In [None]:
#pip install --force-reinstall pandas==1.3.0

In [None]:
pip install --upgrade --force-reinstall pandas

In [None]:
import pandas as pd
import json
import os
from glob import glob
import chardet
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import pprint
import re
import gc

from model_eval import execute_intent_code


def cleanup_exec(outputs, exec_state=None):
    '''Release memory held by a finished notebook execution.

    Parameters:
    - outputs (dict): Mapping of variable name -> captured value from the last
      execution. Only its keys are used, and only when `exec_state` is given.
    - exec_state (dict | None): Optional execution namespace. When provided,
      every variable named in `outputs` is removed from it.

    Returns:
    - None

    This module's own globals are deliberately left untouched: a variable
    produced by executed code can share a name with a notebook-level variable
    (e.g. `df`), and deleting by name from globals() would destroy the
    notebook's variable.
    NOTE(review): assumes execute_intent_code keeps executed variables in the
    namespace passed to it rather than in this module's globals — confirm
    against model_eval.
    '''
    if exec_state is not None:
        # Copy the names first so the loop is safe even if `outputs` and
        # `exec_state` happen to be the same dict.
        for var_name in list(outputs or ()):
            exec_state.pop(var_name, None)

    # Force garbage collection so reference cycles left behind are reclaimed
    gc.collect()

def calc_total_input_sizes(csv_files):
    """
    Calculate the total size, total rows, and total columns for a list of CSV files.

    For each file the first line is treated as the header: its fields give the
    column count and every remaining line counts as one row. A file that cannot
    be processed is reported with a printed message and contributes nothing to
    any of the totals.

    Parameters:
    - csv_files (list): List of paths to CSV files.

    Returns:
    - tuple: (total_size_bytes, total_rows, total_columns)
    """
    import csv  # local import: only needed for header parsing in this helper

    total_size_bytes = 0
    total_rows = 0
    total_columns = 0

    for csv_file in csv_files:
        try:
            file_size = os.path.getsize(csv_file)

            # Single pass over the file. errors='replace' keeps files that are
            # not valid UTF-8 countable instead of failing on decode.
            with open(csv_file, 'r', encoding='utf-8', errors='replace') as f:
                header_line = f.readline()
                # Lines after the header; an empty file yields 0 rather than -1.
                row_count = sum(1 for _ in f)

            # Parse the header with the csv module so commas inside quoted
            # field names are not counted as separators. A blank header gives 0.
            header_fields = next(csv.reader([header_line]), [])
            column_count = len(header_fields)

        except (OSError, csv.Error) as e:
            print(f"Error processing file {csv_file}: {e}")
            continue

        # Update the totals only after the whole file was processed, so a
        # failing file never contributes partial figures.
        total_size_bytes += file_size
        total_rows += row_count
        total_columns += column_count

    return total_size_bytes, total_rows, total_columns

def transform_existing_dataset(datasets_json,
                               artifact_path='/kaggle/input/arcade-existing',
                               n_rows=10,
                               top_n_entries=None,
                               specific_nb=None):
    """
    Execute the notebooks listed in an ARCADE dataset JSON turn by turn and
    collect one row per intent.

    For every entry, the ``input`` of its first turn (imports and dataset
    loading) is executed as a header from inside the entry's dataset folder;
    afterwards the code of each turn is executed in the same namespace through
    ``execute_intent_code``. Once any step of a notebook reports an error, all
    of its following intents are flagged with ``execute_error=True``.

    Parameters:
    - datasets_json (str): Path to the dataset JSON file (a list of notebook entries).
    - artifact_path (str): Root folder; each entry's ``work_dir`` is joined onto it
      to locate the notebook's CSV files.
    - n_rows (int): Currently unused; kept so existing callers keep working.
    - top_n_entries (int | None): If given, only the first ``top_n_entries``
      entries are processed.
    - specific_nb (str | None): If given, only entries whose ``notebook_name``
      equals this value are processed. The filter is applied before
      ``top_n_entries`` so the limit cannot drop the requested notebook.

    Returns:
    - pandas.DataFrame: one row per intent with the columns nb_name, work_dir,
      nb_setup_code, intent_number, intent, code, inputs, outputs,
      execute_error, error_msg, num_intput_files, total_input_size,
      total_input_rows and total_input_cols.
    """
    # Load the JSON file
    with open(datasets_json, 'r') as f:
        data = json.load(f)

    # Restrict to a single notebook if requested
    if specific_nb is not None:
        data = [entry for entry in data if entry.get("notebook_name") == specific_nb]

    # Limit to the top `top_n_entries` if specified
    if top_n_entries is not None:
        data = data[:top_n_entries]

    # The header code injected below changes the process working directory;
    # remember the current one so it can be restored when we are done.
    original_cwd = os.getcwd()

    # Extract intent, code pairs, and execute each intent
    rows = []
    try:
        for entry in tqdm(data):
            nb_name = entry.get("notebook_name")
            work_dir = entry.get("work_dir")
            print("Running: ", nb_name)

            turns = entry.get("turns", [])
            if not turns:
                # Without a first turn there is no header to execute.
                print(f"Skipping {nb_name}: entry has no turns")
                continue

            # Construct the dataset folder path
            dataset_folder_path = os.path.join(artifact_path, work_dir)

            # Find all CSV files in the folder
            csv_files = glob(os.path.join(dataset_folder_path, "*.csv"))
            count_csvs = len(csv_files)
            total_size_bytes, total_rows, total_cols = calc_total_input_sizes(csv_files)

            # Print the total size and row count
            print(f"Number input files: {count_csvs}")
            print(f"Total size: {total_size_bytes} bytes")
            print(f"Total rows: {total_rows}")
            print(f"Total cols: {total_cols}")
            print("Input files:", csv_files)

            # First turn input are the imports and dataset load, so execute it first
            nb_header = turns[0]["input"]

            # Prepend code to change the working directory. The path is embedded
            # with !r so quotes or backslashes in it cannot break the generated code.
            change_dir_code = f"import os\nos.chdir({dataset_folder_path!r})\n"
            nb_header = change_dir_code + nb_header

            print("Executing notebook header")
            exec_state = {"pd": pd}  # Initialize execution state with Pandas
            outputs, exec_state = execute_intent_code(exec_state, nb_header)

            # The header outputs are the inputs of the first intent
            inputs = outputs

            # Sticky flag: stays True for every later intent once any step,
            # including the header, reported an error
            execute_error = "error" in outputs

            for i, turn in enumerate(turns):
                print("Executing intent:", i)
                intent = turn["turn"]["intent"]["value"]
                code = turn["turn"]["code"]["value"]

                # Execute the code intent
                outputs, exec_state = execute_intent_code(exec_state, code)

                # Check if this intent had an error
                if "error" in outputs:
                    execute_error = True

                # Append the results
                rows.append({
                    "nb_name": nb_name,
                    "work_dir": work_dir,
                    'nb_setup_code': nb_header,
                    "intent_number": i,
                    "intent": intent,
                    "code": code,
                    "inputs": str(inputs),  # Inputs for this intent
                    "outputs": str(outputs),  # Outputs from this intent
                    "execute_error": execute_error,
                    'error_msg': outputs.get('error', ''),
                    'num_intput_files': count_csvs,
                    'total_input_size': total_size_bytes,
                    'total_input_rows': total_rows,
                    'total_input_cols': total_cols
                })

                # Update inputs for the next intent
                inputs = outputs

            # Clean up memory for all notebook outputs
            cleanup_exec(outputs)
    finally:
        # Undo the os.chdir performed by the executed header code
        os.chdir(original_cwd)

    # Create a DataFrame
    df = pd.DataFrame(rows)

    return df


In [None]:
# import pandas as pd
# import json
# import os
# from glob import glob
# import chardet
# from tqdm import tqdm
# import matplotlib.pyplot as plt
# import pickle
# import pprint
# import re
# import gc

# def replace_csv_reads_with_dataframe(code, dataset_file_name, dataframe_name="first_n_rows"):
#     """
#     Detects and replaces instances of `pd.read_csv` in the provided code with a predefined DataFrame.

#     Args:
#         code (str): The code string to process.
#         dataset_file_path (str): The path to the dataset file being replaced.
#         dataframe_name (str): The name of the DataFrame to replace `pd.read_csv` calls with.

#     Returns:
#         str: The modified code with `pd.read_csv` calls replaced.
#     """
#     # Step 1: Replace direct `pd.read_csv` calls with the file name
#     # Match patterns like pd.read_csv('athlete_events.csv') or pd.read_csv("athlete_events.csv")
#     direct_read_csv_pattern = rf"pd\.read_csv\(['\"]{re.escape(dataset_file_name)}['\"]\)"
#     code = re.sub(direct_read_csv_pattern, dataframe_name, code)

#     # Step 2: Replace `pd.read_csv` calls that use a variable
#     # Match patterns like pd.read_csv(PATH)
#     read_csv_variable_pattern = r"pd\.read_csv\(\s*\w+\s*\)"
#     code = re.sub(read_csv_variable_pattern, dataframe_name, code)
    
#     return code

# def execute_intent_code(exec_state, code, verbose=False):
#     """
#     Executes the given code in the provided execution state.
#     Returns the updated execution state and any outputs, capturing only primitive types, tuples, 
#     and DataFrames (DataFrames are stored in JSON format).

#     exec_state: python exec namespace

#     examples:

#         for executing notebook header:
        
#             first_n_rows = pd.DataFrame(eval(eval(intents.iloc[0][INPUT_DATA_COL].replace('null', 'None'))['first_n_rows']))
#             exec_state = {"pd": pd, "first_n_rows": first_n_rows}  # Initialize execution state
#             try:
#                 outputs, exec_state = execute_intent_code(exec_state, nb_header, verbose=False)
#                 inputs = outputs  # Initialize inputs with the header execution outputs
#             except Exception as e:
#                 print(f"Error executing notebook header for {nb_name}: {e}")
#                 continue  # Skip this notebook if the header fails

#         for executing intent code (note exec_state would have been previously modified from previous intent code execution)
#             # Execute original code
#             try:
#                 print("Executing original code...")
#                 original_outputs, exec_state = execute_intent_code(exec_state, actual_code, verbose=False)
#             except Exception as e:
#                 print(f"Error executing original code: {e}")
#                 original_outputs = {}
        
#     """
#     error_msg = None
#     try:
#         # Use a non-interactive backend for matplotlib to suppress plots
#         plt.switch_backend('Agg')

#         if verbose:
#             print("IN STATE")
#             print(exec_state)
#             print("CODE")
#             print(code)

#         # Execute the code in the provided execution state
#         exec(code, exec_state)

#         # Clear any matplotlib figures created during execution
#         plt.close('all')
        
#         # Capture the outputs (all variables in the execution state)
#         outputs = {}
#         for key, value in exec_state.items():
#             if not key.startswith("__"):
#                 if isinstance(value, (int, float, str, bool, tuple)):
#                     outputs[key] = value
#                 elif isinstance(value, pd.DataFrame):
#                     # Convert DataFrame to JSON format
#                     outputs[key] = str(value.to_json(orient="records"))

#         if verbose:
#             print("OUT STATE")
#             print(exec_state)
#             print("OUTPUTS")
#             print(outputs)

#     except Exception as e:
#         print("Error in executing code: ", e)
#         outputs = {"error": str(e)}
#         error_msg = str(e)
    
#     return outputs, exec_state, error_msg

# def cleanup_exec(outputs):
#     '''Cleans up memory after a notebook execution'''

#     # Deallocate the variables
#     for var_name in outputs.keys():
#         if var_name in globals():
#             del globals()[var_name]
#         elif var_name in locals():
#             del locals()[var_name]
    
#     # Force garbage collection
#     gc.collect()

# def transform_dataset(datasets_json, n_rows=10, top_n_entries=None, specific_nb=None):
#     """
#     Transforms the ARCADE dataset to the desired format by reading the initial input
#     and executing each intent one by one, processing only the top `top_n_entries` entries.
#     """
#     # Load the JSON file
#     with open(datasets_json, 'r') as f:
#         data = json.load(f)
    
#     # Limit to the top `top_n_entries` if specified
#     if top_n_entries is not None:
#         data = data[:top_n_entries]
    
#     # Number of rows to extract
#     N = n_rows
    
#     ARTIFACT_PATH = '/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/artifacts'
    
#     # Extract intent, code pairs, and first N rows of the dataset, and execute each intent
#     rows = []
#     for entry in tqdm(data):
#         nb_name = entry.get("notebook_name")
#         work_dir = entry.get("work_dir")
#         print("Running: ", nb_name)

#         # Construct the dataset folder path
#         dataset_folder_path = os.path.join(ARTIFACT_PATH, work_dir)

#         # change to the folder containing the datasets for this notebook
#         #os.chdir(dataset_folder_path)
        
#         # Find all CSV files in the folder
#         csv_files = glob(os.path.join(dataset_folder_path, "*.csv"))
        
#         # Load the first CSV file if any exist
#         if csv_files:
#             dataset_file_path = csv_files[0]  # Use the first CSV file
    
#             # Detect the file encoding
#             with open(dataset_file_path, "rb") as f:
#                 result = chardet.detect(f.read())
#                 encoding = result["encoding"]
    
#             # Use the detected encoding
#             dataset_df = pd.read_csv(dataset_file_path, encoding=encoding)
#             first_n_rows = pd.DataFrame(dataset_df.head(N))  # Convert to DataFrame
#         else:
#             first_n_rows = None  # Handle missing dataset files
        
#         # First turn input are the imports and dataset load, so execute it first
#         nb_header = entry.get("turns", [])[0]["input"]

#         # Replace CSV reads with the first_n_rows DataFrame
#         if first_n_rows is not None:
#             nb_header = replace_csv_reads_with_dataframe(nb_header, 
#                                                          os.path.basename(dataset_file_path), 
#                                                          dataframe_name="first_n_rows")

#         exec_state = {"pd": pd, "first_n_rows": first_n_rows}  # Add first_n_rows to exec_state
#         outputs, exec_state, error_msg = execute_intent_code(exec_state, nb_header)
        
#         # Initialize the execution state with the output from the header execution
#         inputs = outputs 

#         # Serialize the exec_state using pickle
#         #serialized_exec_state = pickle.dumps(exec_state)

#         # we mark an intents with erorr if any of the previous intents had errors
#         execute_error = False

#         # check if header had errors
#         if "error" in outputs:
#             execute_error = True
        
#         for i, turn in enumerate(entry.get("turns", [])):
#             intent = turn["turn"]["intent"]["value"]
#             code = turn["turn"]["code"]["value"]
            
#             # Execute the code intent
#             outputs, exec_state, error_msg = execute_intent_code(exec_state, code)

#             # check if this intent had an error
#             if "error" in outputs:
#                 execute_error=True
            
#             # Append the results
#             rows.append({
#                 "nb_name": nb_name,
#                 "work_dir": work_dir,
#                 'nb_setup_code': nb_header,
#                 "intent_number": i,
#                 "intent": intent,
#                 "code": code,
#                 #"exec_state": str(serialized_exec_state),
#                 "inputs": str(inputs),  # Inputs for this intent
#                 "outputs": str(outputs),  # Outputs from this intent
#                 "execute_error": execute_error,
#                 'error_msg': error_msg
#             })

#             # Update inputs for the next intent
#             inputs = outputs

#         #clean up memory for all notebook ouputs
#         cleanup_exec(outputs)

#     # Create a DataFrame
#     df = pd.DataFrame(rows)

#     return df

# df = transform_dataset(
#     '/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/dataset.json',
#     n_rows=10,
#     top_n_entries=None,  # limit num notebooks to process
#     #specific_nb=None #'dataset_athlete_events/notebook_1/annotated.ipynb'
# )
# # Display the DataFrame
# df

In [None]:
# Transform the ARCADE existing-tasks split to our dataset format
df = transform_existing_dataset(
    datasets_json='/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/dataset.json',
    artifact_path='/kaggle/input/arcade-existing-v3',
    n_rows=10,
    top_n_entries=None  # Adjust as needed
)

In [None]:
# an individual notebook
df[df.nb_name == 'dataset_athlete_events/notebook_1/annotated.ipynb']

In [None]:
# Count intents flagged with 'execute_error' (the commented-out line below is the older text search over 'inputs'/'outputs')
#df['execute_error'] = (df['inputs'].str.contains('error', case=False, na=False) | df['outputs'].str.contains('error', case=False, na=False))
error_count = df['execute_error'].sum()
print(f"Number of intents with errors: {error_count}")

In [None]:
# first 10 intents (rows) flagged with errors
df[df['execute_error']].head(10)

In [None]:
# number of intents per notebook
# (intent_number is a 0-based index, so rows are counted instead of taking its max)
df.groupby('nb_name').intent_number.count().describe()

In [None]:
import pickle

def save_to_pickle(df, file_path='arcade_existing_transformed.pkl'):
    """
    Save selected columns of the transformed dataset as a pickled list of dicts.

    Only the columns nb_name, work_dir, intent_number, intent, code,
    nb_setup_code, inputs, outputs and execute_error are kept; any other
    column of `df` is dropped. Pickle is used so the code strings are stored
    exactly as they are, including whitespace.

    Parameters:
    - df (pandas.DataFrame): Frame holding at least the columns listed above
      (an empty frame is written as an empty list).
    - file_path (str): Destination of the pickle file.

    Raises:
    - KeyError: if `df` has rows but lacks one of the required columns.
    """
    columns_to_keep = (
        'nb_name',
        'work_dir',
        'intent_number',
        'intent',
        'code',  # stored verbatim, formatting preserved
        'nb_setup_code',
        'inputs',
        'outputs',
        'execute_error',
    )

    # One dict per row, restricted to the columns we need
    extracted_data = [
        {column: record[column] for column in columns_to_keep}
        for record in df.to_dict(orient='records')
    ]

    # Save using pickle to preserve exact string representation
    with open(file_path, 'wb') as f:
        pickle.dump(extracted_data, f)

    # Report the path actually written, not the default file name
    print(f"Saved data with preserved formatting to '{file_path}'")

# Save the processed DataFrame to a pickle file
save_to_pickle(df, f'/kaggle/working/arcade_existing_transformed.pkl')

In [None]:
def load_from_pkl(file_path):
    """
    Read a pickled list of record dicts and return it as a DataFrame.

    A short summary (shape and column names) is printed; when the frame has at
    least one row, the 'code' value of the first row is printed as well so the
    preserved formatting can be inspected.

    Parameters:
    - file_path (str): Path of the pickle file to read.

    Returns:
    - pandas.DataFrame built from the unpickled records.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on files you trust.
    with open(file_path, 'rb') as handle:
        records = pickle.load(handle)

    frame = pd.DataFrame(records)

    # Summary of what was loaded
    print(f"Loaded DataFrame with shape: {frame.shape}")
    print(f"Columns: {frame.columns.tolist()}")

    if len(frame) == 0:
        # Nothing to sample from
        return frame

    separator = "-" * 50
    first_code = frame.iloc[0]['code']
    print("\nSample code from first row:")
    print(separator)
    print(first_code)
    print(separator)
    return frame

# Test whether the saved pickle can be read back into a DataFrame.
# load_from_pkl already prints the shape, the column list and the first row's
# code, so repeating those prints here would only duplicate the output.
df_loaded = load_from_pkl('/kaggle/working/arcade_existing_transformed.pkl')

In [None]:
# check notebook with known error in header is marked with errors
df_loaded[df_loaded.nb_name == 'dataset_chipotle/notebook_1/annotated.ipynb']['execute_error']