In [None]:
import numpy as np
import pandas as pd 
import sys

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the ARCADE artifacts folder and print the full path of every file in it.
for current_dir, _, file_names in os.walk('/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/artifacts'):
    full_paths = [os.path.join(current_dir, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Kaggle API credentials. The literals below are placeholders only: provide the
# real values through the environment instead of committing them to the notebook.
# setdefault keeps credentials that are already exported rather than clobbering
# them with the placeholder text.
os.environ.setdefault('KAGGLE_USERNAME', 'KAGGLE_USERNAME')
os.environ.setdefault('KAGGLE_KEY', 'KAGGLE_KEY')

from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate with Kaggle API
api = KaggleApi()
api.authenticate()

# Install ARCADE

In [None]:
!git clone https://github.com/google-research/arcade-nl2code.git

In [None]:
# Create a package out of ARCADE: write a minimal setup.py into the cloned repo
# so that pip can install it.
setup_content = """
from setuptools import setup, find_packages

setup(
    name='arcade_nl2code',
    version='0.1',
    packages=find_packages(),
    install_requires=[
        'tensorflow',  # Add other dependencies here
    ],
)
"""

with open('/kaggle/working/arcade-nl2code/setup.py', 'w') as setup_file:
    setup_file.write(setup_content)

def create_init_files(directory):
    """Add an ``__init__.py`` to every sub-directory below ``directory``.

    Existing ``__init__.py`` files are left untouched. Hidden directories
    (for example ``.git``) and ``__pycache__`` folders are skipped entirely,
    so no files are written into version-control metadata or bytecode caches.
    The ``directory`` itself does not receive an ``__init__.py``.

    Args:
        directory: Root folder whose sub-directories should become packages.
    """
    for root, dirs, files in os.walk(directory):
        # Prune in place: with the default top-down walk, os.walk only descends
        # into the names that remain in `dirs`.
        dirs[:] = [d for d in dirs if not d.startswith('.') and d != '__pycache__']
        for dir_name in dirs:
            init_file_path = os.path.join(root, dir_name, '__init__.py')
            if not os.path.exists(init_file_path):
                with open(init_file_path, 'w') as f:
                    f.write("# This file makes the directory a Python package\n")
                print(f"Created: {init_file_path}")

# Root of the cloned ARCADE repository (single leading slash, matching the
# other absolute paths used in this notebook).
directory = '/kaggle/working/arcade-nl2code'
create_init_files(directory)

In [None]:
!pip install /kaggle/working/arcade-nl2code

In [None]:
# Add the package folders to the Python path. Each entry is appended only when
# it is missing, so re-running the cell does not pile up duplicates.
for _extra_path in (
    '/kaggle/working/arcade-nl2code',
    '/kaggle/working/arcade-nl2code/arcade_nl2code',
    '/kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset',
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)
print(sys.path)

In [None]:
# Pinned dependency versions, written out as a requirements file inside the
# cloned repository so they can be installed with `pip install -r`.
reqs_2022 = """
tensorflow-cpu==2.10.0
absl-py==1.3.0
pandas==1.5.2
dacite==1.7.0
nbformat==5.7.0
dill==0.3.6
sacrebleu==2.3.1
astor==0.8.1
folium==0.12.1
seaborn==0.12.2
vega==3.5.0
bokeh==2.4.3
plotly==5.10.0
matplotlib==3.6.2
chart_studio==1.1.0
"""

with open('/kaggle/working/arcade-nl2code/requirements_2022.txt', 'w') as reqs_file:
    reqs_file.write(reqs_2022)

In [None]:
#!pip install -r /kaggle/working/arcade-nl2code/arcade_nl2code/evaluation/requirements.txt
!pip install -r /kaggle/working/arcade-nl2code/requirements_2022.txt
!pip install seqio
!pip install diff_match_patch  # was missing in requirements

In [None]:
%%bash

# Print the metadata pip has for the tensorflow and tensorflow-text packages.
# NOTE(review): the requirements file written earlier pins tensorflow-cpu, not
# tensorflow — confirm these are the package names that should be inspected.
pip show tensorflow
pip show tensorflow-text

# Download ARCADE

In [None]:
!kaggle datasets download -d googleai/arcade-nl2code-dataset -p arcade_nl2code/annotated_dataset/dataset/

In [None]:
# %cd changes the kernel's working directory for all later cells, so relative
# paths used afterwards resolve against this dataset folder.
%cd /kaggle/working/arcade_nl2code/annotated_dataset/dataset
# -o overwrites already-extracted files without prompting
!unzip -o arcade-nl2code-dataset.zip

# Build Dataset

## Existing Tasks

In [None]:
%%bash

# Stop here if the dataset folder is missing instead of running the build
# script from the wrong directory.
cd /kaggle/working/arcade_nl2code/annotated_dataset || exit 1
# The PYTHONPATH assignment has to sit on the same command line as python:
# on a line of its own it only sets a shell variable that the next command
# does not receive unless the variable was already exported.
PYTHONPATH=../../ python /kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset/build_existing_tasks_split.py

## Transform Data Set

In [None]:
pip install --upgrade --force-reinstall pandas

In [None]:
import json
import os
import pickle
from glob import glob

import chardet
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

def execute_intent_code(exec_state, code):
    """
    Execute ``code`` inside ``exec_state`` and summarise the resulting variables.

    Args:
        exec_state: Dict used as the global namespace for ``exec``. It is
            mutated in place by the executed code.
        code: Python source text to execute.

    Returns:
        A ``(outputs, exec_state)`` tuple. ``outputs`` maps every variable whose
        name does not start with ``"__"`` and whose value is an int, float, str,
        bool or tuple to that value, and every DataFrame to its JSON string
        (``orient="records"``). Other values are omitted. If anything raises
        while executing or capturing, ``outputs`` is ``{"error": <message>}``.
        ``exec_state`` is the same dict object that was passed in.
    """
    try:
        # Use a non-interactive backend for matplotlib to suppress plots
        plt.switch_backend('Agg')

        # NOTE: exec runs arbitrary code taken from the dataset; only feed it
        # notebooks from a trusted source.
        exec(code, exec_state)

        # Capture the outputs (all supported variables in the execution state)
        outputs = {}
        for key, value in exec_state.items():
            if key.startswith("__"):
                continue
            if isinstance(value, (int, float, str, bool, tuple)):
                outputs[key] = value
            elif isinstance(value, pd.DataFrame):
                # Convert DataFrame to JSON format
                outputs[key] = value.to_json(orient="records")
    except Exception as e:
        outputs = {"error": str(e)}
    finally:
        # Release figures even when the executed code failed part-way through;
        # otherwise they accumulate across the many intents that are executed.
        plt.close('all')

    return outputs, exec_state

def _pickle_exec_state(exec_state):
    """Pickle the entries of ``exec_state`` that pickle can serialise.

    The execution state always holds objects pickle rejects (for example the
    ``pd`` module placed in it by ``transform_dataset``), so pickling the dict
    as a whole would raise. Names starting with ``"__"`` and values that fail
    to pickle are left out.

    Returns:
        The ``bytes`` produced by ``pickle.dumps`` for the filtered dict.
    """
    picklable = {}
    for name, value in exec_state.items():
        if name.startswith("__"):
            continue
        try:
            pickle.dumps(value)
        except Exception:
            # Pickling arbitrary user objects can fail in many ways
            # (TypeError, PicklingError, AttributeError, ...); skip the entry.
            continue
        picklable[name] = value
    return pickle.dumps(picklable)


def transform_dataset(datasets_json, n_rows=10, top_n_entries=None):
    """
    Transform the ARCADE dataset into one row per (notebook, intent).

    For every notebook entry the first CSV file in its artifact folder (in
    sorted order, so the choice is reproducible) is loaded and cut down to
    ``n_rows`` rows. The ``input`` of the first turn is executed as notebook
    setup, with reads of that CSV replaced by the truncated frame, and then the
    code of each turn is executed in order.

    Args:
        datasets_json: Path to the ARCADE ``dataset.json`` file.
        n_rows: Number of leading CSV rows made available to the notebook code.
        top_n_entries: If given, only the first ``top_n_entries`` notebook
            entries are processed.

    Returns:
        A DataFrame with the columns ``nb_name``, ``work_dir``,
        ``nb_setup_code``, ``intent_number``, ``intent``, ``code``,
        ``exec_state`` (string form of the pickled state captured after the
        setup code ran), ``inputs`` and ``outputs``. Entries without turns
        contribute no rows.
    """
    # Load the JSON file
    with open(datasets_json, 'r') as f:
        data = json.load(f)

    # Limit to the top `top_n_entries` if specified
    if top_n_entries is not None:
        data = data[:top_n_entries]

    # Number of rows to extract
    N = n_rows

    ARTIFACT_PATH = '/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/artifacts'

    # Extract intent, code pairs, and first N rows of the dataset, and execute each intent
    rows = []
    for entry in tqdm(data):
        nb_name = entry.get("notebook_name")
        work_dir = entry.get("work_dir")
        turns = entry.get("turns", [])

        # Without turns there is neither setup code nor an intent to execute
        if not turns:
            continue

        # Construct the dataset folder path
        dataset_folder_path = os.path.join(ARTIFACT_PATH, work_dir)

        # glob returns matches in arbitrary order; sort so that "the first CSV"
        # is the same file on every run
        csv_files = sorted(glob(os.path.join(dataset_folder_path, "*.csv")))

        # Load the first CSV file if any exist
        if csv_files:
            dataset_file_path = csv_files[0]

            # Detect the file encoding
            with open(dataset_file_path, "rb") as f:
                result = chardet.detect(f.read())
                encoding = result["encoding"]

            # Use the detected encoding
            dataset_df = pd.read_csv(dataset_file_path, encoding=encoding)
            first_n_rows = pd.DataFrame(dataset_df.head(N))  # Convert to DataFrame
        else:
            first_n_rows = None  # Handle missing dataset files

        # First turn input are the imports and dataset load, so execute it first
        nb_header = turns[0]["input"]

        # Replace CSV reads with the first_n_rows DataFrame
        if first_n_rows is not None:
            dataset_file_name = os.path.basename(dataset_file_path)
            nb_header = nb_header.replace(
                f"pd.read_csv('{dataset_file_name}')", "first_n_rows"
            ).replace(
                f'pd.read_csv("{dataset_file_name}")', "first_n_rows")

        exec_state = {"pd": pd, "first_n_rows": first_n_rows}  # Add first_n_rows to exec_state
        outputs, exec_state = execute_intent_code(exec_state, nb_header)

        # Initialize the execution state with the output from the header execution
        inputs = outputs

        # Serialize the picklable part of the post-setup state; the same
        # snapshot is stored on every row of this notebook
        serialized_exec_state = _pickle_exec_state(exec_state)

        for i, turn in enumerate(turns):
            intent = turn["turn"]["intent"]["value"]
            code = turn["turn"]["code"]["value"]

            # Execute the code intent
            outputs, exec_state = execute_intent_code(exec_state, code)

            # Append the results
            rows.append({
                "nb_name": nb_name,
                "work_dir": work_dir,
                'nb_setup_code': nb_header,
                "intent_number": i,
                "intent": intent,
                "code": code,
                "exec_state": str(serialized_exec_state),
                "inputs": inputs,  # Inputs for this intent
                "outputs": outputs,  # Outputs from this intent
            })

            # Update inputs for the next intent
            inputs = outputs

    # Create a DataFrame
    df = pd.DataFrame(rows)

    return df

# Transform the existing-tasks split, giving each notebook the first 10 CSV rows.
df = transform_dataset(
    datasets_json='/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/dataset.json',
    n_rows=10,
    top_n_entries=None,  # limit num notebooks to process
)
# Display the DataFrame
df

In [None]:
# Relative path: the CSV is written to the kernel's current working directory.
# to_csv also writes the DataFrame index as the first column by default.
df.to_csv("arcade_existing_transformed.csv")

## New Tasks

In [None]:
%%bash

# Stop here if the dataset folder is missing instead of running the build
# script from the wrong directory.
cd /kaggle/working/arcade_nl2code/annotated_dataset || exit 1
# The PYTHONPATH assignment has to sit on the same command line as python:
# on a line of its own it only sets a shell variable that the next command
# does not receive unless the variable was already exported.
PYTHONPATH=../../ python /kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset/build_new_tasks_split.py

# Preprocess Datasets

In [None]:
!sudo apt-get install faketime

## Existing Tasks

In [None]:
%%bash

# Generate schema-augmented prompts for the existing-tasks split. faketime pins
# the clock seen by the generator to the given timestamp.
DATASET_ROOT=/kaggle/working/arcade_nl2code/annotated_dataset/dataset
MAX_PROMPT_SIZE=900

# Collect the generator flags in an array so each one sits on its own line
# without backslash continuations.
generator_args=(
    --dataset "${DATASET_ROOT}/existing_tasks/dataset.json"
    --output_folder "${DATASET_ROOT}/existing_tasks/derived_datasets/"
    --runtime_artifacts_root "${DATASET_ROOT}/existing_tasks/artifacts/"
    --schema_representation_method "originating_dfs.header_description.after_variable_cell"
    --max_prompt_size "${MAX_PROMPT_SIZE}"
    --truncate_metadata_path "${DATASET_ROOT}/existing_tasks/derived_datasets/dataset.schema.originating_dfs.header_description.after_variable_cell.maxp900.maxp_no_prefix-1.maxctxcell-1.truncate_metadata.json"
)

faketime "2022-12-10 12:00:00" python -m arcade_nl2code.annotated_dataset.generate_schema_augmented_prompts "${generator_args[@]}"

# Run Evaluation

In [None]:
%%bash

# Run ARCADE's get_dummy_prediction.py on the new-tasks prompt file and store
# the result where the evaluation step reads its predictions from.
# NOTE(review): this notebook only generates prompts for existing_tasks, and
# that file name starts with "dataset.schema" rather than "dataset.+schema" —
# confirm the input file below actually exists.
input_json=/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/derived_datasets/dataset.+schema.originating_dfs.header_description.after_variable_cell.maxp900.maxp_no_prefix-1.maxctxcell-1.json
output_json=/kaggle/working/arcade_nl2code/evaluation/test_data/dummy_prediction.json

PYTHONPATH=. python /kaggle/working/arcade-nl2code/arcade_nl2code/evaluation/scripts/get_dummy_prediction.py \
    --input "${input_json}" \
    --output "${output_json}"

In [None]:
# Refresh the apt package index, then install Docker without a confirmation prompt (-y)
!apt-get update
!apt-get install -y docker.io

In [None]:
# Start the docker service so the build/run commands in the next cell can reach it
!service docker start

In [None]:
%%bash

# Abort on the first failing command so `docker run` is never attempted after
# a failed `cd` or `docker build`.
set -e

cd /kaggle/working/arcade-nl2code/arcade_nl2code/evaluation/

docker build -t notebook_evaluator .

PROJECT_ROOT="$(dirname "$(pwd)")"

# No -it here: a %%bash cell has no terminal attached, and `docker run -t`
# refuses to start when its input is not a TTY.
# NOTE(review): both mount sources live under the cloned repo
# (/kaggle/working/arcade-nl2code/...), while earlier cells write the dummy
# prediction and the dataset under /kaggle/working/arcade_nl2code/... — confirm
# the mounted folders contain the expected files.
docker run --shm-size=2g \
  --mount type=bind,source="${PROJECT_ROOT}/evaluation/test_data/",target=/data \
  --mount type=bind,source="${PROJECT_ROOT}/annotated_dataset/dataset/new_tasks/artifacts",target=/artifacts \
  -w / \
  --entrypoint /opt/conda/bin/python \
  notebook_evaluator:latest \
  -m arcade_nl2code.evaluation.execution_evaluation_main \
  --prediction_file /data/dummy_prediction.json \
  --output_path /data/ \
  --runtime_artifact_root /artifacts \
  --lm_output_postprocessor extract_first_cell_block  \
  --split_episode \
  --noreuse_state \
  --timeout 30 \
  --num_workers 20