In [None]:
import numpy as np
import pandas as pd 
import sys

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the ARCADE input dataset.
for root, _subdirs, names in os.walk('/kaggle/input/arcade-new'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Credentials come from the environment. The empty strings are placeholders that
# are only used when nothing is configured, so values that are already set are
# no longer overwritten. Do not paste real credentials into the notebook.
# NOTE(review): presumably the Kaggle client reads these variables while
# authenticating, which is why they are set before the import below -- confirm.
os.environ.setdefault('KAGGLE_USERNAME', '')
os.environ.setdefault('KAGGLE_KEY', '')

from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate with Kaggle API
api = KaggleApi()
api.authenticate()

# # write out the kaggle.json file anyhow
# kaggle_token = {"username": os.environ['KAGGLE_USERNAME'],
#              "key": os.environ['KAGGLE_KEY']}

# os.makedirs("~/.kaggle", exist_ok=True)
# with open("~/.kaggle/kaggle.json", "w") as f:
#     json.dump(kaggle_token, f)

# # Set permissions for the file
# os.chmod("~/.kaggle/kaggle.json", 600)

# Install ARCADE

In [None]:
# Clone the ARCADE (arcade-nl2code) sources into the current working directory.
# Later cells expect the checkout at /kaggle/working/arcade-nl2code.
!git clone https://github.com/google-research/arcade-nl2code.git

In [None]:
# Create a package out of ARCADE so that it can be pip-installed:
# the cloned repository gets a minimal setup.py written into its root.
setup_content = """
from setuptools import setup, find_packages

setup(
    name='arcade_nl2code',
    version='0.1',
    packages=find_packages(),
    install_requires=[
        'tensorflow',  # Add other dependencies here
    ],
)
"""

# Overwrites any setup.py already present in the checkout.
with open('/kaggle/working/arcade-nl2code/setup.py', 'w') as file:
    file.write(setup_content)

def create_init_files(directory):
    """
    Make every sub-directory of `directory` an importable Python package.

    Walks `directory` recursively and writes a one-line `__init__.py` into each
    sub-directory that does not already have one. Existing `__init__.py` files
    are left untouched, and `directory` itself does not get one.

    Hidden directories (names starting with '.', e.g. `.git`) are skipped along
    with everything below them, so repository metadata is not polluted.

    Parameters:
    - directory (str): Root directory to walk.
    """
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk does not descend into hidden directories.
        dirs[:] = [name for name in dirs if not name.startswith('.')]
        for dir_name in dirs:
            init_file_path = os.path.join(root, dir_name, '__init__.py')
            if not os.path.exists(init_file_path):
                with open(init_file_path, 'w') as f:
                    f.write("# This file makes the directory a Python package\n")
                print(f"Created: {init_file_path}")

# Root of the cloned repository (same location the setup.py was written to).
directory = '/kaggle/working/arcade-nl2code'
create_init_files(directory)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install /kaggle/working/arcade-nl2code

In [None]:
# Add the package directories to the Python path.
# Entries already present are skipped so re-running the cell does not pile up duplicates.
for _arcade_path in (
    '/kaggle/working/arcade-nl2code',
    '/kaggle/working/arcade-nl2code/arcade_nl2code',
    '/kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset',
):
    if _arcade_path not in sys.path:
        sys.path.append(_arcade_path)
print(sys.path)

In [None]:
# create a requirements file for possible versions 
# reqs_2022 = """
# tensorflow-cpu==2.10.0
# absl-py==1.3.0
# pandas==1.5.2
# dacite==1.7.0
# nbformat==5.7.0
# dill==0.3.6
# sacrebleu==2.3.1
# astor==0.8.1
# folium==0.12.1
# seaborn==0.12.2
# vega==3.5.0
# bokeh==2.4.3
# plotly==5.10.0
# matplotlib==3.6.2
# chart_studio==1.1.0
# """

# with open('/kaggle/working/arcade-nl2code/requirements_2022.txt', 'w') as file:
#     file.write(reqs_2022)

In [None]:
#!pip install -r /kaggle/working/arcade-nl2code/arcade_nl2code/evaluation/requirements.txt
# !pip install -r /kaggle/working/arcade-nl2code/requirements_2022.txt
# !pip install seqio
# !pip install diff_match_patch  # was missing in requirements

In [None]:
%%bash

# Report which TensorFlow / TensorFlow Text packages (if any) are installed.
pip show tensorflow
pip show tensorflow-text

# Download ARCADE

In [None]:
# Absolute target path: the next cell unzips from exactly this directory, and the
# download no longer depends on the kernel's current working directory.
!kaggle datasets download -d googleai/arcade-nl2code-dataset -p /kaggle/working/arcade_nl2code/annotated_dataset/dataset/

In [None]:
# NOTE: %cd changes the kernel's working directory for all following cells.
%cd /kaggle/working/arcade_nl2code/annotated_dataset/dataset
# -o overwrites existing files without prompting, so the cell can be re-run.
!unzip -o arcade-nl2code-dataset.zip

# Build Dataset

In [None]:
#pip install --force-reinstall pandas==1.3.0

In [None]:
# Explicit %pip magic: does not rely on automagic being enabled.
# NOTE(review): pandas is already imported in this kernel and the version is not
# pinned -- a kernel restart is presumably needed for the new build to be used.
%pip install --upgrade --force-reinstall pandas

In [None]:
import pandas as pd

# Both files are filtered and rewritten in place by this cell.
NEW_TASKS_JSON = "/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/dataset.json"
PROVENANCE_CSV = "/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/kaggle_dataset_provenance.csv"

# Kaggle refs ("owner/slug") of known missing datasets, removed to avoid load errors.
MISSING_DATASET_REFS = [
    'sasikaamarasinghe/student-performance-gce-al-exam-2020-sri-lanka',
    'nancyalaswad90/finding-donors-for-charityml',
    'saketpradhan/alternative-fuel-vehicles-in-the-us',
    'zzettrkalpakbal/full-filled-brain-stroke-dataset',
    'azminetoushikwasi/gameloft-android-games-collection-2022',
    'pranav941/russia-to-world-trade14m-data-points',
    'ankanhore545/rafael-nadal',
    'lucascantu/top-5000-albums-of-all-time-spotify-features',
]
# dataset.json identifies a dataset by its slug only (the part after "owner/").
MISSING_DATASET_SLUGS = [ref.split('/', 1)[1] for ref in MISSING_DATASET_REFS]

# Drop the tasks that belong to a missing dataset.
new_dataset = pd.read_json(NEW_TASKS_JSON)
new_dataset = new_dataset[~new_dataset.dataset.isin(MISSING_DATASET_SLUGS)]
new_dataset.to_json(NEW_TASKS_JSON, orient='records')

# Drop the matching provenance rows and default missing versions to 0.0.
# assign() builds a new frame, which avoids writing a column into a filtered slice.
kaggle_dataset_prov = pd.read_csv(PROVENANCE_CSV)
kaggle_dataset_prov = (
    kaggle_dataset_prov[~kaggle_dataset_prov.ref.isin(MISSING_DATASET_REFS)]
    .assign(Version=lambda prov: prov['Version'].fillna(0.0))
)
kaggle_dataset_prov.to_csv(PROVENANCE_CSV, index=False)

In [None]:
# Sanity check: these slugs were filtered out above, so no rows should be shown.
new_dataset[new_dataset.dataset.isin([
    'student-performance-gce-al-exam-2020-sri-lanka',
    'finding-donors-for-charityml',
    'alternative-fuel-vehicles-in-the-us',
])]

In [None]:
# Sanity check: these refs were filtered out above, so no rows should be shown.
kaggle_dataset_prov[kaggle_dataset_prov.ref.isin([
    'sasikaamarasinghe/student-performance-gce-al-exam-2020-sri-lanka',
    'nancyalaswad90/finding-donors-for-charityml',
    'saketpradhan/alternative-fuel-vehicles-in-the-us',
])]

In [None]:
# break up into parts to get around kaggle request limits

def divide_dataset(dataset, rows_per_part):
    """
    Split `dataset` into consecutive slices of at most `rows_per_part` rows.

    Parameters:
    - dataset (pd.DataFrame): The dataset to divide (anything supporting len() and slicing works).
    - rows_per_part (int): Maximum number of rows in each part; the last part may be shorter.

    Returns:
    - list of pd.DataFrame: The parts, in their original order.
    """
    parts = []
    for start in range(0, len(dataset), rows_per_part):
        stop = start + rows_per_part
        parts.append(dataset[start:stop])
    return parts

# At most 25 provenance rows per part.
kaggle_dataset_prov_parts = divide_dataset(kaggle_dataset_prov, 25)

In [None]:
# Number of parts the provenance table was split into.
len(kaggle_dataset_prov_parts)

In [None]:
import subprocess

def run_bash_command_for_each_part(dataset_parts):
    """
    Runs the ARCADE `build_new_tasks_split.py` script once per dataset part.

    For every part, the provenance CSV read by the build script is overwritten
    with just that part, then the script is run in a bash subprocess and its
    output is printed. A failing part is reported and the loop continues.

    When the loop ends (normally or through an exception) the provenance CSV is
    rewritten with all parts concatenated, so it is not left holding only the
    last part.

    Parameters:
    - dataset_parts (iterable of pd.DataFrame): Parts of the provenance table.
    """
    provenance_csv = "/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/kaggle_dataset_provenance.csv"

    # `set -e` aborts the script if `cd` fails instead of running the build from
    # the wrong directory. PYTHONPATH is written as a prefix of the python
    # command: a standalone assignment on its own line only sets a shell variable
    # and is not passed to the child process unless it was already exported.
    bash_command = """
    set -e
    cd /kaggle/working/arcade_nl2code/annotated_dataset
    PYTHONPATH=../../ python /kaggle/working/arcade-nl2code/arcade_nl2code/annotated_dataset/build_new_tasks_split.py
    """

    # Materialize once: the parts are iterated here and concatenated afterwards.
    dataset_parts = list(dataset_parts)
    if not dataset_parts:
        return

    try:
        for i, part in enumerate(dataset_parts):
            print(f"Processing part {i + 1}...")

            print(part)
            # update the list of datasets to process
            part.to_csv(provenance_csv, index=False)

            # Run the bash command and capture output
            process = subprocess.run(
                bash_command,
                shell=True,
                executable='/bin/bash',
                capture_output=True,
                text=True  # Ensures output is returned as a string
            )

            # Check for errors
            if process.returncode != 0:
                print(f"Error occurred while processing part {i + 1}")
                print(f"Standard Output:\n{process.stdout}")
                print(f"Standard Error:\n{process.stderr}")
            else:
                print(f"Successfully processed part {i + 1}")
                print(f"Standard Output:\n{process.stdout}")
    finally:
        # Restore the complete provenance list for later cells and re-runs.
        pd.concat(dataset_parts).to_csv(provenance_csv, index=False)

# Build the new-tasks split one provenance part at a time.
run_bash_command_for_each_part(kaggle_dataset_prov_parts)

# Upload source files to new version of ARCADE dataset

In [None]:
import os
import shutil
import json
import subprocess

def copy_to_new_dataset_version(source_dir, dataset_slug, version_message):
    """
    Uploads the contents of `source_dir` as a new version of an existing Kaggle
    dataset using the command-line Kaggle API tools.

    The files are first staged in a temporary directory together with a generated
    `dataset-metadata.json`. The staging directory is removed afterwards, even if
    copying or the upload raises.

    Parameters:
    - source_dir (str): Path to the source directory containing files and folders to copy.
    - dataset_slug (str): The Kaggle dataset slug in the format 'username/dataset-name'.
    - version_message (str): A message describing the changes in the new dataset version.

    A non-zero exit status of the `kaggle` command is reported with a printed
    message and is not re-raised.
    """
    # Create a temporary directory to stage files for upload
    temp_dir = "/kaggle/temp_dataset"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)  # Clear the temp directory if it exists
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Copy all files and folders from the source directory to the temp directory
        for item in os.listdir(source_dir):
            source_path = os.path.join(source_dir, item)
            dest_path = os.path.join(temp_dir, item)
            if os.path.isdir(source_path):
                shutil.copytree(source_path, dest_path)
            else:
                shutil.copy2(source_path, dest_path)

        # Create the dataset-metadata.json file in the temp directory
        metadata_file = os.path.join(temp_dir, "dataset-metadata.json")
        metadata_content = {
            "title": dataset_slug.split("/")[-1],  # Use the dataset name as the title
            "id": dataset_slug,  # The dataset slug (e.g., 'username/dataset-name')
            "licenses": [{"name": "CC0-1.0"}]  # Default license
        }
        with open(metadata_file, "w") as f:
            json.dump(metadata_content, f, indent=4)

        # Use the Kaggle CLI to create a new version of the dataset
        print(f"Creating a new version of the dataset: {dataset_slug}")
        try:
            subprocess.run(
                [
                    "kaggle", "datasets", "version",
                    "-p", temp_dir,
                    "-m", version_message,
                    "--dir-mode", "zip"
                ],
                check=True
            )
            print("New dataset version created successfully!")
        except subprocess.CalledProcessError as e:
            print(f"Error creating a new dataset version: {e}")
    finally:
        # Always clean up the staging directory; ignore_errors keeps a cleanup
        # problem from hiding the original exception.
        shutil.rmtree(temp_dir, ignore_errors=True)

# Example usage
# NOTE(review): the slug below is a placeholder; the upload presumably cannot
# succeed until it is replaced with a real 'username/dataset-name' -- confirm.
source_directory = "/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/artifacts"  # Artifacts folder whose contents are uploaded
existing_dataset_slug = "existing_dataset_slug"  # Replace with your dataset slug
version_description = "Updated dataset with new files and folders from the working directory."

copy_to_new_dataset_version(source_directory, existing_dataset_slug, version_description)

In [None]:
import shutil
import os

def create_zip_of_folder(folder_path, output_zip_path):
    """
    Archive a folder's contents into `<output_zip_path>.zip`.

    Parameters:
    - folder_path (str): Folder whose contents are zipped.
    - output_zip_path (str): Target path of the archive, without the .zip extension.

    Prints an error and creates nothing when `folder_path` does not exist.
    """
    if os.path.exists(folder_path):
        # make_archive appends the '.zip' suffix itself.
        shutil.make_archive(output_zip_path, 'zip', folder_path)
        print(f"Zip file created at: {output_zip_path}.zip")
    else:
        print(f"Error: Folder {folder_path} does not exist.")

# Example usage: bundle the artifacts folder into /kaggle/working/arcade_new_datasets.zip
folder_to_zip = "/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/artifacts"
output_zip = "/kaggle/working/arcade_new_datasets"

create_zip_of_folder(folder_to_zip, output_zip)

# Transform to Create New Notebook Dataset

In [None]:
import pandas as pd
import json
import os
from glob import glob
import chardet
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import pprint
import re
import gc

from model_eval import execute_intent_code

def replace_csv_reads_with_dataframe(code, dataset_file_name, dataframe_name="first_n_rows"):
    """
    Rewrite `pd.read_csv(...)` calls in `code` so they refer to an existing DataFrame.

    Two kinds of call are replaced by the bare name `dataframe_name`:
    a call whose only argument is the quoted literal `dataset_file_name`, and a
    call whose only argument is a single bare word such as `pd.read_csv(PATH)`.
    Calls with other arguments (another file name, extra keyword arguments) are
    left unchanged.

    Args:
        code (str): The code string to process.
        dataset_file_name (str): File name as it appears inside the quotes.
        dataframe_name (str): Name substituted for the matched calls.

    Returns:
        str: The rewritten code.
    """
    patterns = (
        # pd.read_csv('athlete_events.csv') or pd.read_csv("athlete_events.csv")
        rf"pd\.read_csv\(['\"]{re.escape(dataset_file_name)}['\"]\)",
        # pd.read_csv(PATH)
        r"pd\.read_csv\(\s*\w+\s*\)",
    )
    for pattern in patterns:
        code = re.sub(pattern, dataframe_name, code)
    return code


def cleanup_exec(outputs):
    '''Cleans up memory after a notebook execution.

    Removes every module-level (global) name that matches a key of `outputs`,
    then forces a garbage collection pass. Keys without a matching global are
    ignored.

    The previous `del locals()[...]` branch was dropped: deleting from the
    mapping returned by locals() does not unbind a function's local variables,
    so it never had any effect.

    NOTE(review): this deletes globals purely by name. If an output key matches
    a name this notebook itself relies on (for example `pd` or `df`), that name
    is removed as well -- verify which keys `outputs` can contain.
    '''
    module_globals = globals()

    # Deallocate the variables (iterate over a snapshot of the keys).
    for var_name in list(outputs.keys()):
        module_globals.pop(var_name, None)

    # Force garbage collection
    gc.collect()

def calc_total_input_sizes(csv_files):
    """
    Calculate the total size, total rows, and total columns for a list of CSV files.

    Each file is read once. The first line is treated as the header: its fields
    (parsed with the csv module, so quoted commas do not split a field) give the
    column count, and every following line counts as one data row. An empty file
    contributes 0 rows and 0 columns. Undecodable bytes are replaced instead of
    raising, so files in other encodings are still counted.

    A file that cannot be processed is reported and contributes nothing to any
    of the three totals.

    Parameters:
    - csv_files (list): List of paths to CSV files.

    Returns:
    - tuple: (total_size_bytes, total_rows, total_columns)
    """
    import csv  # local import: only needed to parse the header line

    total_size_bytes = 0
    total_rows = 0
    total_columns = 0

    for csv_file in csv_files:
        try:
            file_size = os.path.getsize(csv_file)

            with open(csv_file, 'r', errors='replace') as f:
                header_line = f.readline()
                # Lines remaining after the header are the data rows.
                row_count = sum(1 for _ in f)

            if header_line:
                column_count = len(next(csv.reader([header_line]), []))
            else:
                column_count = 0
        except Exception as e:
            print(f"Error processing file {csv_file}: {e}")
            continue

        # Update the totals only after the whole file was processed successfully.
        total_size_bytes += file_size
        total_rows += row_count
        total_columns += column_count

    return total_size_bytes, total_rows, total_columns

def transform_new_dataset(datasets_json,
                          artifact_path='/kaggle/input/arcade-new',
                          n_rows=10,
                          top_n_entries=None,
                          specific_nb=None):
    """
    Executes every ARCADE notebook entry turn by turn and collects one row per intent.

    For each entry, the first turn's "input" (imports and dataset load) is run as
    a header from inside the entry's dataset folder, then the code of every turn
    is executed in order on the same execution state. `execute_error` is sticky:
    once the header or any turn reports an error, all later turns of that
    notebook are marked as well.

    Entries without turns are skipped with a message. The working directory,
    which the generated header changes, is restored before returning.

    Parameters:
    - datasets_json (str): Path to the dataset JSON file (a list of entries).
    - artifact_path (str): Directory that contains one `work_dir` folder per entry.
    - n_rows (int): Currently unused; kept so existing calls keep working.
    - top_n_entries (int | None): If given, only the first `top_n_entries` entries are processed.
    - specific_nb: Currently unused; kept so existing calls keep working.

    Returns:
    - pd.DataFrame: One row per executed intent (notebook name, work dir, setup
      code, intent number/text, code, stringified inputs and outputs, error flag
      and message, and input-file statistics).
    """
    # Load the JSON file
    with open(datasets_json, 'r') as f:
        data = json.load(f)

    # Limit to the top `top_n_entries` if specified
    if top_n_entries is not None:
        data = data[:top_n_entries]

    rows = []

    # The header executed for every entry calls os.chdir(); remember where we
    # started so the caller's working directory can be restored afterwards.
    original_cwd = os.getcwd()
    try:
        for entry in tqdm(data):
            nb_name = entry.get("notebook_name")
            work_dir = entry.get("work_dir")
            print("Running: ", nb_name)

            turns = entry.get("turns", [])
            if not turns:
                # Without a first turn there is no header to execute.
                print(f"Skipping {nb_name}: entry has no turns")
                continue

            # Construct the dataset folder path
            dataset_folder_path = os.path.join(artifact_path, work_dir)

            # Find all CSV files in the folder
            csv_files = glob(os.path.join(dataset_folder_path, 'dataset', "*.csv"))
            count_csvs = len(csv_files)
            total_size_bytes, total_rows, total_cols = calc_total_input_sizes(csv_files)

            # Print the total size and row count
            print(f"Number input files: {count_csvs}")
            print(f"Total size: {total_size_bytes} bytes")
            print(f"Total rows: {total_rows}")
            print(f"Total cols: {total_cols}")
            print("Input files:", csv_files)

            # First turn input are the imports and dataset load, so execute it first
            nb_header = turns[0]["input"]

            # Prepend code to change the working directory. repr() quotes and
            # escapes the path, so quotes or backslashes in it cannot break the
            # generated statement.
            change_dir_code = f"import os\nos.chdir({dataset_folder_path!r})\n"
            nb_header = change_dir_code + nb_header

            print("Executing notebook header")
            exec_state = {"pd": pd}  # Initialize execution state with Pandas
            outputs, exec_state = execute_intent_code(exec_state, nb_header)

            # The header's outputs are the inputs of the first intent
            inputs = outputs

            # Mark intents with errors if the header or any previous intent had errors
            execute_error = "error" in outputs

            for i, turn in enumerate(turns):
                print("Executing intent:", i)
                intent = turn["turn"]["intent"]["value"]
                code = turn["turn"]["code"]["value"]

                # Execute the code intent
                outputs, exec_state = execute_intent_code(exec_state, code)

                # Check if this intent had an error
                if "error" in outputs:
                    execute_error = True

                # Append the results
                rows.append({
                    "nb_name": nb_name,
                    "work_dir": work_dir,
                    'nb_setup_code': nb_header,
                    "intent_number": i,
                    "intent": intent,
                    "code": code,
                    "inputs": str(inputs),  # Inputs for this intent
                    "outputs": str(outputs),  # Outputs from this intent
                    "execute_error": execute_error,
                    'error_msg': outputs.get('error', ''),
                    'num_intput_files': count_csvs,
                    'total_input_size': total_size_bytes,
                    'total_input_rows': total_rows,
                    'total_input_cols': total_cols
                })

                # Update inputs for the next intent
                inputs = outputs

            # Clean up memory for the last outputs of this notebook
            cleanup_exec(outputs)
    finally:
        os.chdir(original_cwd)

    # Create a DataFrame
    return pd.DataFrame(rows)
    
# def transform_new_dataset(datasets_json, 
#                       artifact_path= '/kaggle/working/arcade_nl2code/annotated_dataset/dataset/existing_tasks/artifacts', 
#                       n_rows=10, 
#                       top_n_entries=None, 
#                       specific_nb=None):
#     """
#     Transforms the ARCADE dataset to the desired format by reading the initial input
#     and executing each intent one by one, processing only the top `top_n_entries` entries.
#     """
#     # Load the JSON file
#     with open(datasets_json, 'r') as f:
#         data = json.load(f)
    
#     # Limit to the top `top_n_entries` if specified
#     if top_n_entries is not None:
#         data = data[:top_n_entries]
    
#     # Number of rows to extract
#     N = n_rows
    
#     # Extract intent, code pairs, and first N rows of the dataset, and execute each intent
#     rows = []
#     for entry in tqdm(data):
#         nb_name = entry.get("notebook_name")
#         work_dir = entry.get("work_dir")
#         print("Running: ", nb_name)

#         # Construct the dataset folder path
#         dataset_folder_path = os.path.join(artifact_path, work_dir, 'dataset')

#         # change to the folder containing the datasets for this notebook
#         #print("Changing path to:", dataset_folder_path)
#         #os.chdir(dataset_folder_path)
        
#         # Find all CSV files in the folder
#         csv_files = glob(os.path.join(dataset_folder_path, "*.csv"))
#         print(csv_files)
        
#         # Load the first CSV file if any exist
#         if csv_files:
#             dataset_file_path = csv_files[0]  # Use the first CSV file
    
#             # Detect the file encoding
#             with open(dataset_file_path, "rb") as f:
#                 result = chardet.detect(f.read())
#                 encoding = result["encoding"]
    
#             # Use the detected encoding
#             try:
#                 dataset_df = pd.read_csv(dataset_file_path, encoding=encoding, on_bad_lines='skip')
#                 first_n_rows = pd.DataFrame(dataset_df.head(N))  # Convert to DataFrame
#             except Exception as e:
#                 print("Error reading first n rows", e)
#                 first_n_rows = pd.DataFrame()
                
#         else:
#             first_n_rows = None  # Handle missing dataset files
        
#         # First turn input are the imports and dataset load, so execute it first
#         nb_header = entry.get("turns", [])[0]["input"]

#         # Replace CSV reads with the first_n_rows DataFrame
#         if first_n_rows is not None:
#             nb_header = replace_csv_reads_with_dataframe(nb_header, 
#                                                          os.path.join('dataset', os.path.basename(dataset_file_path)), 
#                                                          dataframe_name="first_n_rows")

#         print("Executing nb header")
#         exec_state = {"pd": pd, "first_n_rows": first_n_rows}  # Add first_n_rows to exec_state
#         outputs, exec_state = execute_intent_code(exec_state, nb_header)
        
#         # Initialize the execution state with the output from the header execution
#         inputs = outputs 

#         # Serialize the exec_state using pickle
#         #serialized_exec_state = pickle.dumps(exec_state)

#         # we mark an intents with erorr if any of the previous intents had errors
#         execute_error = False

#         # check if header had errors
#         if "error" in outputs:
#             execute_error = True
        
#         for i, turn in enumerate(entry.get("turns", [])):
#             print("Executing intent:", i)
#             intent = turn["turn"]["intent"]["value"]
#             code = turn["turn"]["code"]["value"]
            
#             # Execute the code intent
#             outputs, exec_state = execute_intent_code(exec_state, code)

#             # check if this intent had an error
#             if "error" in outputs:
#                 execute_error=True
            
#             # Append the results
#             rows.append({
#                 "nb_name": nb_name,
#                 "work_dir": work_dir,
#                 'nb_setup_code': nb_header,
#                 "intent_number": i,
#                 "intent": intent,
#                 "code": code,
#                 #"exec_state": str(serialized_exec_state),
#                 "inputs": str(inputs),  # Inputs for this intent
#                 "outputs": str(outputs),  # Outputs from this intent
#                 "execute_error": execute_error,
#                 'error_msg': outputs.get('error','')
#             })

#             # Update inputs for the next intent
#             inputs = outputs

#         #clean up memory for all notebook ouputs
#         cleanup_exec(outputs)

#     # Create a DataFrame
#     df = pd.DataFrame(rows)

#     return df

def save_to_pickle(df, file_path='arcade_existing_transformed.pkl'):
    """
    Save selected columns of `df` as a pickled list of dictionaries.

    Only the columns listed below are kept (one dict per row, keys in this
    order); any other column of `df` is not written. Code strings are stored
    unchanged, so their exact formatting is preserved.

    Parameters:
    - df (pd.DataFrame): Frame containing at least the kept columns.
    - file_path (str): Destination pickle file.
    """
    kept_columns = (
        'nb_name',
        'work_dir',
        'intent_number',
        'intent',
        'code',  # This preserves exact formatting
        'nb_setup_code',
        'inputs',
        'outputs',
        'execute_error',
    )
    extracted_data = [
        {column: row[column] for column in kept_columns}
        for _, row in df.iterrows()
    ]

    # Save using pickle to preserve exact string representation
    with open(file_path, 'wb') as f:
        pickle.dump(extracted_data, f)

    # Report the file that was actually written, not a fixed name.
    print(f"Saved data with preserved formatting to '{file_path}'")

def load_from_pkl(file_path):
    """
    Load a pickled list of dictionaries and return it as a DataFrame.

    Prints the shape and column names of the loaded frame and, when it has at
    least one row, the `code` value of the first row so its formatting can be
    inspected.

    NOTE: pickle.load can execute arbitrary code; only use this on files
    written by this notebook.
    """
    with open(file_path, 'rb') as handle:
        records = pickle.load(handle)

    df_loaded = pd.DataFrame(records)

    # Summary of what was loaded
    print(f"Loaded DataFrame with shape: {df_loaded.shape}")
    print(f"Columns: {df_loaded.columns.tolist()}")

    # Show the first code sample between two separator lines
    if df_loaded.shape[0] > 0:
        separator = "-" * 50
        print("\nSample code from first row:")
        print(separator)
        print(df_loaded.iloc[0]['code'])
        print(separator)

    return df_loaded


def divide_and_process_dataset(json_path, artifact_path, n_parts, n_rows_output, start=0, end=None):
    """
    Divide the dataset.json into at most `n_parts` parts and process each part
    using transform_new_dataset.

    The chunk size is the ceiling of len(dataset) / n_parts, so there are never
    more than `n_parts` parts and a dataset with fewer rows than `n_parts` no
    longer fails. Each processed part is pickled to
    /kaggle/working/arcade_new_transformed_<i>.pkl (zero-based part index) and
    read back once as a check.

    Parameters:
    - json_path (str): Path to the original dataset.json file.
    - artifact_path (str): Path to the artifact directory. The per-part JSON files
      are also written here.
    - n_parts (int): Maximum number of parts to divide the dataset into (>= 1).
    - n_rows_output (int): Forwarded to transform_new_dataset as `n_rows`.
    - start (int): Starting index of the parts to process (inclusive).
    - end (int): Ending index of the parts to process (exclusive). If None, process until the last part.

    Raises:
    - ValueError: If `n_parts` is smaller than 1.
    """
    if n_parts < 1:
        raise ValueError(f"n_parts must be a positive integer, got {n_parts}")

    # Load the dataset.json into a DataFrame
    with open(json_path, 'r') as f:
        dataset = json.load(f)

    df = pd.DataFrame(dataset)
    if len(df) == 0:
        print("Dataset is empty; nothing to process.")
        return

    # Ceiling division without importing math: -(-a // b) == ceil(a / b).
    chunk_size = -(-len(df) // n_parts)
    dataset_parts = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Ensure the output directory exists
    os.makedirs(artifact_path, exist_ok=True)

    # Adjust the end parameter if not provided
    if end is None:
        end = len(dataset_parts)

    # Process only the specified range of parts
    for i, part in enumerate(dataset_parts[start:end], start=start):
        print(f"Processing part {i + 1}/{len(dataset_parts)}...")

        # Save the current part to a temporary JSON file
        part_path = os.path.join(artifact_path, f"dataset_part_{i + 1}.json")
        part.to_json(part_path, orient='records', lines=False)

        # Call the transform_new_dataset function for this part
        df_part = transform_new_dataset(
            part_path,
            artifact_path=artifact_path,
            n_rows=n_rows_output,
            top_n_entries=None  # Adjust as needed
        )

        # Display the processed DataFrame for this part
        print(f"Processed DataFrame for part {i + 1}:")
        print(df_part)

        # Every row is one intent, so report intents and notebooks separately.
        error_rows = df_part[df_part['execute_error'].astype(bool)]
        print(f"Number of intents with errors: {len(error_rows)}")
        print(f"Number of notebooks with errors: {error_rows['nb_name'].nunique()}")

        # Save the processed DataFrame to a pickle file
        save_to_pickle(df_part, f'/kaggle/working/arcade_new_transformed_{i}.pkl')

        # Read the pickle back as a round-trip check; load_from_pkl prints the
        # shape, columns and a code sample of what it loaded.
        load_from_pkl(f'/kaggle/working/arcade_new_transformed_{i}.pkl')

        # Clear memory explicitly
        del part
        gc.collect()

In [None]:
# Run every ARCADE "new tasks" notebook and convert the results to our dataset format
df = transform_new_dataset(
    '/kaggle/working/arcade_nl2code/annotated_dataset/dataset/new_tasks/dataset.json',
    artifact_path='/kaggle/input/arcade-new',
    n_rows=10,
    top_n_entries=None,  # Adjust as needed
)

# Persist the result as a pickle file
save_to_pickle(df, '/kaggle/working/arcade_new_transformed.pkl')

In [None]:
# Test whether we can read back the DataFrame from the pickle file
df = load_from_pkl('/kaggle/working/arcade_new_transformed.pkl')

# Count of notebooks
print(f"Number of notebooks: {df['nb_name'].nunique()}")

# Each row is one intent. transform_new_dataset keeps execute_error set for every
# intent after the first failure in a notebook, so the row count and the notebook
# count are reported separately.
error_count = df['execute_error'].sum()
notebooks_with_errors = df.loc[df['execute_error'].astype(bool), 'nb_name'].nunique()
print(f"Number of intents with errors: {error_count}")
print(f"Number of notebooks with errors: {notebooks_with_errors}")

# Verify the data loaded correctly
print(f"Loaded DataFrame with shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Check a sample of the code to ensure formatting is preserved
if len(df) > 0:
    print("\nSample code from first row:")
    print("-" * 50)
    print(df.iloc[0]['code'])
    print("-" * 50)

In [None]:
# Setup code (working-directory change + notebook header) of the first row.
print(df.nb_setup_code.iloc[0])

In [None]:
# At which intent number do the errors typically start?
# Counts the rows flagged with execute_error per intent number, in ascending order.
df[df.execute_error == True].intent_number.value_counts().sort_index()

In [None]:
# Stringified outputs of every row flagged with execute_error.
df[df.execute_error == True].outputs

In [None]:
# # part 1
# divide_and_process_dataset(
    
#     n_parts=5,  # Divide into 5 parts
#     n_rows_output=10,
#     start=0,  
#     end=1 
# )

In [None]:
# def concatenate_pickles(file_paths, output_path):
#     """
#     Concatenate multiple pickle files into a single DataFrame and save the result.

#     Parameters:
#     - file_paths (list): List of file paths to the pickle files.
#     - output_path (str): Path to save the concatenated DataFrame as a pickle file.
#     """
#     dataframes = []

#     # Load each pickle file and append to the list
#     for file_path in file_paths:
#         print(f"Loading {file_path}...")
#         df = pd.read_pickle(file_path)
#         dataframes.append(df)

#     # Concatenate all DataFrames
#     concatenated_df = pd.concat(dataframes, ignore_index=True)

#     # Save the concatenated DataFrame to a new pickle file
#     concatenated_df.to_pickle(output_path)
#     print(f"Concatenated DataFrame saved to {output_path}")

#     return concatenated_df

# List of pickle files to concatenate
# file_paths = [
#     '/kaggle/working/arcade_new_transformed_0.pkl',
#     '/kaggle/working/arcade_new_transformed_1.pkl',
#     '/kaggle/working/arcade_new_transformed_2.pkl',
#     '/kaggle/working/arcade_new_transformed_3.pkl',
#     '/kaggle/working/arcade_new_transformed_4.pkl'
# ]

# # Output path for the concatenated DataFrame
# output_path = '/kaggle/working/arcade_new_transformed.pkl'

# # Concatenate the files
# concatenated_df = concatenate_pickles(file_paths, output_path)

# # Display the concatenated DataFrame
# print("Concatenated DataFrame:")
# print(concatenated_df)