In [13]:
import json
import pandas as pd
import os
from pathlib import Path

def json_folder_to_csv(folder_path, output_csv_path):
    """
    Convert all JSON files in a folder to a single CSV file.

    Every ``*.json`` file directly inside ``folder_path`` (no recursion) is
    parsed. A top-level JSON array contributes one record per element; any
    other top-level value contributes a single record. All records are
    collected into one DataFrame and written without the index column.

    Files are read in sorted path order so the row order of the CSV is the
    same on every run, and they are decoded as UTF-8 regardless of the
    platform's default encoding. A file that cannot be read or parsed is
    reported on stdout and skipped; the remaining files are still converted.

    Args:
        folder_path (str): Path to folder containing JSON files
        output_csv_path (str): Path for output CSV file. Missing parent
            directories are created before writing.

    Returns:
        None. Progress and a summary are printed. Nothing is written when
        the folder has no JSON files or no record could be loaded.
    """
    all_data = []

    # glob() gives no ordering guarantee; sort for a reproducible row order.
    folder = Path(folder_path)
    json_files = sorted(folder.glob("*.json"))

    if not json_files:
        print(f"No JSON files found in {folder_path}")
        return

    print(f"Found {len(json_files)} JSON files to process...")

    # Process each JSON file
    for json_file in json_files:
        try:
            # Explicit UTF-8: the locale default would mis-decode non-ASCII
            # text on platforms whose default encoding is not UTF-8.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle both single objects and arrays
            if isinstance(data, list):
                all_data.extend(data)
            else:
                all_data.append(data)

            print(f"Processed: {json_file.name}")

        except Exception as e:
            # Deliberate best effort: one bad file must not abort the batch.
            print(f"Error processing {json_file.name}: {e}")

    if not all_data:
        print("No data found in JSON files")
        return

    # Make sure the destination folder exists; to_csv does not create it.
    output_path = Path(output_csv_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Convert to DataFrame and save as CSV
    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)

    print(f"\nSuccessfully converted {len(all_data)} records to {output_csv_path}")
    print(f"CSV shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

# Example usage: turn each baseline JSON folder into its own CSV.
# Attempts records
json_folder_to_csv(
    folder_path="tests/datasets/baseline/attempts/",
    output_csv_path="tests/attempts_data/baseline_attempts.csv",
)

# Sessions records
json_folder_to_csv(
    folder_path="tests/datasets/baseline/sessions/",
    output_csv_path="tests/sessions_data/baseline_sessions.csv",
)


Found 5 JSON files to process...
Processed: 0.json
Processed: 1.json
Processed: 2.json
Processed: 3.json
Processed: 4.json

Successfully converted 750 records to tests/attempts_data/baseline_attempts.csv
CSV shape: (750, 14)
Columns: ['sid', 'att_n', 'e1', 'e2', 'ok', 'res', 'inv_b4', 'reason', 'novel', 'str_typ', 'str_len', 't_since', '_timestamp', '_datetime']
Found 5 JSON files to process...
Processed: 0.json
Processed: 1.json
Processed: 2.json
Processed: 3.json
Processed: 4.json

Successfully converted 5 records to tests/sessions_data/baseline_sessions.csv
CSV shape: (5, 15)
Columns: ['sid', 'r_typ', 'start', 'start_ts', 'end', 'end_ts', 'tot_att', 'succ_att', 'elem_disc', 'final_inv', 'disc_rate', 'max_succ', 'max_fail', 'plateaus', 'last_disc_t']


In [14]:
import pandas as pd
from pathlib import Path

def concatenate_csv_files(folder_path, output_csv_path, pattern="*.csv"):
    """
    Concatenate multiple CSV files from a folder into a single CSV file.

    Files directly inside ``folder_path`` that match ``pattern`` are read with
    ``pd.read_csv`` in sorted path order, stacked row-wise with a fresh
    0..n-1 index, and written without the index column.

    The output file itself is never used as an input, so re-running the
    function with an output path that lives inside ``folder_path`` does not
    duplicate rows. A file that cannot be read is reported on stdout and
    skipped; the remaining files are still concatenated.

    Args:
        folder_path (str): Path to folder containing CSV files
        output_csv_path (str): Path for the output concatenated CSV file.
            Missing parent directories are created before writing.
        pattern (str): File pattern to match (default: "*.csv")

    Returns:
        None. Progress and a summary are printed. Nothing is written when
        no input file matches or none could be loaded.
    """
    all_dataframes = []

    folder = Path(folder_path)
    output_path = Path(output_csv_path)
    output_resolved = output_path.resolve()

    # Skip a previous output that matches the pattern (otherwise a re-run
    # would feed the combined file back into itself), and sort because
    # glob() gives no ordering guarantee.
    csv_files = sorted(
        candidate
        for candidate in folder.glob(pattern)
        if candidate.resolve() != output_resolved
    )

    if not csv_files:
        print(f"No CSV files found in {folder_path}")
        return

    print(f"Found {len(csv_files)} CSV files to concatenate...")

    # Process each CSV file
    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
            all_dataframes.append(df)
            print(f"Loaded: {csv_file.name} ({df.shape[0]} rows)")

        except Exception as e:
            # Deliberate best effort: one bad file must not abort the batch.
            print(f"Error processing {csv_file.name}: {e}")

    if not all_dataframes:
        print("No data found in CSV files")
        return

    # Make sure the destination folder exists; to_csv does not create it.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Concatenate all DataFrames
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    combined_df.to_csv(output_path, index=False)

    print(f"\nSuccessfully concatenated {len(all_dataframes)} files to {output_csv_path}")
    print(f"Combined CSV shape: {combined_df.shape}")
    print(f"Columns: {list(combined_df.columns)}")

# Example usage: merge the per-strategy CSVs into one file per record type.
# Attempts: every CSV under tests/attempts_data
concatenate_csv_files(
    folder_path="tests/attempts_data",
    output_csv_path="tests/all_attempts_data.csv",
    pattern="*.csv",
)

# Sessions: every CSV under tests/sessions_data
concatenate_csv_files(
    folder_path="tests/sessions_data",
    output_csv_path="tests/all_sessions_data.csv",
    pattern="*.csv",
)


Found 5 CSV files to concatenate...
Loaded: conterfactual_attempts.csv (729 rows)
Loaded: baseline_attempts.csv (750 rows)
Loaded: abductive_attempts.csv (705 rows)
Loaded: heuristic_attempts.csv (724 rows)
Loaded: first_principles_attempts.csv (750 rows)

Successfully concatenated 5 files to tests/all_attempts_data.csv
Combined CSV shape: (3658, 14)
Columns: ['sid', 'att_n', 'e1', 'e2', 'ok', 'res', 'inv_b4', 'reason', 'novel', 'str_typ', 'str_len', 't_since', '_timestamp', '_datetime']
Found 5 CSV files to concatenate...
Loaded: first_principles_sessions.csv (5 rows)
Loaded: baseline_sessions.csv (5 rows)
Loaded: conterfactual_sessions.csv (5 rows)
Loaded: heuristic_sessions.csv (5 rows)
Loaded: abductive_sessions.csv (5 rows)

Successfully concatenated 5 files to tests/all_sessions_data.csv
Combined CSV shape: (25, 15)
Columns: ['sid', 'r_typ', 'start', 'start_ts', 'end', 'end_ts', 'tot_att', 'succ_att', 'elem_disc', 'final_inv', 'disc_rate', 'max_succ', 'max_fail', 'plateaus', 'las