# SafeCoder Dataset: combining the 4 SafeCoder datasets into single JSONL files

Train:
- evol.jsonl
- lmsys.jsonl
- sec-desc.jsonl
- sec-new-desc.jsonl

Val:
- evol.jsonl
- lmsys.jsonl
- sec-desc.jsonl
- sec-new-desc.jsonl


This notebook merges the per-dataset JSONL files found in each split directory (`train` and `val`) into one combined JSONL file per split.

In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from pathlib import Path

# Locations used throughout the notebook: the working directory and the
# folder (relative to it) that holds the train/val split sub-directories.
base_dir = './'
data_train_val = os.path.join(base_dir, 'data_train_val_copy')

# Echo the configuration so the rendered notebook records which paths were used.
print(
    "SafeCoder Dataset Inspector",
    f"Base directory: {base_dir}",
    f"Training/Validation data: {data_train_val}",
    sep="\n",
)


SafeCoder Dataset Inspector
Base directory: ./
Training/Validation data: ./data_train_val_copy


## Load Dataset Files

In [2]:
def load_jsonl_files(directory):
    """Load every ``.jsonl`` file located directly inside ``directory``.

    Files are visited in sorted filename order so that the returned dict and
    the printed log are the same on every run; ``os.listdir`` on its own makes
    no ordering guarantee. Files are decoded as UTF-8 instead of the platform
    default encoding.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        dict mapping each filename to the list of JSON values parsed from its
        non-blank lines. The dict is empty when ``directory`` is not an
        existing directory. A file that cannot be read or parsed is reported
        on stdout and left out of the result entirely.
    """
    data = {}
    # isdir (not exists): a path that points at a regular file would otherwise
    # pass the check and make os.listdir raise.
    if not os.path.isdir(directory):
        print(f"Directory not found: {directory}")
        return data

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.jsonl'):
            continue

        filepath = os.path.join(directory, filename)
        records = []
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    # Blank / whitespace-only lines carry no record.
                    if line.strip():
                        records.append(json.loads(line))
        except Exception as e:
            # Best effort: report the broken file and carry on with the rest.
            print(f"Error loading {filename}: {e}")
            continue

        data[filename] = records
        print(f"Loaded {filename}: {len(records)} samples")

    return data

# Read each split's JSONL files, then print a per-dataset sample count.
# Note: despite its name, train_val_data holds only the 'train' split.
train_val_data = load_jsonl_files(os.path.join(data_train_val, 'train'))
print("\n--- Training Data ---")
for dataset_name in train_val_data:
    records = train_val_data[dataset_name]
    print(f"{dataset_name}: {len(records)} samples")

# Same summary for the 'val' split.
val_data = load_jsonl_files(os.path.join(data_train_val, 'val'))
print("\n--- Validation Data ---")
for dataset_name in val_data:
    records = val_data[dataset_name]
    print(f"{dataset_name}: {len(records)} samples")

Loaded sec-desc.jsonl: 720 samples
Loaded sec-new-desc.jsonl: 421 samples

--- Training Data ---
sec-desc.jsonl: 720 samples
sec-new-desc.jsonl: 421 samples
Loaded sec-desc.jsonl: 83 samples
Loaded sec-new-desc.jsonl: 44 samples

--- Validation Data ---
sec-desc.jsonl: 83 samples
sec-new-desc.jsonl: 44 samples


In [None]:
def display_jsonl_files_fields(directory):
    """Load every ``.jsonl`` file in ``directory`` and print its field names.

    For each file the function prints the number of samples followed by one
    line per top-level key seen in the file's JSON objects, together with how
    many records contain that key. Keys are listed in first-seen order.
    Records that are not JSON objects contribute no fields.

    Files are processed in sorted filename order and decoded as UTF-8.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        dict mapping each filename to the list of JSON values parsed from its
        non-blank lines. The dict is empty when ``directory`` is not an
        existing directory. A file that cannot be read or parsed is reported
        on stdout and left out of the result.
    """
    data = {}
    if not os.path.isdir(directory):
        print(f"Directory not found: {directory}")
        return data

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.jsonl'):
            continue

        filepath = os.path.join(directory, filename)
        records = []
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        records.append(json.loads(line))
        except Exception as e:
            # Best effort: report the broken file and carry on with the rest.
            print(f"Error loading {filename}: {e}")
            continue

        data[filename] = records
        print(f"Loaded {filename}: {len(records)} samples")

        # Tally how many records carry each top-level key; a plain dict keeps
        # the keys in the order they were first encountered.
        field_counts = {}
        for record in records:
            if isinstance(record, dict):
                for key in record:
                    field_counts[key] = field_counts.get(key, 0) + 1

        if field_counts:
            for field, count in field_counts.items():
                print(f"  - {field}: present in {count}/{len(records)} records")
        else:
            print("  (no fields found)")

    return data

In [3]:
def combine_all_jsonl(directory, output_path):
    """Merge every ``.jsonl`` file in ``directory`` into one JSONL file.

    Input files are read in sorted filename order (``os.listdir`` guarantees
    no order) so the combined file is identical across runs. All files are
    read and written as UTF-8. A file that cannot be read or parsed is
    reported on stdout and skipped as a whole.

    Args:
        directory: Folder containing the source ``.jsonl`` files
            (sub-folders are not searched).
        output_path: Destination file. Missing parent directories are
            created; a bare filename (no directory part) is written to the
            current working directory.

    Returns:
        int: Number of records written. ``0`` when ``directory`` is not an
        existing directory, in which case no output file is created.
    """
    if not os.path.isdir(directory):
        print(f'Directory not found: {directory}')
        # Always return a count so callers can format or sum the result.
        return 0

    output_abspath = os.path.abspath(output_path)
    all_records = []
    for filename in sorted(os.listdir(directory)):
        print(f'Files: {filename}')
        if not filename.endswith('.jsonl'):
            continue

        filepath = os.path.join(directory, filename)
        # If the destination lives inside the source folder, a re-run must not
        # feed the previous combined output back into itself.
        if os.path.abspath(filepath) == output_abspath:
            print(f'Skipping {filename}: it is the output file')
            continue

        print(f'Filepath: {filepath}')
        records = []
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    # Blank / whitespace-only lines carry no record.
                    if line.strip():
                        records.append(json.loads(line))
        except Exception as e:
            # Best effort: report the broken file and carry on with the rest.
            print(f"Error loading {filename}: {e}")
            continue

        all_records.extend(records)
        print(f"Loaded {filename}: {len(records)} samples")

    # Save to file. dirname is '' for a bare filename, and os.makedirs('')
    # raises, so only create the parent when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for record in all_records:
            f.write(json.dumps(record) + '\n')

    print(f"\n✓ Successfully combined {len(all_records):,} total records")
    print(f"✓ Saved to: {output_path}")
    return len(all_records)

# Folder (under base_dir) that receives the combined files.
modified_dataset_dir = "modified_datasets"

# Training split: merge everything under <data_train_val>/train.
data_train_val_path = os.path.join(data_train_val, 'train')
output_file_train = os.path.join(base_dir, modified_dataset_dir, 'combined_sec_train_data.jsonl')
total_combined_train = combine_all_jsonl(data_train_val_path, output_file_train)

# Validation split: merge everything under <data_train_val>/val.
data_eval_path = os.path.join(data_train_val, 'val')
output_file_eval = os.path.join(base_dir, modified_dataset_dir, 'combined_sec_val_data.jsonl')
total_combined_val = combine_all_jsonl(data_eval_path, output_file_eval)


Files: sec-desc.jsonl
Filepath: ./data_train_val_copy/train/sec-desc.jsonl
Loaded sec-desc.jsonl: 720 samples
Files: sec-new-desc.jsonl
Filepath: ./data_train_val_copy/train/sec-new-desc.jsonl
Loaded sec-new-desc.jsonl: 421 samples

✓ Successfully combined 1,141 total records
✓ Saved to: ./modified_datasets/combined_sec_train_data.jsonl
Files: sec-desc.jsonl
Filepath: ./data_train_val_copy/val/sec-desc.jsonl
Loaded sec-desc.jsonl: 83 samples
Files: sec-new-desc.jsonl
Filepath: ./data_train_val_copy/val/sec-new-desc.jsonl
Loaded sec-new-desc.jsonl: 44 samples

✓ Successfully combined 127 total records
✓ Saved to: ./modified_datasets/combined_sec_val_data.jsonl


In [4]:
# Summary banner: per-split record counts and their sum, formatted with
# thousands separators. Emitted as a single print, one item per line.
print(
    "=" * 50,
    "TOTAL SAMPLE COUNTS",
    "=" * 50,
    f"Training samples:   {total_combined_train:,}",
    f"Validation samples: {total_combined_val:,}",
    f"{'─' * 50}",
    f"GRAND TOTAL:        {total_combined_train + total_combined_val:,}",
    "=" * 50,
    sep="\n",
)

TOTAL SAMPLE COUNTS
Training samples:   1,141
Validation samples: 127
──────────────────────────────────────────────────
GRAND TOTAL:        1,268
