In [1]:
from datasets import load_dataset

# Load the CodeAlpaca-20k dataset by its Hub identifier. This module-level `ds`
# is not read by the functions defined in the next cell: load_codealpaca_sample
# calls load_dataset again and keeps the result in its own local variable.
ds = load_dataset("sahil2801/CodeAlpaca-20k")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 20022/20022 [00:00<00:00, 182201.99 examples/s]


In [2]:
from datasets import load_dataset
import json
import random
import os

def _as_text(value):
    """Coerce a raw cell value to ``str``; missing values (None/NaN) become ``""``."""
    if isinstance(value, str):
        return value
    # NaN is the only float that compares unequal to itself. It is truthy, so
    # without this check a missing cell would slip through the validity filter.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _first_present(record, *keys):
    """Return the text stored under the first of ``keys`` that ``record`` contains."""
    for key in keys:
        if key in record:
            return _as_text(record[key])
    return ""


def _build_entries(records):
    """Convert raw row dicts into instruction/input/output dicts, dropping invalid rows."""
    entries = []
    for record in records:
        # Handle different possible column names
        instruction = _first_present(record, 'instruction', 'prompt')
        output_text = _first_present(record, 'output', 'response', 'completion')

        # An entry is only useful when it has both an instruction and an output
        if not (instruction and output_text):
            continue

        entries.append({
            "instruction": instruction,
            "input": _first_present(record, 'input', 'context'),
            "output": output_text,
        })
    return entries


def _print_preview(entries):
    """Print a truncated view of the first three entries."""
    print("\nPREVIEW (First 3 entries):")
    print("-"*30)
    for i, entry in enumerate(entries[:3], 1):
        print(f"Entry {i}:")
        print(f"  Instruction: {entry['instruction'][:100]}...")
        print(f"  Input: {entry['input'][:50]}..." if entry['input'] else "  Input: (empty)")
        print(f"  Output: {entry['output'][:100]}...")
        print()


def load_codealpaca_sample(sample_size=4000):
    """
    Load the CodeAlpaca-20k dataset and return its first ``sample_size`` rows
    as instruction/input/output dictionaries.

    Args:
        sample_size: Maximum number of leading rows to take from the train
            split. Values larger than the dataset are reduced to the dataset
            size; negative values are treated as 0.

    Returns:
        A list of dicts with the keys "instruction", "input" and "output"
        (all strings). Rows lacking an instruction or an output are skipped.
        An empty list is returned when loading or conversion fails; the
        error is printed rather than raised.
    """
    print("Loading CodeAlpaca-20k dataset...")
    try:
        ds = load_dataset("sahil2801/CodeAlpaca-20k")
        print("Dataset loaded successfully!")

        # Get the train split
        train_data = ds['train']
        print(f"Total entries in dataset: {len(train_data)}")

        # Convert to pandas DataFrame for easier slicing
        df = train_data.to_pandas()
        print(f"Available columns: {list(df.columns)}")

        # Clamp the requested size to what is actually available
        if sample_size < 0:
            # df.head(-n) would return "all but the last n" rows, which is
            # never what a caller asking for a sample size means.
            print(f"Warning: sample_size {sample_size} is negative, taking no entries")
            sample_size = 0
        elif len(df) < sample_size:
            print(f"Warning: Dataset has only {len(df)} entries, taking all available")
            sample_size = len(df)

        print(f"Taking first {sample_size} entries...")
        # Plain dict records avoid the per-row Series objects that iterrows() builds
        records = df.head(sample_size).to_dict("records")

        # Create final dataset with instruction, input, output format
        print("Converting to instruction-input-output format...")
        final_dataset = _build_entries(records)
        print(f"Created {len(final_dataset)} valid entries")

        # Show statistics
        print("\n" + "="*50)
        print("SAMPLING STATISTICS")
        print("="*50)
        print(f"Original dataset size: {len(train_data)}")
        print(f"First sample size: {sample_size}")
        print(f"Valid entries created: {len(final_dataset)}")
        print("="*50)

        _print_preview(final_dataset)

        return final_dataset

    except Exception as e:
        # Deliberate best-effort: callers treat an empty list as "load failed"
        print(f"Error loading dataset: {e}")
        return []

def save_to_jsonl(data, output_path):
    """
    Write every item of ``data`` to ``output_path`` as one JSON document per line.

    The file is opened in write mode with UTF-8 encoding and non-ASCII
    characters are written unescaped. Returns True when the file was written,
    or False when any exception occurred (the error is printed, not raised).
    """
    print(f"Writing dataset to: {output_path}")
    try:
        with open(output_path, "w", encoding="utf-8") as jsonl_file:
            # One serialized record per line, produced lazily
            jsonl_file.writelines(
                json.dumps(record, ensure_ascii=False) + "\n" for record in data
            )
        print(f"✅ Successfully wrote {len(data)} entries to {output_path}")
    except Exception as err:
        print(f"❌ Error writing file: {err}")
        return False
    return True

def main():
    """
    Load the first 4000 CodeAlpaca entries and write them to
    ``codealpaca_4k_first_sample.jsonl`` in the current working directory.

    Prints a failure message and returns early when either the load step
    yields no data or the save step reports failure; the completion message
    is printed only after a successful write.
    """
    # Load first 4k sample
    sample_data = load_codealpaca_sample(sample_size=4000)

    if not sample_data:
        print("Failed to load sample data")
        return

    # Save to JSONL file
    output_path = os.path.join(os.getcwd(), "codealpaca_4k_first_sample.jsonl")
    if not save_to_jsonl(sample_data, output_path):
        # save_to_jsonl signals failure through its return value; do not
        # announce success when nothing usable was written.
        print(f"Failed to save sample data to {output_path}")
        return

    print(f"\n🎉 Complete! Created {len(sample_data)} coding examples in JSONL format")

# Run the pipeline when this code is executed directly. In this notebook cell
# the guard is satisfied, which is why the cell's output is main()'s log.
if __name__ == "__main__":
    main()

Loading CodeAlpaca-20k dataset...
Dataset loaded successfully!
Total entries in dataset: 20022
Available columns: ['output', 'instruction', 'input']
Taking first 4000 entries...
Converting to instruction-input-output format...
Created 4000 valid entries

SAMPLING STATISTICS
Original dataset size: 20022
First sample size: 4000
Valid entries created: 4000

PREVIEW (First 3 entries):
------------------------------
Entry 1:
  Instruction: Create an array of length 5 which contains all even numbers between 1 and 10....
  Input: (empty)
  Output: arr = [2, 4, 6, 8, 10]...

Entry 2:
  Instruction: Formulate an equation to calculate the height of a triangle given the angle, side lengths and opposi...
  Input: (empty)
  Output: Height of triangle = opposite side length * sin (angle) / side length...

Entry 3:
  Instruction: Write a replace method for a string class which replaces the given string with a given set of charac...
  Input: string = "Hello World!"
replace_with = "Greetings!...
  Outp