# Extract Files from JSON

This notebook reads a JSON file whose entries hold base64-encoded file contents, decodes each entry, and writes it as a file to a specified output directory. It handles .gz compressed files (like .cif.gz), CSV files, and any other file type (saved as-is).

## Overview
The JSON structure is expected to have an "outputs" section with file names as keys and base64-encoded file contents as values.

## 1. Import Required Libraries

Import all necessary libraries for file operations, JSON processing, and base64 decoding.

In [2]:
import json
import base64
import gzip
import os
from pathlib import Path
import shutil

## 2. Configuration

Set up file paths and configuration parameters.

In [3]:
# Notebook settings -- edit these two values before running the cells below
JSON_FILE_PATH = "out.temp"  # JSON document to read; update to point at your file
OUTPUT_DIRECTORY = "extracted_files"  # directory that receives the decoded files

# Echo the active settings so they are recorded in the cell output
print(
    f"JSON file path: {JSON_FILE_PATH}",
    f"Output directory: {OUTPUT_DIRECTORY}",
    sep="\n",
)

JSON file path: out.temp
Output directory: extracted_files


## 3. Load JSON Data

Load and parse the JSON file containing the base64-encoded data.

In [None]:
def load_json_data(json_file_path):
    """
    Load a JSON file and return its parsed top-level object.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        The parsed JSON object as a dict, or None when the file is missing,
        is not valid UTF-8 encoded JSON, or its top-level value is not a
        JSON object. A dict without an 'outputs' key is still returned,
        after a warning is printed.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding, which differs between systems.
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: File {json_file_path} not found")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Undecodable bytes are reported the same way as malformed JSON.
        print(f"Error: Invalid JSON format in {json_file_path}")
        return None

    # The rest of the notebook uses dict methods on the result, and a
    # membership test on a scalar such as 5 would raise TypeError.
    if not isinstance(data, dict):
        print(
            f"Error: Expected a JSON object at the top level of {json_file_path}, "
            f"got {type(data).__name__}"
        )
        return None

    print(f"Successfully loaded JSON file: {json_file_path}")

    # Check if the JSON has the expected structure
    if 'outputs' in data:
        print(f"Found 'outputs' section with {len(data['outputs'])} items")
    else:
        print("Warning: 'outputs' section not found in JSON")
    return data

# Parse the configured JSON file once; the cells below reuse `json_data`
json_data = load_json_data(JSON_FILE_PATH)

if json_data:
    print("\nJSON structure:")
    # One "<label>: <value>" line per top-level field, with a placeholder when absent
    print("\n".join(
        f"{label}: {json_data.get(field, 'Not found')}"
        for label, field in (("Request ID", "request_id"), ("Status", "status"))
    ))
    if 'outputs' in json_data:
        print(f"Output files found: {[*json_data['outputs'].keys()]}")

Successfully loaded JSON file: out.temp
Found 'outputs' section with 7 items

JSON structure:
Request ID: req_6b3e11172ce141b6924b2dddc5ef8efe
Status: success
Output files found: ['pb-msa_model_0.cif.gz', 'pb-msa_model_3.cif.gz', 'pb-msa_model_4.cif.gz', 'pb-msa_model_2.cif.gz', 'pb-msa_model_1.cif.gz', 'pb-msa.score', 'pb-msa_metrics.csv']


## 4. Setup Output Directory

Create the output directory if it doesn't exist.

In [5]:
def setup_output_directory(output_dir):
    """
    Ensure the output directory exists and return it as a Path.

    Args:
        output_dir: Directory to create (str or Path). Missing parent
            directories are created too; an existing directory is reused.

    Returns:
        Path built from output_dir.
    """
    destination = Path(output_dir)
    destination.mkdir(exist_ok=True, parents=True)
    resolved_for_display = destination.absolute()
    print(f"Output directory ready: {resolved_for_display}")
    return destination

# Create (or reuse) the directory named by OUTPUT_DIRECTORY and keep its Path for the cells below
output_path = setup_output_directory(OUTPUT_DIRECTORY)

Output directory ready: /home/mekshirs/azurefiles/UW_IPD/extracted_files


## 5. File Extraction Functions

Define functions to handle base64 decoding and file saving for different file types.

In [6]:
def _decode_and_save(filename, base64_data, output_path, label):
    """
    Decode a base64 payload and write the raw bytes to output_path / filename.

    Shared implementation behind the public decode_and_save_* functions,
    which differ only in the label shown in the success message.

    Args:
        filename: Name of the file to create, relative to output_path.
        base64_data: Base64-encoded file contents.
        output_path: Directory to write into (str or Path).
        label: Text placed after "Saved" in the success message.

    Returns:
        True if the file was written; False if decoding, the file-name
        check, or writing failed (the error is printed, not raised).
    """
    try:
        # Decode base64 data
        decoded_data = base64.b64decode(base64_data)

        # Create full file path
        target_dir = Path(output_path)
        file_path = target_dir / filename

        # File names come from the JSON payload: refuse absolute paths and
        # "../" components that would put the file outside output_path.
        if target_dir.resolve() not in file_path.resolve().parents:
            raise ValueError("file name resolves outside the output directory")

        # Always write raw bytes so the decoded content is stored unchanged
        with open(file_path, 'wb') as f:
            f.write(decoded_data)

        print(f"✓ Saved {label}: {filename} ({len(decoded_data)} bytes)")
        return True

    except Exception as e:
        print(f"✗ Error saving {filename}: {e}")
        return False


def decode_and_save_gz_file(filename, base64_data, output_path):
    """
    Decode base64 data and save as a .gz file.

    The decoded bytes are written as-is; the content is not decompressed.
    Returns True on success, False on failure.
    """
    return _decode_and_save(filename, base64_data, output_path, ".gz file")


def decode_and_save_csv_file(filename, base64_data, output_path):
    """
    Decode base64 data and save as a CSV file.

    The decoded bytes are written in binary mode, so the content is stored
    exactly as decoded. Returns True on success, False on failure.
    """
    return _decode_and_save(filename, base64_data, output_path, "CSV file")


def decode_and_save_generic_file(filename, base64_data, output_path):
    """
    Decode base64 data and save as a generic file.

    Returns True on success, False on failure.
    """
    return _decode_and_save(filename, base64_data, output_path, "file")

## 6. Process and Extract Files

Extract all files from the JSON data based on their file extensions.

In [7]:
def process_files(json_data, output_path):
    """
    Decode every entry of json_data['outputs'] into output_path.

    Each file name is routed by extension to the matching saver ('.gz',
    '.csv', or the generic fallback). A per-category tally of successful
    saves is printed at the end. Returns None.
    """
    if not (json_data and 'outputs' in json_data):
        print("No outputs found in JSON data")
        return

    outputs = json_data['outputs']
    divider = "-" * 50

    # Successful saves per category; the total is the sum of the three
    saved = {'gz': 0, 'csv': 0, 'other': 0}

    print(f"\nProcessing {len(outputs)} files...")
    print(divider)

    for name, encoded in outputs.items():
        # Pick the category and saver from the extension ('.cif.gz' counts as '.gz')
        if name.endswith('.gz'):
            category, saver = 'gz', decode_and_save_gz_file
        elif name.endswith('.csv'):
            category, saver = 'csv', decode_and_save_csv_file
        else:
            category, saver = 'other', decode_and_save_generic_file

        if saver(name, encoded, output_path):
            saved[category] += 1

    # Print summary
    print(divider)
    print("Extraction Summary:")
    print(f"  .gz files saved: {saved['gz']}")
    print(f"  .csv files saved: {saved['csv']}")
    print(f"  Other files saved: {saved['other']}")
    print(f"  Total successful: {sum(saved.values())}/{len(outputs)}")

# Run the extraction only when the JSON load produced data
if not json_data:
    print("Cannot process files - JSON data not loaded")
else:
    process_files(json_data, output_path)


Processing 7 files...
--------------------------------------------------
✓ Saved .gz file: pb-msa_model_0.cif.gz (60745 bytes)
✓ Saved .gz file: pb-msa_model_3.cif.gz (60828 bytes)
✓ Saved .gz file: pb-msa_model_4.cif.gz (61190 bytes)
✓ Saved .gz file: pb-msa_model_2.cif.gz (60931 bytes)
✓ Saved .gz file: pb-msa_model_1.cif.gz (61087 bytes)
✓ Saved file: pb-msa.score (2299 bytes)
✓ Saved CSV file: pb-msa_metrics.csv (904 bytes)
--------------------------------------------------
Extraction Summary:
  .gz files saved: 5
  .csv files saved: 1
  Other files saved: 1
  Total successful: 7/7


## 7. Verify Extracted Files

Check the output directory and verify that files were extracted correctly.

In [10]:
def verify_extracted_files(output_path):
    """
    Print a listing of the output directory, grouped by extension.

    Entries are shown in three groups ('.gz', '.csv', everything else),
    each sorted, with the size in bytes next to every name. A message is
    printed instead when the directory is missing or empty. Returns None.
    """
    if not output_path.exists():
        print(f"Output directory {output_path} does not exist")
        return

    entries = list(output_path.iterdir())
    if not entries:
        print(f"No files found in {output_path}")
        return

    print(f"\nFiles in output directory ({output_path}):")
    print("-" * 60)

    # Bucket the directory entries by extension
    compressed = [entry for entry in entries if entry.name.endswith('.gz')]
    tabular = [entry for entry in entries if entry.name.endswith('.csv')]
    remaining = [entry for entry in entries if not entry.name.endswith(('.gz', '.csv'))]

    sections = (
        ("\n Compressed (.gz) files:", compressed),
        ("\n CSV files:", tabular),
        ("\n Other files:", remaining),
    )
    for heading, members in sections:
        # Empty groups are omitted from the listing entirely
        if not members:
            continue
        print(heading)
        for entry in sorted(members):
            byte_count = entry.stat().st_size
            print(f"  {entry.name:<40} {byte_count:>10,} bytes")

    print(f"\nTotal files extracted: {len(entries)}")

# List what is now in the output directory, grouped by extension, with each entry's size
verify_extracted_files(output_path)


Files in output directory (extracted_files):
------------------------------------------------------------

 Compressed (.gz) files:
  pb-msa_model_0.cif.gz                        60,745 bytes
  pb-msa_model_1.cif.gz                        61,087 bytes
  pb-msa_model_2.cif.gz                        60,931 bytes
  pb-msa_model_3.cif.gz                        60,828 bytes
  pb-msa_model_4.cif.gz                        61,190 bytes

 CSV files:
  pb-msa_metrics.csv                              904 bytes

 Other files:
  pb-msa.score                                  2,299 bytes

Total files extracted: 7
