In [1]:
import json
import os
import random
import argparse
from tqdm import tqdm  # Optional, for progress bar

In [3]:
def reduce_file_size(input_file, output_file, method='both', target_size_mb=10, 
                     sampling_rate=None, fields_to_keep=None):
    """
    Reduce the size of a large JSON file using various methods.
    
    Args:
        input_file: Path to the input file
        output_file: Path to save the reduced file
        method: 'sampling', 'fields', or 'both'
        target_size_mb: Target file size in MB
        sampling_rate: Keep only this fraction of records (0.0-1.0)
        fields_to_keep: List of fields to retain in each record
    """
    # Default fields to keep if none specified
    if fields_to_keep is None:
        fields_to_keep = [
            'accountNumber', 
            'transactionDateTime', 
            'transactionAmount', 
            'merchantName',
            'merchantCategoryCode',
            'isFraud'
        ]
    
    # Calculate file size and adjust sampling rate if needed
    original_size = os.path.getsize(input_file)
    original_size_mb = original_size / (1024 * 1024)
    print(f"Original file size: {original_size_mb:.2f} MB")
    
    # If sampling rate not provided, calculate based on target size
    if sampling_rate is None:
        target_ratio = target_size_mb / original_size_mb
        sampling_rate = min(1.0, target_ratio)
        print(f"Calculated sampling rate: {sampling_rate:.4f}")
    
    # Count lines in input file for progress tracking
    line_count = 0
    with open(input_file, 'r') as f:
        for _ in f:
            line_count += 1
    
    # Process the file
    lines_written = 0
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in tqdm(infile, total=line_count, desc="Processing"):
            line = line.strip()
            if not line:
                continue
                
            try:
                # Apply sampling if needed
                keep_record = True
                if method in ['sampling', 'both']:
                    keep_record = random.random() < sampling_rate
                
                if keep_record:
                    # Apply field reduction if needed
                    if method in ['fields', 'both']:
                        record = json.loads(line)
                        reduced_record = {k: record[k] for k in fields_to_keep if k in record}
                        output_line = json.dumps(reduced_record)
                    else:
                        output_line = line
                        
                    outfile.write(output_line + '\n')
                    lines_written += 1
                    
            except json.JSONDecodeError:
                print(f"Error parsing JSON: {line[:50]}...")
            except Exception as e:
                print(f"Error processing line: {str(e)}")
    
    # Calculate results
    result_size = os.path.getsize(output_file)
    result_size_mb = result_size / (1024 * 1024)
    
    print("\nReduction complete!")
    print(f"Records processed: {line_count}")
    print(f"Records in output: {lines_written}")
    print(f"Original size: {original_size_mb:.2f} MB")
    print(f"Reduced size: {result_size_mb:.2f} MB")
    print(f"Reduction ratio: {(result_size_mb / original_size_mb):.4f}")

In [8]:
# Use the transactions.txt file as input to apply the function
input_file = "transactions.txt"
output_file = "reduced_transactions.txt"
method = "both"
target_size_mb = 10.0
sampling_rate = None
fields_to_keep = None  # Use default fields

reduce_file_size(input_file, output_file, method, target_size_mb, sampling_rate, fields_to_keep)


Original file size: 581.68 MB
Calculated sampling rate: 0.0172


Processing: 100%|██████████| 786363/786363 [00:00<00:00, 1983551.51it/s]


Reduction complete!
Records processed: 786363
Records in output: 13461
Original size: 581.68 MB
Reduced size: 2.51 MB
Reduction ratio: 0.0043



