<a href="https://colab.research.google.com/github/EmmanuelAdesope/DSA/blob/main/lossylossylossy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import struct
import os

# Truncate the low mantissa bits of a float32 and return the shortened bit pattern
def compress_and_pack_float_32(x, k):
    """Drop the k least-significant mantissa bits of a 32-bit float.

    The value is reinterpreted as its unsigned 32-bit pattern, the low k
    mantissa bits are cleared, and the pattern is shifted right by k.
    Any x that compares equal to 0.0 (including -0.0) is encoded as 0.

    Args:
        x: Value to compress; it is packed as a 32-bit float.
        k: Number of low-order bits to discard.

    Returns:
        The remaining high-order bits as a non-negative integer.
    """
    if x == 0.0:
        return 0
    (bits,) = struct.unpack('I', struct.pack('f', x))
    mantissa_field = 0x7FFFFF                  # bits 0-22
    dropped_bits = (1 << k) - 1                # bits 0 to k-1
    sign_and_exponent = bits & ~mantissa_field
    kept_mantissa = bits & (mantissa_field & ~dropped_bits)
    return (sign_and_exponent | kept_mantissa) >> k

# Rebuild a 32-bit float from its truncated bit pattern
def unpack_float_32(compressed, k):
    """Expand a truncated integer bit pattern back into a 32-bit float.

    Shifting left by k moves the stored bits back to their original
    positions; the k low-order bits that were discarded come back as zeros.

    Args:
        compressed: Integer holding the retained high-order bits.
        k: Number of low-order bits that were discarded.

    Returns:
        The reconstructed value as a Python float.
    """
    widened = np.uint32(compressed << k)
    (value,) = struct.unpack('f', struct.pack('I', widened))
    return value

# Function to calculate and print statistics
def calculate_and_print_stats(distributions):
    """Print mean and standard deviation of the full and compressed data.

    For every name in ``distributions`` the file ``<name>_full.bin`` is read
    as float32 and summarized.  Then, for k = 8..16, the file
    ``<name>_compressed_k<k>.bin`` is read, expanded back to float32 and
    summarized as well.  Missing files are reported and skipped.

    Args:
        distributions: Iterable of distribution names.  Iterating a dict
            yields its keys, so a name -> data mapping works too.
    """
    for dist_name in distributions:
        full_filename = f'{dist_name}_full.bin'
        if not os.path.exists(full_filename):
            print(f"Error: '{full_filename}' not found.")
            continue
        full_data = np.fromfile(full_filename, dtype=np.float32)
        print(f"\n### {dist_name.capitalize()} Full-Precision Data Statistics ###")
        print(f"Mean: {np.mean(full_data):.10f}")
        print(f"Standard Deviation: {np.std(full_data):.10f}")

        for k in range(8, 17):
            filename = f'{dist_name}_compressed_k{k}.bin'
            if not os.path.exists(filename):
                print(f"Error: '{filename}' not found.")
                continue

            # Fewest whole bytes that can hold the 32 - k retained bits.
            bytes_per_sample = (32 - k + 7) // 8
            if bytes_per_sample == 3:
                # Append a zero fourth byte to every 3-byte record so it can be
                # reinterpreted as a native uint32.  The stored bytes end up as
                # the low-order three only on a little-endian host.
                packed = np.fromfile(filename, dtype=np.uint8).reshape(-1, 3)
                padded = np.hstack((packed, np.zeros((packed.shape[0], 1), dtype=np.uint8)))
                compressed_int = np.frombuffer(padded.tobytes(), dtype=np.uint32)
            elif bytes_per_sample == 2:
                compressed_int = np.fromfile(filename, dtype=np.uint16).astype(np.uint32)
            else:
                print(f"Unsupported bytes_per_sample ({bytes_per_sample}) for k={k}")
                continue

            # Shift the stored bits back into place for the whole array at once
            # and reinterpret the 32-bit patterns as float32.  This avoids a
            # Python-level loop over every sample.
            compressed_data = np.left_shift(compressed_int, np.uint32(k)).view(np.float32)
            print(f"\n### {dist_name.capitalize()} Compressed Data (k={k}) Statistics ###")
            print(f"Mean: {np.mean(compressed_data):.10f}")
            print(f"Standard Deviation: {np.std(compressed_data):.10f}")

# Function to compute and print Mean Squared Error
def calculate_and_print_mse(distributions):
    """Print the MSE between the full-precision and the compressed data.

    For every name in ``distributions`` the file ``<name>_full.bin`` is read
    as float32.  Then, for k = 8..16, the file ``<name>_compressed_k<k>.bin``
    is read, expanded back to float32 and compared sample by sample with the
    full-precision data.  Missing files, unsupported sample widths and files
    whose sample count differs from the full-precision file are reported and
    skipped.

    Args:
        distributions: Iterable of distribution names.  Iterating a dict
            yields its keys, so a name -> data mapping works too.
    """
    for dist_name in distributions:
        full_filename = f'{dist_name}_full.bin'
        if not os.path.exists(full_filename):
            print(f"Error: '{full_filename}' not found.")
            continue
        full_data = np.fromfile(full_filename, dtype=np.float32)
        print(f"\n### {dist_name.capitalize()} Mean Squared Error ###")

        for k in range(8, 17):
            filename = f'{dist_name}_compressed_k{k}.bin'
            if not os.path.exists(filename):
                print(f"Error: '{filename}' not found.")
                continue

            # Fewest whole bytes that can hold the 32 - k retained bits.
            bytes_per_sample = (32 - k + 7) // 8
            if bytes_per_sample == 3:
                # Append a zero fourth byte to every 3-byte record so it can be
                # reinterpreted as a native uint32.  The stored bytes end up as
                # the low-order three only on a little-endian host.
                packed = np.fromfile(filename, dtype=np.uint8).reshape(-1, 3)
                padded = np.hstack((packed, np.zeros((packed.shape[0], 1), dtype=np.uint8)))
                compressed_int = np.frombuffer(padded.tobytes(), dtype=np.uint32)
            elif bytes_per_sample == 2:
                compressed_int = np.fromfile(filename, dtype=np.uint16).astype(np.uint32)
            else:
                print(f"Unsupported bytes_per_sample ({bytes_per_sample}) for k={k}")
                continue

            # Shift the stored bits back into place for the whole array at once
            # and reinterpret the 32-bit patterns as float32.  This avoids a
            # Python-level loop over every sample.
            compressed_data = np.left_shift(compressed_int, np.uint32(k)).view(np.float32)

            # A differing sample count would either raise or, for a one-sample
            # file, silently broadcast and report a meaningless MSE.
            if compressed_data.shape != full_data.shape:
                print(f"Error: '{filename}' holds {compressed_data.size} samples, "
                      f"expected {full_data.size}.")
                continue

            # Accumulate in float64: the result is printed with 11 significant
            # digits, more than float32 arithmetic can deliver.
            error = full_data.astype(np.float64) - compressed_data
            mse = np.mean(error ** 2)
            print(f"MSE for k={k}: {mse:.10e}")

# Draw one million float32 samples from each of three distributions
n = 1_000_000  # 1 million samples
distributions = {}
# Uniform on [0, 1e-5), offset by 1.0
distributions['uniform'] = (np.random.uniform(0, 1e-5, n) + 1.0).astype(np.float32)
# Normal with mean 1.0 and standard deviation 1e-5
distributions['gaussian'] = np.random.normal(loc=1.0, scale=1e-5, size=n).astype(np.float32)
# Exponential with scale 1e-5
distributions['exponential'] = np.random.exponential(scale=1e-5, size=n).astype(np.float32)

for dist_name, data in distributions.items():
    print(f"Generated {n} 32-bit float samples for {dist_name} distribution.")

# Compress data for k=8 to 16
k_values = list(range(8, 17))  # k = 8, 9, 10, ..., 16
compressed_packed_data = {}
for dist_name, data in distributions.items():
    # Reinterpret the float32 samples as their raw 32-bit patterns once per
    # distribution; every k below reuses the same view.
    bit_patterns = data.view(np.uint32)
    is_zero = data == 0.0
    compressed_packed_data[dist_name] = {}
    for k in k_values:
        # Same result as calling compress_and_pack_float_32 on every sample:
        # clearing the low k bits and then shifting right by k equals a plain
        # right shift by k, and samples equal to 0.0 (including -0.0) encode
        # as 0.  Doing it as one array operation avoids a Python-level call
        # per sample.
        compressed_packed_data[dist_name][k] = np.where(
            is_zero, np.uint32(0), bit_patterns >> np.uint32(k)
        )
        print(f"Compressed {dist_name} data with k={k} (reducing to {32 - k} bits).")

# Save full-precision and compressed data
for dist_name, data in distributions.items():
    full_filename = f'{dist_name}_full.bin'
    data.tofile(full_filename)
    print(f"Saved full-precision {dist_name} data to '{full_filename}'.")

    for k in k_values:
        # Fewest whole bytes that can hold the 32 - k retained bits.
        bytes_per_sample = (32 - k + 7) // 8
        filename = f'{dist_name}_compressed_k{k}.bin'
        if bytes_per_sample == 3:  # 17 to 24 bits, i.e. k=8 to 15
            # View every uint32 as 4 bytes and keep the first three; these are
            # the low-order bytes only on a little-endian host.
            packed = np.frombuffer(compressed_packed_data[dist_name][k].tobytes(), dtype=np.uint8).reshape(-1, 4)[:, :3]
            packed.tofile(filename)
        elif bytes_per_sample == 2:  # 9 to 16 bits; within k_values only k=16
            compressed_packed_data[dist_name][k].astype(np.uint16).tofile(filename)
        else:
            # Nothing was written, so do not report the file as saved.
            print(f"Unsupported bytes_per_sample ({bytes_per_sample}) for k={k}")
            continue
        print(f"Saved compressed {dist_name} data (k={k}) to '{filename}' as {bytes_per_sample}-byte integers.")

# Report the on-disk size of the full-precision file and of every compressed file
print("\nFile Sizes:")
for dist_name in distributions:
    full_size = os.path.getsize(f'{dist_name}_full.bin')
    print(f"{dist_name.capitalize()} full-precision file size: {full_size} bytes")

    # Sizes of the compressed files for this distribution, keyed by k
    compressed_sizes = {}
    for k in k_values:
        compressed_path = f'{dist_name}_compressed_k{k}.bin'
        compressed_sizes[k] = os.path.getsize(compressed_path)
        print(
            f"{dist_name.capitalize()} compressed file size (k={k}, {32 - k} bits): "
            f"{compressed_sizes[k]} bytes"
        )

# Report how much smaller each compressed file is, as a percentage of the full file
print("\nStorage Savings:")
for dist_name in distributions:
    full_size = os.path.getsize(f'{dist_name}_full.bin')
    print(f"\n{dist_name.capitalize()}:")
    for k in k_values:
        compressed_size = os.path.getsize(f'{dist_name}_compressed_k{k}.bin')
        bytes_saved = full_size - compressed_size
        savings = bytes_saved / full_size * 100
        print(f"For k={k} ({32 - k} bits): {savings:.2f}%")

# Print mean and standard deviation of the full-precision data and of every
# compressed variant, read back from the files written above
calculate_and_print_stats(distributions)

# Print the mean squared error of each compressed variant against the
# full-precision data
calculate_and_print_mse(distributions)

Generated 1000000 32-bit float samples for uniform distribution.
Generated 1000000 32-bit float samples for gaussian distribution.
Generated 1000000 32-bit float samples for exponential distribution.
Compressed uniform data with k=8 (reducing to 24 bits).
Compressed uniform data with k=9 (reducing to 23 bits).
Compressed uniform data with k=10 (reducing to 22 bits).
Compressed uniform data with k=11 (reducing to 21 bits).
Compressed uniform data with k=12 (reducing to 20 bits).
Compressed uniform data with k=13 (reducing to 19 bits).
Compressed uniform data with k=14 (reducing to 18 bits).
Compressed uniform data with k=15 (reducing to 17 bits).
Compressed uniform data with k=16 (reducing to 16 bits).
Compressed gaussian data with k=8 (reducing to 24 bits).
Compressed gaussian data with k=9 (reducing to 23 bits).
Compressed gaussian data with k=10 (reducing to 22 bits).
Compressed gaussian data with k=11 (reducing to 21 bits).
Compressed gaussian data with k=12 (reducing to 20 bits).
C