#
* md5 hash and uuid seem to be good options.
* lets compare the speed of both.
(md5 hash seems to be faster)

In [4]:
import hashlib
import uuid
import time
from statistics import mean, stdev
from typing import List, Tuple

def benchmark(iterations: int = 100000) -> Tuple[List[float], List[float]]:
    """Run benchmark comparing MD5 hash vs UUID generation."""
    url = "https://www.example.com/very/long/path/with/parameters?param1=value1&param2=value2" #*100
    
    hash_times = []
    uuid_times = []
    
    for _ in range(iterations):
        # Benchmark MD5
        start = time.perf_counter()
        hashlib.md5(url.encode()).hexdigest()
        hash_times.append(time.perf_counter() - start)
        
        # Benchmark UUID
        start = time.perf_counter()
        str(uuid.uuid4())
        uuid_times.append(time.perf_counter() - start)
    
    return hash_times, uuid_times

# Run benchmark
iterations = 100000
hash_times, uuid_times = benchmark(iterations)

# Calculate statistics
hash_mean = mean(hash_times) * 1000000  # Convert to microseconds
uuid_mean = mean(uuid_times) * 1000000
hash_std = stdev(hash_times) * 1000000
uuid_std = stdev(uuid_times) * 1000000

print(f"Results over {iterations:,} iterations:\n")
print("MD5 Hash:")
print(f"  Average time: {hash_mean:.2f} microseconds")
print(f"  Std dev:     {hash_std:.2f} microseconds")
print("\nUUID:")
print(f"  Average time: {uuid_mean:.2f} microseconds")
print(f"  Std dev:     {uuid_std:.2f} microseconds")

print(f"\nUUID is {(hash_mean/uuid_mean):.2f}x faster than MD5 hash")

# Generate example outputs
url = "https://ekantipur.com/news/2025/01/07/tibet-earthquake-at-least-95-dead-more-than-130-injured-19-43.html"
print("\nExample outputs:")
print(f"MD5 Hash: {hashlib.md5(url.encode()).hexdigest()}")
print(f"UUID:    {str(uuid.uuid4())}")

Results over 100,000 iterations:

MD5 Hash:
  Average time: 1.36 microseconds
  Std dev:     1.03 microseconds

UUID:
  Average time: 8.19 microseconds
  Std dev:     4.44 microseconds

UUID is 0.17x faster than MD5 hash

Example outputs:
MD5 Hash: 655eb937aace7ce932b572f303293d35
UUID:    e4446d4a-6751-40a8-adfa-c1e24e089ecc


## generate file name using md5_hash
lets create some pickle files with with hashed url  + timestamp as file name and contain dummy data and test our zipping code for those files. 

lets use uuid as zipped file name and create some temp files to make sure our code works as expected.


This test suite:

1. Creates pickle files with:
   - URL hash + timestamp naming
   - Both regular and temp files
   - Dummy data inside each pickle

2. Zips the files with:
   - UUID as zip filename
   - Excludes _temp.pickle files
   - Includes a manifest

3. Verifies:
   - Temp files are excluded
   - Files are properly compressed
   - Manifest is included

When you run it, you'll see:
1. List of created regular and temp files
2. The UUID-based zip filename
3. Contents of the zip file
4. Verification that temp files were excluded

Would you like me to add any additional test cases or verification steps?

In [None]:
# import hashlib
# import time
# from datetime import datetime
# from pathlib import Path
# import os

# class PickleFilePandit:
#     '''
#     * PickleFilePandit names the pickle files
#     '''
#     def __init__(self, base_dir: str = "pickles"):
#         self.base_dir = Path(base_dir)
#         self.base_dir.mkdir(exist_ok=True)
    
#     def get_url_hash(self, url: str) -> str:
#         """Generate MD5 hash of URL."""
#         return hashlib.md5(url.encode()).hexdigest()
    
#     def generate_filename(self, url: str, include_date: bool = True) -> Path:
#         """Generate a unique filename for the URL."""
#         url_hash = self.get_url_hash(url)
        
#         if include_date:
#             # Use timestamp for uniqueness and chronological ordering
#             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
#             filename = f"{url_hash}_{timestamp}.pickle"
#         else:
#             # If you want to overwrite previous files for the same URL
#             filename = f"{url_hash}.pickle"
            
#         return self.base_dir / filename
    
#     def get_files_for_url(self, url: str) -> list[Path]:
#         """Get all pickle files associated with a URL."""
#         url_hash = self.get_url_hash(url)
#         return sorted(self.base_dir.glob(f"{url_hash}_*.pickle"))
    
#     def get_latest_file_for_url(self, url: str) -> Path | None:
#         """Get the most recent pickle file for a URL."""
#         files = self.get_files_for_url(url)
#         return files[-1] if files else None
    
#     def cleanup_old_files(self, url: str, keep_latest_n: int = 5):
#         """Remove old pickle files, keeping only the n most recent ones."""
#         files = self.get_files_for_url(url)
#         for file in files[:-keep_latest_n]:
#             file.unlink()

# # Example usage
# if __name__ == "__main__":
#     pandit = PickleFilePandit()
    
#     # Example URL
#     url = "https://example.com/page1"
    
#     # Generate filename
#     pickle_path = pandit.generate_filename(url)
#     print(f"Generated filename: {pickle_path}")
    
#     # Simulate multiple threads/processes saving files
#     for _ in range(3):
#         filename = pandit.generate_filename(url)
#         with open(filename, 'wb') as f:
#             f.write(b'test data')
#         time.sleep(0.1)  # Simulate some processing time
    
#     # List all files for the URL
#     print("\nAll files for URL:")
#     for file in pandit.get_files_for_url(url):
#         print(f"- {file}")
    
#     # Get latest file
#     latest = pandit.get_latest_file_for_url(url)
#     print(f"\nLatest file: {latest}")
    
#     # Cleanup old files
#     pandit.cleanup_old_files(url, keep_latest_n=2)
#     print("\nAfter cleanup:")
#     for file in pandit.get_files_for_url(url):
#         print(f"- {file}")

Generated filename: pickles/d22158c78143eeca7fa617577d741866_20250107_150032_616503.pickle

All files for URL:
- pickles/d22158c78143eeca7fa617577d741866_20250107_144244_573921.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_144244_674361.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_145149_330036.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_145149_372245_temp.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_150032_616696.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_150032_718720.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_150032_819726.pickle

Latest file: pickles/d22158c78143eeca7fa617577d741866_20250107_150032_819726.pickle

After cleanup:
- pickles/d22158c78143eeca7fa617577d741866_20250107_150032_718720.pickle
- pickles/d22158c78143eeca7fa617577d741866_20250107_150032_819726.pickle


In [None]:
import hashlib
import time
import pickle
from datetime import datetime
import uuid
from pathlib import Path
import zipfile
import random

class PickleTestSuite:
    def __init__(self, base_dir: str = "pickles"):
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(exist_ok=True)
        
    def generate_url_hash(self, url: str) -> str:
        """Generate MD5 hash of URL."""
        return hashlib.md5(url.encode()).hexdigest()
    
    def create_pickle_filename(self, url: str, is_temp: bool = False) -> str:
        """Create filename with hash and timestamp."""
        url_hash = self.generate_url_hash(url)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        suffix = "_temp.pickle" if is_temp else ".pickle"
        return f"{url_hash}_{timestamp}{suffix}"
    
    def create_dummy_data(self) -> dict:
        """Create some dummy data to pickle."""
        return {
            'timestamp': datetime.now(),
            'random_number': random.randint(1, 1000),
            'sample_text': f"Sample data {random.randint(1, 100)}",
            'metrics': {
                'value1': random.random(),
                'value2': random.random()
            }
        }
    
    def create_test_files(self, num_regular: int = 3, num_temp: int = 2) -> tuple[list[Path], list[Path]]:
        """Create test pickle files."""
        regular_files = []
        temp_files = []
        
        # Sample URLs
        urls = [
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3",
            "https://example.com/page4",
            "https://example.com/page5"
        ]
        
        # Create regular files
        for i in range(num_regular):
            url = urls[i % len(urls)]
            filename = self.create_pickle_filename(url)
            filepath = self.base_dir / filename
            
            with open(filepath, 'wb') as f:
                pickle.dump(self.create_dummy_data(), f)
            
            regular_files.append(filepath)
            time.sleep(0.01)  # Ensure unique timestamps
        
        # Create temp files
        for i in range(num_temp):
            url = urls[i % len(urls)]
            filename = self.create_pickle_filename(url, is_temp=True)
            filepath = self.base_dir / filename
            
            with open(filepath, 'wb') as f:
                pickle.dump(self.create_dummy_data(), f)
            
            temp_files.append(filepath)
            time.sleep(0.01)  # Ensure unique timestamps
        
        return regular_files, temp_files

def zip_pickles(pickle_dir: str = "pickles") -> str:
    """Zip pickle files using UUID as filename."""
    zip_filename = f"{uuid.uuid4()}.zip"
    
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        pickle_path = Path(pickle_dir)
        
        # Get all non-temp pickle files
        pickle_files = [
            f for f in pickle_path.glob("*.pickle")
            if not f.name.endswith("_temp.pickle")
        ]
        
        # Add files to zip
        for file in pickle_files:
            zipf.write(file, file.name)
        
        # Add manifest
        manifest = f"""Backup created on: {datetime.now()}
                    Total files: {len(pickle_files)}
                    Files included:
                    {chr(10).join(f'- {f.name}' for f in pickle_files)}
                    """
        zipf.writestr("manifest.txt", manifest)
    
    return zip_filename

# Run test
if __name__ == "__main__":
    # Create test suite
    test_suite = PickleTestSuite()
    
    # Create test files
    print("Creating test files...")
    regular_files, temp_files = test_suite.create_test_files(num_regular=4, num_temp=2)
    
    print("\nRegular files created:")
    for file in regular_files:
        print(f"- {file.name}")
    
    print("\nTemp files created:")
    for file in temp_files:
        print(f"- {file.name}")
    
    # Zip the files
    print("\nZipping files...")
    zip_file = zip_pickles()
    
    # Show zip contents
    print(f"\nCreated zip file: {zip_file}")
    print("\nZip contents:")
    with zipfile.ZipFile(zip_file, 'r') as zipf:
        for file in zipf.namelist():
            if file == "manifest.txt":
                print("\nManifest contents:")
                print(zipf.read(file).decode())
            else:
                print(f"- {file}")
    
    # Verify temp files were excluded
    print("\nVerifying exclusion of temp files...")
    with zipfile.ZipFile(zip_file, 'r') as zipf:
        files_in_zip = zipf.namelist()
        for temp_file in temp_files:
            if temp_file.name in files_in_zip:
                print(f"Warning: Temp file {temp_file.name} was incorrectly included!")
            else:
                print(f"Success: Temp file {temp_file.name} was correctly excluded!")

Creating test files...

Regular files created:
- d22158c78143eeca7fa617577d741866_20250107_150110_484189.pickle
- ff824738a7790aa236b456ddb7f31593_20250107_150110_494624.pickle
- 81f56e75d174cd01fe4bdf3e6e536da9_20250107_150110_505332.pickle
- 355bff9458982feae78adad4f78ec912_20250107_150110_516223.pickle

Temp files created:
- d22158c78143eeca7fa617577d741866_20250107_150110_526903_temp.pickle
- ff824738a7790aa236b456ddb7f31593_20250107_150110_537645_temp.pickle

Zipping files...

Created zip file: fb23bbc4-0190-4914-a6f4-82493ef76b70.zip

Zip contents:
- ff824738a7790aa236b456ddb7f31593_20250107_150110_494624.pickle
- d22158c78143eeca7fa617577d741866_20250107_150110_484189.pickle
- 355bff9458982feae78adad4f78ec912_20250107_150110_516223.pickle
- 81f56e75d174cd01fe4bdf3e6e536da9_20250107_150110_505332.pickle

Manifest contents:
Backup created on: 2025-01-07 15:01:10.551214
Total files: 4
Files included:
- ff824738a7790aa236b456ddb7f31593_20250107_150110_494624.pickle
- d22158c78143eec