In [None]:
from datetime import datetime
import hashlib
import os
import pickle
import time

class PickleUtils:
    """Helpers for persisting crawled HTML pages as pickle files.

    Each page is first written to ``<md5(url)>_<timestamp>_temp.pickle`` and
    then renamed to ``<md5(url)>_<timestamp>.pickle`` once the write has
    finished, so a reader that skips ``*_temp.pickle`` files never sees a
    half-written pickle.
    """

    # Suffix carried by a file while it is still being written.
    _TEMP_SUFFIX = "_temp.pickle"
    # Suffix of a completely written file.
    _FINAL_SUFFIX = ".pickle"

    @staticmethod
    def _generate_url_hash(url: str) -> str:
        """Return the hexadecimal MD5 digest of ``url``.

        The digest is only used to build a file name, not for security.
        """
        return hashlib.md5(url.encode()).hexdigest()
    # print(PickleUtils._generate_url_hash('https://example.com/'))    # 182ccedb33a9e03fbf1079b209da1a31

    @staticmethod
    def _get_file_name(url: str) -> str:
        """Build the temporary file name for ``url``.

        The name has the form ``{url_hash}_{timestamp}_temp.pickle`` where
        ``url_hash`` is the MD5 digest of the URL and ``timestamp`` is the
        current local time formatted as ``%Y%m%d_%H%M%S_%f``.
        """
        url_hash = PickleUtils._generate_url_hash(url)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        return f"{url_hash}_{timestamp}{PickleUtils._TEMP_SUFFIX}"

    @staticmethod
    def save_html(url, html_content) -> None:
        """Pickle ``{'url': url, 'html_content': html_content}`` to disk.

        The data is written to a ``*_temp.pickle`` file in the current
        working directory and then renamed (in the same directory) to the
        final ``*.pickle`` name.

        Raises:
            OSError: if the file cannot be written or renamed.
        """
        # 1) get the temporary file name
        temp_name = PickleUtils._get_file_name(url)

        # 2) write the payload to <name>_temp.pickle
        data = {'url': url, 'html_content': html_content}
        with open(temp_name, 'wb') as f:
            pickle.dump(data, f)

        # NOTE(review): this directory is created but nothing is stored in it;
        # the final file stays next to the temp file. Kept so the on-disk
        # layout is unchanged -- confirm whether files were meant to move here.
        os.makedirs('pickles', exist_ok=True)

        # 3) rename <name>_temp.pickle to <name>.pickle
        # Strip only the trailing suffix instead of replacing every "_temp"
        # occurrence in the name.
        final_name = (
            temp_name[: -len(PickleUtils._TEMP_SUFFIX)] + PickleUtils._FINAL_SUFFIX
        )
        os.rename(temp_name, final_name)

    @staticmethod
    def load_pickle(filename=None):
        """Load and return the object stored in the pickle file ``filename``.

        When ``filename`` is empty or ``None`` a hard-coded sample file name
        is used instead.

        WARNING: unpickling can execute arbitrary code. Only load files that
        were produced by :meth:`save_html` or come from another trusted source.

        Raises:
            OSError: if the file cannot be opened.
        """
        if not filename:
            filename = "fa5b40e417c5cb81fb5c31d6ba6903da_20250110_212913_581403.pickle"
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        return data

if __name__ == '__main__':
    # Quick manual check: build a temp file name for a sample URL.
    # The result is not printed or stored.
    pickle_utils = PickleUtils
    pickle_utils._get_file_name('https://example.com/')   # e.g. '182ccedb33a9e03fbf1079b209da1a31_20250107_152420_350083_temp.pickle'

'182ccedb33a9e03fbf1079b209da1a31_20250111_105205_564156_temp.pickle'

# Script to periodically upload zip files to S3 / Hugging Face (HF)

### TODO:
* Make sure the pickle files are actually listed (e.g. by `os.listdir`) while crawling, i.e. that the path to the pickle files is provided correctly.
* Set `HF_TOKEN` in the environment variables.


In [None]:
from pathlib import Path
import uuid
import zipfile


from s3_v2 import Ec2Functions
from huggingface_hub import HfApi
from python_dotenv import load_dotenv
load_dotenv()
def zip_pickles(pickle_dir: str = "./") -> str:    # self
    """Zip all finished pickle files in ``pickle_dir`` and delete the originals.

    * Every ``*.pickle`` file that does not end in ``_temp.pickle`` is added
      to a new archive named ``<uuid4>.zip`` in the current working directory.
    * A ``manifest.txt`` listing the creation time and the archived files is
      added to the archive.
    * The raw pickle files are deleted once the archive has been closed.

    An archive (containing only the manifest) is still created when there
    are no pickle files.

    Args:
        pickle_dir: directory that is searched (non-recursively) for pickles.

    Returns:
        The file name of the created zip archive.

    Raises:
        OSError: if the archive cannot be written or a raw file cannot be
            deleted.
    """
    zip_filename = f"{uuid.uuid4()}.zip"

    # Skip "*_temp.pickle" files: they may still be in the middle of a write.
    # Sorted so the archive and manifest order is deterministic.
    pickle_files = sorted(
        f for f in Path(pickle_dir).glob("*.pickle")
        if not f.name.endswith("_temp.pickle")
    )

    # 1) Zip .pickle files
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        print(f'pickle files: {pickle_files}')
        for file in pickle_files:
            zipf.write(file, file.name)

        # Build the manifest line by line so that no source-code indentation
        # leaks into the text.
        manifest_lines = [
            f"Backup created on: {datetime.now()}",
            f"Total files: {len(pickle_files)}",
            "Files included:",
        ]
        manifest_lines.extend(f"- {f.name}" for f in pickle_files)
        zipf.writestr("manifest.txt", "\n".join(manifest_lines) + "\n")

    # 2) Delete raw files only after the archive has been fully written.
    for file in pickle_files:
        file.unlink()

    return zip_filename

def upload_zip_to_s3(zip_filename: str) -> bool:    # self
    """Upload ``zip_filename`` to the ``1b-bucket`` S3 bucket, then delete it locally.

    The upload goes through ``Ec2Functions.upload_file`` and uses the file
    name as the object key. Any exception (from the upload or from deleting
    the local file) is printed, not raised.

    Args:
        zip_filename: path of the local zip archive.

    Returns:
        ``True`` when the upload and the local delete both succeeded,
        ``False`` otherwise (same convention as ``upload_zip_to_hf``).
    """
    try:
        # 1) upload zip file to s3
        Ec2Functions.upload_file(file_path=zip_filename, bucket_name='1b-bucket', object_key=zip_filename)

        # 2) remove local zip file
        os.remove(zip_filename)
    except Exception as ex:
        # Best effort: a failed upload must not stop the caller.
        print('failed to upload to s3', ex)
        return False
    return True

# Shared Hugging Face Hub client used by upload_zip_to_hf.
api = HfApi()


def upload_zip_to_hf(zip_filename: str) -> bool:
    """Upload ``zip_filename`` to the Hugging Face dataset repo, then delete it locally.

    The archive is stored at ``scrapy_engine/raw_chunks/<zip_filename>`` in
    the ``Aananda-giri/nepali_llm_datasets`` dataset repository. The access
    token is read from the ``HF_TOKEN`` environment variable. Any exception
    (from the upload or from deleting the local file) is printed, not raised.

    Args:
        zip_filename: path of the local zip archive.

    Returns:
        ``True`` when the upload and the local delete both succeeded,
        ``False`` otherwise.
    """
    try:
        # 1) upload zip file to huggingface
        api.upload_file(
            path_or_fileobj=zip_filename,
            path_in_repo=f'scrapy_engine/raw_chunks/{zip_filename}',
            repo_id="Aananda-giri/nepali_llm_datasets",
            repo_type="dataset",
            # This used to be an undefined name `token`, so every call
            # raised NameError and was reported as a failed upload.
            token=os.environ.get("HF_TOKEN"),
        )

        # 2) remove local zip file
        os.remove(zip_filename)

        return True  # success
    except Exception as ex:
        # Best effort: the caller falls back to another destination on False.
        print('failed to upload to hf', ex)
        return False

def get_pickles_size(pickle_dir: str = "./"):
    """Return the combined size, in MiB, of the finished pickle files in ``pickle_dir``.

    Only ``*.pickle`` files directly inside ``pickle_dir`` are counted;
    files whose name ends in ``_temp.pickle`` are ignored.
    """
    bytes_per_mb = 1024 * 1024
    return sum(
        entry.stat().st_size / bytes_per_mb
        for entry in Path(pickle_dir).glob("*.pickle")
        if not entry.name.endswith("_temp.pickle")
    )


# Upload policy:
#   * zip and upload once every hour,
#   * or as soon as the finished pickles exceed 100 MB
#     (the size is checked once every 5 minutes).

UPLOAD_INTERVAL_SECONDS = 3600      # one hour
MAX_PENDING_SIZE_MB = 100
POLL_INTERVAL_SECONDS = 5 * 60      # five minutes

start_time = time.time()

while True:
    time_elapsed = time.time() - start_time

    pickle_files_size_mb = get_pickles_size()
    if time_elapsed > UPLOAD_INTERVAL_SECONDS or pickle_files_size_mb > MAX_PENDING_SIZE_MB:
        start_time = time.time()    # reset start_time

        zip_filename = zip_pickles()
        uploaded_to_hf = upload_zip_to_hf(zip_filename)
        if not uploaded_to_hf:
            # Hugging Face failed: fall back to S3.
            upload_zip_to_s3(zip_filename)

        # Both uploaders delete the archive themselves after a successful
        # upload, so removing it again here raised FileNotFoundError.
        # If the archive is still on disk, neither upload worked. The raw
        # pickles are already gone, so keep it rather than lose the data.
        if os.path.exists(zip_filename):
            print(f'upload failed, keeping {zip_filename} for a manual retry')

    # sleep for 5 minutes
    time.sleep(POLL_INTERVAL_SECONDS)

In [None]:
import threading

def __main__(self):
    '''
        main of worker spider in scrapy

        Starts the background upload service in a daemon thread, so the
        thread does not keep the process alive on exit.
    '''
    background_service = BackgroundUploadService()
    # Pass the bound method itself. Writing `run()` here would execute the
    # service on the calling thread and hand its return value to Thread.
    background_upload_thread = threading.Thread(
        target=background_service.run,
        daemon=True,
    )
    background_upload_thread.start()
