# core

> Utility for automating backups of a specific file or directory

In [None]:
#| default_exp core

# Automated Backups

We want a script to back up a specific file/folder over different intervals. Specifically, it should

- Copy to some destination dir every hour (e.g. a different drive)
- Keep the last 5, and one every day, week and month (for example)

We can then rsync the destination dir to keep a remote backup.

In [None]:
#|export
import shutil, os, time, pprint, logging
from pathlib import Path
from fastcore.script import call_parse
from fastcore.xtras import globtastic
from datetime import datetime, timedelta

In [None]:
!mkdir -p demo_src
!mkdir -p demo_dst
!rm -r demo_dst/*
!rm -r demo_src/*

In [None]:
!echo "content" > "demo_src/test_text.txt"
!echo "## content" > "demo_src/test_two.md"

## The core functionality

The plan has two main steps:

- Create a new backup
- Clean up any old backups that are no longer needed.


For step 1 we want to go file by file in case of errors, and support a matching pattern for what to include. So, take 2:

In [None]:
globtastic("demo_src", file_glob="*.md") # Finding files

(#1) ['demo_src/test_two.md']

In [None]:
#| export
def create_backup(src, dest_dir, pattern=None, skip_pattern=None):
    """Copy `src` into a new timestamped directory under `dest_dir` and return that directory.

    `src` may be a single file or a directory. For a directory, `pattern` and
    `skip_pattern` are passed to `globtastic` as `file_glob` / `skip_file_glob`;
    with neither set, every file below `src` is copied. The layout relative to
    `src` is preserved. A file that cannot be copied is logged as a warning and
    skipped so the rest of the backup still completes.

    Raises `FileNotFoundError` if `src` does not exist. The check happens before
    anything is created under `dest_dir`, so a bad source path cannot produce an
    empty backup directory.
    """
    src_path = Path(src)
    if not src_path.exists():
        raise FileNotFoundError(f"Backup source does not exist: {src_path}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    dest_path = Path(dest_dir) / timestamp
    dest_path.mkdir(parents=True, exist_ok=True)

    if src_path.is_file():
        # A lone file is stored directly inside the backup directory under its own name
        base_path, files_to_copy = src_path.parent, [src_path]
    elif pattern or skip_pattern:
        base_path = src_path
        matches = globtastic(src_path, file_glob=pattern, skip_file_glob=skip_pattern)
        files_to_copy = [Path(f) for f in matches]
    else:
        # Snapshot the listing up front instead of walking the tree lazily while copying
        base_path, files_to_copy = src_path, list(src_path.rglob('*'))

    for file in files_to_copy:
        if not file.is_file(): continue
        dest_file = dest_path / file.relative_to(base_path)
        try:
            dest_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(file, dest_file)
        except Exception as e:
            # Best effort: one problematic file must not abort the whole backup
            logging.warning(f"Failed to copy {file}: {e}")
    return dest_path

In [None]:
create_backup('demo_src', 'demo_dst')
!ls demo_dst

20241127_143724


In [None]:
!ls demo_dst/20241127_143724

test_text.txt  test_two.md


In [None]:
# Test single file
create_backup('demo_src/test_text.txt', 'demo_dst')
!ls demo_dst

20241127_143724  20241127_143733


In [None]:
!ls demo_dst/20241127_143733/

test_text.txt


In [None]:
# Test pattern
create_backup('demo_src', 'demo_dst', pattern='*.md')
!ls demo_dst

20241127_143724  20241127_143733  20241127_143739


In [None]:
!ls demo_dst/20241127_143739/

test_two.md


In [None]:
# Test skip_pattern
create_backup('demo_src', 'demo_dst', skip_pattern='*.md')
!ls demo_dst

20241127_143724  20241127_143733  20241127_143739  20241127_143742


In [None]:
!ls demo_dst/20241127_143742/

test_text.txt


The harder part is the cleanup. Let's start by generating some dates to test with.

In [None]:
def generate_test_dates(num_dates, base_date):
    "Return `num_dates` timestamp strings spaced one hour apart, starting at `base_date`"
    stamps = []
    for offset in range(num_dates):
        moment = base_date + timedelta(hours=offset)
        stamps.append(moment.strftime("%Y%m%d_%H%M%S"))
    return stamps
test_dates = generate_test_dates(2400, datetime.now() - timedelta(days=100))
print(test_dates[:5], test_dates[-5:])

['20240819_143748', '20240819_153748', '20240819_163748', '20240819_173748', '20240819_183748'] ['20241127_093748', '20241127_103748', '20241127_113748', '20241127_123748', '20241127_133748']


In [None]:
# Can I get all dates < 2 months old?
[d for d in test_dates if (datetime.now() - datetime.strptime(d, '%Y%m%d_%H%M%S')).days < 60][:3]

['20240928_153748', '20240928_163748', '20240928_173748']

Now we want to grab the most recent 5 and then, for each age threshold, the oldest backup that is still younger than that threshold.

In [None]:
#| export 
def clean_dates(dates, now=None, max_ages=(2, 14, 60)):
    """Return the sorted subset of `dates` that should be kept.

    `dates` is any iterable of `%Y%m%d_%H%M%S` timestamp strings; it is not
    modified. `now` defaults to the current time. For each entry of `max_ages`
    (in days) the oldest timestamp younger than that age is kept, along with
    the five newest timestamps. Duplicates are removed.

    Raises `ValueError` if a timestamp has to be parsed and does not match the
    expected format.
    """
    fmt = '%Y%m%d_%H%M%S'
    now = now or datetime.now()
    # Work on a sorted copy so the caller's list keeps its order; the fixed-width
    # format means string order is chronological order.
    ordered = sorted(dates)
    # Parse each timestamp once instead of once per threshold
    ages = [(now - datetime.strptime(d, fmt)).days for d in ordered] if max_ages else []

    keep = set(ordered[-5:])  # Keep the newest 5
    for max_age in max_ages:
        # `ordered` is oldest-first, so the first match is the oldest one below this threshold
        oldest = next((d for d, age in zip(ordered, ages) if age < max_age), None)
        if oldest is not None: keep.add(oldest)
    return sorted(keep)

In [None]:
clean_dates(test_dates)

['20240928_153748',
 '20241113_153748',
 '20241125_153748',
 '20241127_093748',
 '20241127_103748',
 '20241127_113748',
 '20241127_123748',
 '20241127_133748']

Next we want to check that this behaves sensibly over time. Starting from the same test dates as above, we simulate time passing: each step adds an hour to `now`, appends a new date to the test dates, and then runs `clean_dates`. Once per simulated day we pretty-print the result, so we can check it is doing what we expect over a simulated month.

In [None]:
# Initialize
now = datetime.now()
test_dates = generate_test_dates(2400, now - timedelta(days=100))

# Simulate a month of hourly backups (30 days * 24 hours).
# The counter is named `hour` rather than `_`: `_` conventionally marks an unused
# value, and in IPython/Jupyter it is also the last-result variable.
for hour in range(30 * 24):
    now += timedelta(hours=1)
    test_dates.append(now.strftime("%Y%m%d_%H%M%S"))
    test_dates = clean_dates(test_dates, now)  # Clean up old dates
    if hour % 24 == 0:  # Print once a day
        print(f"\nDay {hour // 24 + 1}:")
        pprint.pprint(test_dates)


Day 1:
['20240928_163752',
 '20241113_163752',
 '20241125_163752',
 '20241127_103752',
 '20241127_113752',
 '20241127_123752',
 '20241127_133752',
 '20241127_153752']

Day 2:
['20241113_163752',
 '20241125_163752',
 '20241127_103752',
 '20241128_113752',
 '20241128_123752',
 '20241128_133752',
 '20241128_143752',
 '20241128_153752']

Day 3:
['20241113_163752',
 '20241125_163752',
 '20241129_053752',
 '20241129_113752',
 '20241129_123752',
 '20241129_133752',
 '20241129_143752',
 '20241129_153752']

Day 4:
['20241113_163752',
 '20241125_163752',
 '20241129_053752',
 '20241130_113752',
 '20241130_123752',
 '20241130_133752',
 '20241130_143752',
 '20241130_153752']

Day 5:
['20241113_163752',
 '20241125_163752',
 '20241201_003752',
 '20241201_113752',
 '20241201_123752',
 '20241201_133752',
 '20241201_143752',
 '20241201_153752']

Day 6:
['20241113_163752',
 '20241125_163752',
 '20241201_003752',
 '20241202_113752',
 '20241202_123752',
 '20241202_133752',
 '20241202_143752',
 '20241202_1

NB: Yay, it looks to be doing mostly what I want! I can collapse the output, if you're viewing this in a notebook my apologies :)

## Turning it into a script

Now that those two pieces of functionality seem to be working, we can wrap this up as a script using fastcore's call_parse, have it run the backup, clean up old files, and log any errors or messages to backup.log

In [None]:
#|export
def _is_backup_name(name):
    "Return True if `name` has the `%Y%m%d_%H%M%S` form used for backup directory names"
    try: datetime.strptime(name, '%Y%m%d_%H%M%S')
    except ValueError: return False
    return True

@call_parse
def run_backup(
    src:str, # The source to be backed up
    dest:str, # The destination directory
    max_ages:str="2,14,60", # The max age(s) in days for the different backups
    log_file:str='backup.log', # The file that messages and errors are logged to
    pattern:str=None, # Globtastic file_glob pattern
    skip_pattern:str=None # Globtastic skip_file_glob pattern
):
    "Run backup and cleanup old files"

    # Set up logging
    logging.basicConfig(filename=log_file, level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    try:
        create_backup(src, dest, pattern=pattern, skip_pattern=skip_pattern)
        logging.info(f"Backup created: {src} -> {dest}")
    except Exception as e:
        logging.error(f"Backup failed: {str(e)}", exc_info=True)

    # Cleanup still runs after a failed backup, but its own failures (bad `max_ages`,
    # missing `dest`, ...) are now logged instead of escaping as an unlogged traceback.
    try:
        ages = [int(age.strip()) for age in max_ages.split(',')]
        dest_path = Path(dest)
        # Only directories named like a backup timestamp are candidates for removal;
        # anything else living in `dest` is left untouched.
        backups = [d.name for d in dest_path.iterdir() if d.is_dir() and _is_backup_name(d.name)]
        to_keep = set(clean_dates(backups, max_ages=ages))
    except Exception as e:
        logging.error(f"Cleanup failed: {str(e)}", exc_info=True)
        return

    for backup in backups:
        if backup in to_keep: continue
        try:
            shutil.rmtree(dest_path / backup)
            logging.info(f"Removed old backup: {backup}")
        except Exception as e:
            logging.error(f"Removing old backup failed: {str(e)}", exc_info=True)

In [None]:
!ls demo_src

test_text.txt  test_two.md


Testing a directory:

In [None]:
!rm -r demo_dst/*

In [None]:
run_backup('demo_src', 'demo_dst',)
!ls demo_dst

20241127_143942


In [None]:
!ls demo_dst/20241127_143942

test_text.txt  test_two.md


Testing a skip pattern:

In [None]:
!rm -r demo_dst/*

In [None]:
run_backup('demo_src', 'demo_dst', skip_pattern="*.md")

In [None]:
!ls demo_dst

20241127_144023


In [None]:
!ls demo_dst/20241127_144023

test_text.txt
