diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index b97723d..0727600 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -28,7 +28,7 @@ The tool uses an iterative, stack-based, depth-first search approach with `os.sc - **`scan_files_and_dirs(root_path, used_bytes, min_size)` in `zpace/core.py`**: The main driver function. It uses an iterative, stack-based approach with `os.scandir` to traverse the directory tree, handles special directories, and aggregates file/directory statistics. - **`categorize_extension(extension)` in `zpace/core.py`**: Determines the category of a file based on its extension. - **`identify_special_dir_name(dirname)` in `zpace/core.py`**: Checks if a directory is a "special" directory. -- **`calculate_dir_size_recursive(dirpath)` in `zpace/core.py`**: Recursively calculates the size of a directory. Used for "special directories" where we don't want to categorize individual files inside. +- **`calculate_dir_size(dirpath)` in `zpace/core.py`**: Iteratively calculates the size of a directory. Used for "special directories" where we don't want to categorize individual files inside. Replaces the recursive implementation to avoid hitting Python's recursion limit (`RecursionError`) on deep directory structures. - **`main()` in `zpace/main.py`**: Handles command-line argument parsing and orchestrates the scanning and printing of results. ## Configuration diff --git a/CHANGELOG.md b/CHANGELOG.md index f583503..370a18a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/). +## [0.4.1] - 2026-01-11 + +### Performance +- **Heap-Based Top-N Selection**: Switched from a full sort to `heapq.nlargest` for selecting top files, improving performance from $O(N \log N)$ to $O(N \log k)$. +- **Iterative Traversal**: Converted `calculate_dir_size` to use an iterative stack-based approach instead of recursion, preventing `RecursionError` in deeply nested directories. 
+ ## [0.4.0] - 2026-01-11 ### Refactoring diff --git a/test_unit.py b/test_unit.py index ebe42d7..7267bb2 100644 --- a/test_unit.py +++ b/test_unit.py @@ -4,7 +4,7 @@ import sys from zpace.core import ( - calculate_dir_size_recursive, + calculate_dir_size, categorize_extension, get_top_n_per_category, identify_special_dir_name, @@ -144,12 +144,12 @@ def test_terabytes(self): assert format_size(1024 * 1024 * 1024 * 1024) == "1.00 TB" -class TestCalculateDirSizeRecursive: +class TestCalculateDirSize: """Test directory size calculation.""" def test_empty_directory(self, fs): fs.create_dir("/empty") - size = calculate_dir_size_recursive("/empty") + size = calculate_dir_size("/empty") assert size == 0 def test_directory_with_files(self, fs): @@ -157,7 +157,7 @@ def test_directory_with_files(self, fs): fs.create_file("/test/file1.txt", contents="a" * 1000) fs.create_file("/test/file2.txt", contents="b" * 2000) - size = calculate_dir_size_recursive("/test") + size = calculate_dir_size("/test") # Should be at least the content size assert size >= 3000 @@ -167,11 +167,11 @@ def test_nested_directories(self, fs): fs.create_file("/test/root.txt", contents="root" * 100) fs.create_file("/test/subdir/nested.txt", contents="nested" * 100) - size = calculate_dir_size_recursive("/test") + size = calculate_dir_size("/test") assert size >= 1000 def test_nonexistent_directory(self): - size = calculate_dir_size_recursive("/nonexistent/directory/path") + size = calculate_dir_size("/nonexistent/directory/path") assert size == 0 def test_directory_with_permission_error(self, fs, monkeypatch): @@ -182,7 +182,7 @@ def mock_scandir(path): monkeypatch.setattr("os.scandir", mock_scandir) - size = calculate_dir_size_recursive("/noaccess") + size = calculate_dir_size("/noaccess") assert size == 0 diff --git a/zpace/core.py b/zpace/core.py index 9730a0d..8b625ce 100644 --- a/zpace/core.py +++ b/zpace/core.py @@ -1,4 +1,5 @@ import os +import heapq from collections import defaultdict from pathlib 
import Path from typing import Dict, List, Optional, Tuple @@ -37,28 +38,32 @@ def identify_special_dir_name(dirname: str) -> Optional[str]: return SPECIAL_DIR_MAP.get(dirname.lower()) -def calculate_dir_size_recursive(dirpath: str) -> int: +def calculate_dir_size(dirpath: str) -> int: """ - Calculate total size of directory using os.scandir recursively. + Calculate total size of directory using os.scandir iteratively. """ total_size = 0 - try: - with os.scandir(dirpath) as it: - for entry in it: - try: - if entry.is_file(follow_symlinks=False): - stat = entry.stat(follow_symlinks=False) - # st_blocks is 512-byte blocks. reliable on unix. - # fallback to st_size if not available (e.g. windows sometimes) - total_size += ( - stat.st_blocks * 512 if hasattr(stat, "st_blocks") else stat.st_size - ) - elif entry.is_dir(follow_symlinks=False): - total_size += calculate_dir_size_recursive(entry.path) - except (FileNotFoundError, PermissionError, OSError): - continue - except (FileNotFoundError, PermissionError, OSError): - pass + stack = [dirpath] + + while stack: + current_path = stack.pop() + try: + with os.scandir(current_path) as it: + for entry in it: + try: + if entry.is_file(follow_symlinks=False): + stat = entry.stat(follow_symlinks=False) + # st_blocks is 512-byte blocks. reliable on unix. + # fallback to st_size if not available (e.g. 
windows sometimes) + total_size += ( + stat.st_blocks * 512 if hasattr(stat, "st_blocks") else stat.st_size + ) + elif entry.is_dir(follow_symlinks=False): + stack.append(entry.path) + except (FileNotFoundError, PermissionError, OSError): + continue + except (FileNotFoundError, PermissionError, OSError): + continue return total_size @@ -109,7 +114,7 @@ def scan_files_and_dirs( special_type = identify_special_dir_name(dirname) if special_type: # Calculate size as atomic unit - dir_size = calculate_dir_size_recursive(entry_path) + dir_size = calculate_dir_size(entry_path) if dir_size >= min_size: # Storing string path instead of Path object @@ -166,6 +171,7 @@ def get_top_n_per_category( ) -> Dict[str, List[Tuple[int, str]]]: result = {} for category, entries in categorized.items(): - sorted_entries = sorted(entries, key=lambda x: x[0], reverse=True) - result[category] = sorted_entries[:top_n] + # Use heapq.nlargest for O(N log k) complexity instead of O(N log N) + top_entries = heapq.nlargest(top_n, entries, key=lambda x: x[0]) + result[category] = top_entries return result diff --git a/zpace/main.py b/zpace/main.py index d4426dd..e407f4e 100644 --- a/zpace/main.py +++ b/zpace/main.py @@ -12,7 +12,7 @@ ) from zpace.utils import get_disk_usage, format_size, get_trash_path from zpace.core import ( - calculate_dir_size_recursive, + calculate_dir_size, scan_files_and_dirs, get_top_n_per_category, ) @@ -130,7 +130,7 @@ def main(): try: # Verify we can actually list it (os.access might lie on some systems/containers) next(os.scandir(trash_path), None) - trash_size = calculate_dir_size_recursive(trash_path) + trash_size = calculate_dir_size(trash_path) additional_message = "" if trash_size > 1000 * 1024 * 1024: # 1000 MB additional_message = " (Consider cleanin up your trash bin!)"