Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ The tool uses an iterative, stack-based, depth-first search approach with `os.sc
- **`scan_files_and_dirs(root_path, used_bytes, min_size)` in `zpace/core.py`**: The main driver function. It uses an iterative, stack-based approach with `os.scandir` to traverse the directory tree, handles special directories, and aggregates file/directory statistics.
- **`categorize_extension(extension)` in `zpace/core.py`**: Determines the category of a file based on its extension.
- **`identify_special_dir_name(dirname)` in `zpace/core.py`**: Checks if a directory is a "special" directory.
- **`calculate_dir_size_recursive(dirpath)` in `zpace/core.py`**: Recursively calculates the size of a directory. Used for "special directories" where we don't want to categorize individual files inside.
- **`calculate_dir_size(dirpath)` in `zpace/core.py`**: Iteratively calculates the size of a directory using an explicit stack of paths and `os.scandir`. Used for "special directories" where we don't want to categorize individual files inside. Replaces the recursive `calculate_dir_size_recursive` implementation to avoid hitting Python's recursion limit (`RecursionError`) on deeply nested directory structures.
- **`main()` in `zpace/main.py`**: Handles command-line argument parsing and orchestrates the scanning and printing of results.

## Configuration
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

This project adheres to [Semantic Versioning](https://semver.org/).

## [0.4.1] - 2026-01-11

### Performance
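- **Heap-Based Top-N Selection**: Switched from a full sort to `heapq.nlargest` for selecting the top files in each category, improving the cost from $O(N \log N)$ to $O(N \log k)$, where $N$ is the number of entries in a category and $k$ is the number of top entries kept.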
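- **Iterative Traversal**: Replaced `calculate_dir_size_recursive` with `calculate_dir_size`, which uses an iterative stack-based approach instead of recursion, preventing `RecursionError` in deeply nested directories. Code importing the old name must be updated to the new one.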

## [0.4.0] - 2026-01-11

### Refactoring
Expand Down
14 changes: 7 additions & 7 deletions test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys

from zpace.core import (
calculate_dir_size_recursive,
calculate_dir_size,
categorize_extension,
get_top_n_per_category,
identify_special_dir_name,
Expand Down Expand Up @@ -144,20 +144,20 @@ def test_terabytes(self):
assert format_size(1024 * 1024 * 1024 * 1024) == "1.00 TB"


class TestCalculateDirSizeRecursive:
class TestCalculateDirSize:
"""Test directory size calculation."""

def test_empty_directory(self, fs):
fs.create_dir("/empty")
size = calculate_dir_size_recursive("/empty")
size = calculate_dir_size("/empty")
assert size == 0

def test_directory_with_files(self, fs):
fs.create_dir("/test")
fs.create_file("/test/file1.txt", contents="a" * 1000)
fs.create_file("/test/file2.txt", contents="b" * 2000)

size = calculate_dir_size_recursive("/test")
size = calculate_dir_size("/test")
# Should be at least the content size
assert size >= 3000

Expand All @@ -167,11 +167,11 @@ def test_nested_directories(self, fs):
fs.create_file("/test/root.txt", contents="root" * 100)
fs.create_file("/test/subdir/nested.txt", contents="nested" * 100)

size = calculate_dir_size_recursive("/test")
size = calculate_dir_size("/test")
assert size >= 1000

def test_nonexistent_directory(self):
size = calculate_dir_size_recursive("/nonexistent/directory/path")
size = calculate_dir_size("/nonexistent/directory/path")
assert size == 0

def test_directory_with_permission_error(self, fs, monkeypatch):
Expand All @@ -182,7 +182,7 @@ def mock_scandir(path):

monkeypatch.setattr("os.scandir", mock_scandir)

size = calculate_dir_size_recursive("/noaccess")
size = calculate_dir_size("/noaccess")
assert size == 0


Expand Down
50 changes: 28 additions & 22 deletions zpace/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import heapq
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
Expand Down Expand Up @@ -37,28 +38,32 @@ def identify_special_dir_name(dirname: str) -> Optional[str]:
return SPECIAL_DIR_MAP.get(dirname.lower())


def calculate_dir_size_recursive(dirpath: str) -> int:
def calculate_dir_size(dirpath: str) -> int:
"""
Calculate total size of directory using os.scandir recursively.
Calculate total size of directory using os.scandir iteratively.
"""
total_size = 0
try:
with os.scandir(dirpath) as it:
for entry in it:
try:
if entry.is_file(follow_symlinks=False):
stat = entry.stat(follow_symlinks=False)
# st_blocks is 512-byte blocks. reliable on unix.
# fallback to st_size if not available (e.g. windows sometimes)
total_size += (
stat.st_blocks * 512 if hasattr(stat, "st_blocks") else stat.st_size
)
elif entry.is_dir(follow_symlinks=False):
total_size += calculate_dir_size_recursive(entry.path)
except (FileNotFoundError, PermissionError, OSError):
continue
except (FileNotFoundError, PermissionError, OSError):
pass
stack = [dirpath]

while stack:
current_path = stack.pop()
try:
with os.scandir(current_path) as it:
for entry in it:
try:
if entry.is_file(follow_symlinks=False):
stat = entry.stat(follow_symlinks=False)
# st_blocks is 512-byte blocks. reliable on unix.
# fallback to st_size if not available (e.g. windows sometimes)
total_size += (
stat.st_blocks * 512 if hasattr(stat, "st_blocks") else stat.st_size
)
elif entry.is_dir(follow_symlinks=False):
stack.append(entry.path)
except (FileNotFoundError, PermissionError, OSError):
continue
except (FileNotFoundError, PermissionError, OSError):
continue

return total_size

Expand Down Expand Up @@ -109,7 +114,7 @@ def scan_files_and_dirs(
special_type = identify_special_dir_name(dirname)
if special_type:
# Calculate size as atomic unit
dir_size = calculate_dir_size_recursive(entry_path)
dir_size = calculate_dir_size(entry_path)

if dir_size >= min_size:
# Storing string path instead of Path object
Expand Down Expand Up @@ -166,6 +171,7 @@ def get_top_n_per_category(
) -> Dict[str, List[Tuple[int, str]]]:
result = {}
for category, entries in categorized.items():
sorted_entries = sorted(entries, key=lambda x: x[0], reverse=True)
result[category] = sorted_entries[:top_n]
# Use heapq.nlargest for O(N log k) complexity instead of O(N log N)
top_entries = heapq.nlargest(top_n, entries, key=lambda x: x[0])
result[category] = top_entries
return result
4 changes: 2 additions & 2 deletions zpace/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
from zpace.utils import get_disk_usage, format_size, get_trash_path
from zpace.core import (
calculate_dir_size_recursive,
calculate_dir_size,
scan_files_and_dirs,
get_top_n_per_category,
)
Expand Down Expand Up @@ -130,7 +130,7 @@ def main():
try:
# Verify we can actually list it (os.access might lie on some systems/containers)
next(os.scandir(trash_path), None)
trash_size = calculate_dir_size_recursive(trash_path)
trash_size = calculate_dir_size(trash_path)
additional_message = ""
if trash_size > 1000 * 1024 * 1024: # 1000 MB
additional_message = " (Consider cleanin up your trash bin!)"
Expand Down