6 changes: 4 additions & 2 deletions ARCHITECTURE.md
@@ -20,15 +20,17 @@ Certain directories are treated as single units rather than traversing their contents:

### 3. Scanning Algorithm
The tool uses an iterative, stack-based, depth-first search with `os.scandir`. This outperforms the previous `os.walk` implementation by avoiding `os.walk`'s per-directory bookkeeping and the creation of `pathlib.Path` objects in performance-critical sections.
- **Optimization**: System directories (e.g., `/proc`, `/sys`, `/System`) are skipped to improve performance and avoid permission errors.
- **Optimization**: System directories (e.g., `/proc`, `/sys`, `/System`) are skipped to improve performance and avoid permission errors. The `DEEPEST_SKIP_LEVEL` optimization avoids unnecessary lookups when scanning deep paths where system directories cannot exist.
- **Streaming Top-N**: Instead of collecting all files and then selecting the largest, the scanner maintains a fixed-size min-heap per category during traversal. This reduces memory from `O(files)` to `O(categories × top_n)` and avoids building large intermediate lists.
- **Progress Tracking**: A `tqdm` progress bar shows real-time scanning progress based on bytes processed.
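The streaming top-N idea described above can be sketched in a few lines. This is a minimal, self-contained illustration (the file paths and sizes are made up; the min-heap rule mirrors the `push_top_n` helper described below):

```python
import heapq
from collections import defaultdict

def push_top_n(heap, item, n):
    # The heap root is the smallest item kept so far; once the heap is
    # full, only an item larger than the root displaces it.
    if len(heap) < n:
        heapq.heappush(heap, item)
    elif item[0] > heap[0][0]:
        heapq.heapreplace(heap, item)

heaps = defaultdict(list)
scan_results = [  # (size_bytes, path, category) -- illustrative data
    (500, "/a.mov", "Videos"),
    (100, "/b.mov", "Videos"),
    (900, "/c.mov", "Videos"),
    (300, "/d.mov", "Videos"),
    (700, "/e.pdf", "Documents"),
]
for size, path, category in scan_results:
    push_top_n(heaps[category], (size, path), 3)

# One final sort per category yields results in descending size order.
top = {cat: sorted(heap, reverse=True) for cat, heap in heaps.items()}
print(top["Videos"])  # [(900, '/c.mov'), (500, '/a.mov'), (300, '/d.mov')]
```

Note that each heap holds at most `top_n` entries regardless of how many files the scan visits, which is where the `O(categories × top_n)` memory bound comes from.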

## Key Functions

- **`scan_files_and_dirs(root_path, used_bytes, min_size)` in `zpace/core.py`**: The main driver function. It uses an iterative, stack-based approach with `os.scandir` to traverse the directory tree, handles special directories, and aggregates file/directory statistics.
- **`scan_files_and_dirs(root_path, used_bytes, min_size, top_n)` in `zpace/core.py`**: The main driver function. It uses an iterative, stack-based approach with `os.scandir` to traverse the directory tree, handles special directories, and maintains min-heaps to track only the top N largest items per category during traversal.
- **`categorize_extension(extension)` in `zpace/core.py`**: Determines the category of a file based on its extension.
- **`identify_special_dir_name(dirname)` in `zpace/core.py`**: Checks if a directory is a "special" directory.
- **`calculate_dir_size(dirpath)` in `zpace/core.py`**: Iteratively calculates the size of a directory. Used for "special directories" where we don't want to categorize individual files inside. Replaces the recursive implementation to avoid stack overflow on deep directory structures.
- **`push_top_n(heap, item, n)` in `zpace/core.py`**: Maintains a min-heap of size `n` with the largest items. Used during scanning to keep only the top N files/directories per category without storing all matches.
- **`main()` in `zpace/main.py`**: Handles command-line argument parsing and orchestrates the scanning and printing of results.
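The iterative, stack-based traversal behind `calculate_dir_size` can be sketched as follows. This is an illustrative reimplementation under the stated constraints (no recursion, unreadable subtrees skipped), not the project's exact code:

```python
import os

def calculate_dir_size(dirpath):
    """Sum file sizes under dirpath iteratively, so deep directory
    trees cannot overflow the Python call stack."""
    total = 0
    stack = [dirpath]
    while stack:
        current = stack.pop()
        try:
            with os.scandir(current) as entries:
                for entry in entries:
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(entry.path)
                    elif entry.is_file(follow_symlinks=False):
                        total += entry.stat(follow_symlinks=False).st_size
        except OSError:
            continue  # Unreadable subtrees are skipped, not fatal
    return total
```

Catching `OSError` covers `PermissionError` as well, so a single unreadable directory does not abort the whole size calculation.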

## Configuration
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.

This project adheres to [Semantic Versioning](https://semver.org/).

## [0.4.2] - 2026-01-17

### Performance
- **Streaming Top-N**: Replaced post-scan `heapq.nlargest` with in-scan min-heap filtering. Memory usage is now `O(categories × top_n)` instead of `O(files_over_min_size)`, and large file lists are no longer built.

### Code Quality
- Removed unused `get_top_n_per_category` function (top-N logic now integrated into scan).
- Added clarifying comment for `DEEPEST_SKIP_LEVEL` optimization.
- Added tests for `push_top_n` heap helper and top-N integration behavior.

## [0.4.1] - 2026-01-11

### Performance
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "zpace"
version = "0.4.1"
version = "0.4.2"
description = "A CLI tool to discover what's consuming your disk space"
readme = "README.md"
requires-python = ">=3.9"
124 changes: 93 additions & 31 deletions test_unit.py
@@ -6,8 +6,8 @@
from zpace.core import (
calculate_dir_size,
categorize_extension,
get_top_n_per_category,
identify_special_dir_name,
push_top_n,
scan_files_and_dirs,
is_skip_path,
)
@@ -186,36 +186,48 @@ def mock_scandir(path):
assert size == 0


class TestGetTopNPerCategory:
"""Test top N selection by category."""

def test_empty_categories(self):
result = get_top_n_per_category({})
assert result == {}

def test_single_category(self):
test_data = {
"Documents": [
(1000, "small.doc"),
(5000, "medium.doc"),
(3000, "middle.doc"),
]
}
result = get_top_n_per_category(test_data, top_n=2)
assert len(result["Documents"]) == 2
assert result["Documents"][0][0] == 5000 # Largest first
assert result["Documents"][1][0] == 3000

def test_multiple_categories(self):
test_data = {
"Documents": [(1000, "doc1.pdf"), (2000, "doc2.pdf")],
"Pictures": [(3000, "img1.jpg"), (4000, "img2.jpg")],
}
result = get_top_n_per_category(test_data, top_n=1)
assert len(result["Documents"]) == 1
assert len(result["Pictures"]) == 1
assert result["Documents"][0][0] == 2000
assert result["Pictures"][0][0] == 4000
class TestPushTopN:
"""Test the min-heap top-N helper function."""

def test_heap_not_full_adds_item(self):
heap = []
push_top_n(heap, (100, "/a.txt"), 3)
push_top_n(heap, (200, "/b.txt"), 3)
assert len(heap) == 2
assert (100, "/a.txt") in heap
assert (200, "/b.txt") in heap

def test_heap_full_rejects_smaller(self):
heap = [(100, "/a.txt"), (200, "/b.txt"), (300, "/c.txt")]
push_top_n(heap, (50, "/small.txt"), 3)
assert len(heap) == 3
assert (50, "/small.txt") not in heap

def test_heap_full_accepts_larger(self):
heap = []
push_top_n(heap, (100, "/a.txt"), 3)
push_top_n(heap, (200, "/b.txt"), 3)
push_top_n(heap, (300, "/c.txt"), 3)
push_top_n(heap, (500, "/large.txt"), 3)
assert len(heap) == 3
assert (100, "/a.txt") not in heap
assert (500, "/large.txt") in heap

def test_heap_maintains_top_n_largest(self):
heap = []
sizes = [50, 300, 100, 500, 200, 400, 150]
for i, size in enumerate(sizes):
push_top_n(heap, (size, f"/{i}.txt"), 3)
sorted_heap = sorted(heap, reverse=True)
assert [s for s, _ in sorted_heap] == [500, 400, 300]

def test_heap_size_one(self):
heap = []
push_top_n(heap, (100, "/a.txt"), 1)
push_top_n(heap, (200, "/b.txt"), 1)
push_top_n(heap, (50, "/c.txt"), 1)
assert len(heap) == 1
assert heap[0] == (200, "/b.txt")


class TestScanFilesAndDirs:
@@ -427,6 +439,56 @@ def test_mixed_file_types_and_sizes(self, mock_tqdm, fs):
# At least some files should be categorized
assert len(file_cats) > 0

@patch("zpace.core.tqdm")
def test_top_n_limits_results_per_category(self, mock_tqdm, fs):
"""Test that top_n limits results and returns largest items sorted descending."""
mock_pbar = MagicMock()
mock_tqdm.return_value.__enter__.return_value = mock_pbar

# Create 5 documents with varying sizes
fs.create_file("/test/doc1.pdf", contents="x" * (MIN_FILE_SIZE + 1000))
fs.create_file("/test/doc2.pdf", contents="x" * (MIN_FILE_SIZE + 5000))
fs.create_file("/test/doc3.pdf", contents="x" * (MIN_FILE_SIZE + 3000))
fs.create_file("/test/doc4.pdf", contents="x" * (MIN_FILE_SIZE + 4000))
fs.create_file("/test/doc5.pdf", contents="x" * (MIN_FILE_SIZE + 2000))

file_cats, dir_cats, file_count, total_size = scan_files_and_dirs(
Path("/test"), used_bytes=100000000, min_size=MIN_FILE_SIZE, top_n=2
)

# Should only have 2 documents (top_n=2)
assert len(file_cats["Documents"]) == 2
# Should be sorted descending (largest first)
sizes = [size for size, _ in file_cats["Documents"]]
assert sizes[0] > sizes[1]
# Should contain the two largest
assert sizes[0] >= MIN_FILE_SIZE + 4000
assert sizes[1] >= MIN_FILE_SIZE + 4000

@patch("zpace.core.tqdm")
def test_top_n_multiple_categories(self, mock_tqdm, fs):
"""Test that top_n applies independently to each category."""
mock_pbar = MagicMock()
mock_tqdm.return_value.__enter__.return_value = mock_pbar

# Create files in multiple categories
fs.create_file("/test/doc1.pdf", contents="x" * (MIN_FILE_SIZE + 1000))
fs.create_file("/test/doc2.pdf", contents="x" * (MIN_FILE_SIZE + 2000))
fs.create_file("/test/img1.jpg", contents="x" * (MIN_FILE_SIZE + 3000))
fs.create_file("/test/img2.jpg", contents="x" * (MIN_FILE_SIZE + 4000))
fs.create_file("/test/img3.jpg", contents="x" * (MIN_FILE_SIZE + 5000))

file_cats, dir_cats, file_count, total_size = scan_files_and_dirs(
Path("/test"), used_bytes=100000000, min_size=MIN_FILE_SIZE, top_n=1
)

# Each category should have only 1 item (top_n=1)
assert len(file_cats["Documents"]) == 1
assert len(file_cats["Pictures"]) == 1
# Each should be the largest in its category
assert file_cats["Documents"][0][0] >= MIN_FILE_SIZE + 2000
assert file_cats["Pictures"][0][0] >= MIN_FILE_SIZE + 5000

@pytest.mark.skipif(sys.platform == "win32", reason="Test specific to Unix-like systems")
@patch("zpace.core.tqdm")
@patch("zpace.core.is_skip_path")
2 changes: 1 addition & 1 deletion uv.lock


3 changes: 3 additions & 0 deletions zpace/config.py
@@ -23,6 +23,9 @@
"/.fseventsd",
}

# SKIP_DIRS contains only root-level system paths (e.g., /dev, /proc, /System).
# We only check against SKIP_DIRS when level <= DEEPEST_SKIP_LEVEL as an optimization:
# deeper scans (e.g., /home/user/project) can never encounter these paths.
DEEPEST_SKIP_LEVEL = 3
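The gist of this optimization can be sketched as follows. Here `should_skip` is a hypothetical helper and `SKIP_DIRS` an illustrative subset; in the real code the membership test is `is_skip_path` in `zpace/core.py` and the level gate sits in the scan loop:

```python
# Illustrative subset of the real SKIP_DIRS in zpace/config.py.
SKIP_DIRS = {"/dev", "/proc", "/sys", "/System"}
DEEPEST_SKIP_LEVEL = 3

def should_skip(dirpath: str, level: int) -> bool:
    # Consult SKIP_DIRS only near the root: past DEEPEST_SKIP_LEVEL,
    # the set lookup is short-circuited away entirely.
    return level <= DEEPEST_SKIP_LEVEL and dirpath in SKIP_DIRS

print(should_skip("/proc", 1))               # True: root-level system path
print(should_skip("/home/user/project", 2))  # False: not a system path
```

Because `level <= DEEPEST_SKIP_LEVEL` is evaluated first, deep scans pay nothing for the skip check beyond one integer comparison per directory.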

CATEGORIES = {
40 changes: 23 additions & 17 deletions zpace/core.py
@@ -22,6 +22,14 @@ def categorize_extension(extension: str) -> str:
return EXTENSION_MAP.get(extension.lower(), "Others")


def push_top_n(heap: List[Tuple[int, str]], item: Tuple[int, str], n: int) -> None:
"""Maintain a min-heap of size n with the largest items."""
if len(heap) < n:
heapq.heappush(heap, item)
elif item[0] > heap[0][0]:
heapq.heapreplace(heap, item)


def is_skip_path(dirpath: str) -> bool:
"""Check if directory path should be skipped (system directories)."""
return dirpath in SKIP_DIRS
@@ -69,14 +77,18 @@ def calculate_dir_size(dirpath: str) -> int:


def scan_files_and_dirs(
root_path: Path, used_bytes: int, min_size: int = MIN_FILE_SIZE
root_path: Path,
used_bytes: int,
min_size: int = MIN_FILE_SIZE,
top_n: int = DEFAULT_TOP_N,
) -> Tuple[Dict[str, List[Tuple[int, str]]], Dict[str, List[Tuple[int, str]]], int, int]:
"""
Scan directory tree for files and special directories using an iterative stack with os.scandir.
Uses min-heaps to keep only top_n largest items per category, reducing memory from O(files) to O(categories * top_n).
Returns: (file_categories, dir_categories, total_files, total_size)
"""
file_categories = defaultdict(list)
dir_categories = defaultdict(list)
file_heaps: Dict[str, List[Tuple[int, str]]] = defaultdict(list)
dir_heaps: Dict[str, List[Tuple[int, str]]] = defaultdict(list)
scanned_files = 0
scanned_size = 0
progress_update_buffer = 0
@@ -117,8 +129,9 @@
dir_size = calculate_dir_size(entry_path)

if dir_size >= min_size:
# Storing string path instead of Path object
dir_categories[special_type].append((dir_size, entry_path))
push_top_n(
dir_heaps[special_type], (dir_size, entry_path), top_n
)

scanned_size += dir_size
progress_update_buffer += dir_size
@@ -139,7 +152,7 @@
if size >= min_size:
_, ext = os.path.splitext(entry.name)
category = categorize_extension(ext)
file_categories[category].append((size, entry.path))
push_top_n(file_heaps[category], (size, entry.path), top_n)

scanned_files += 1
scanned_size += size
@@ -163,15 +176,8 @@
if progress_update_buffer > 0:
pbar.update(progress_update_buffer)

return dict(file_categories), dict(dir_categories), scanned_files, scanned_size

# Convert heaps to sorted lists (descending by size)
file_categories = {cat: sorted(heap, reverse=True) for cat, heap in file_heaps.items()}
dir_categories = {cat: sorted(heap, reverse=True) for cat, heap in dir_heaps.items()}

def get_top_n_per_category(
categorized: Dict[str, List[Tuple[int, str]]], top_n: int = DEFAULT_TOP_N
) -> Dict[str, List[Tuple[int, str]]]:
result = {}
for category, entries in categorized.items():
# Use heapq.nlargest for O(N log k) complexity instead of O(N log N)
top_entries = heapq.nlargest(top_n, entries, key=lambda x: x[0])
result[category] = top_entries
return result
return file_categories, dir_categories, scanned_files, scanned_size
14 changes: 4 additions & 10 deletions zpace/main.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@
from zpace.core import (
calculate_dir_size,
scan_files_and_dirs,
get_top_n_per_category,
)


@@ -133,7 +132,7 @@ def main():
trash_size = calculate_dir_size(trash_path)
additional_message = ""
if trash_size > 1000 * 1024 * 1024: # 1000 MB
additional_message = " (Consider cleanin up your trash bin!)"
additional_message = " (Consider cleaning up your trash bin!)"
print(f" Trash: {format_size(trash_size)}{additional_message}")
except PermissionError:
print(" Trash: Access Denied")
@@ -157,10 +156,9 @@
print(f"If you wish to analyse the symlinked directory, please pass its path: {resolved}")
return

# Scan files and directories
try:
file_cats, dir_cats, total_files, total_size = scan_files_and_dirs(
scan_path, used, args.min_size * 1024
top_files, top_dirs, total_files, total_size = scan_files_and_dirs(
scan_path, used, args.min_size * 1024, top_n=args.top
)
except KeyboardInterrupt:
print("\nScan interrupted by user")
@@ -169,14 +167,10 @@
print(f"Error during scan: {e}")
sys.exit(1)

# Get top N for each category
top_files = get_top_n_per_category(file_cats, top_n=args.top)
top_dirs = get_top_n_per_category(dir_cats, top_n=args.top)

# Display results
print("\nSCAN COMPLETE!")
print(f" Found {total_files:,} files")
print(f" Found {sum(len(e) for e in dir_cats.values())} special directories")
print(f" Found {sum(len(e) for e in top_dirs.values())} special directories")
print(f" Total size: {format_size(total_size)}")

print_results(top_files, top_dirs, terminal_width)
Expand Down