diff --git a/b2sdk/_internal/scan/folder.py b/b2sdk/_internal/scan/folder.py index 653df4d2..0be2ec87 100644 --- a/b2sdk/_internal/scan/folder.py +++ b/b2sdk/_internal/scan/folder.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Iterator -from ..utils import fix_windows_path_limit, get_file_mtime, is_file_readable, validate_b2_file_name +from ..utils import fix_windows_path_limit, get_file_mtime, validate_b2_file_name from .exception import ( EmptyDirectory, EnvironmentEncodingError, @@ -219,32 +219,24 @@ def _walk_relative_paths( # a0.txt # # This is because in Unicode '.' comes before '/', which comes before '0'. - names = [] # list of (name, local_path, relative_file_path) - visited_symlinks = visited_symlinks or set() if local_dir.is_symlink(): - real_path = local_dir.resolve() - inode_number = real_path.stat().st_ino - - visited_symlinks_count = len(visited_symlinks) - - # Add symlink to visited_symlinks to prevent infinite symlink loops - visited_symlinks.add(inode_number) - - # Check if set size has changed, if not, symlink has already been visited - if len(visited_symlinks) == visited_symlinks_count: - # Infinite symlink loop detected, report warning and skip symlink - if reporter is not None: + inode_number = local_dir.resolve().stat().st_ino + if inode_number in visited_symlinks: + if reporter: reporter.circular_symlink_skipped(str(local_dir)) - return - + return # Skip if symlink already visited visited_symlinks.add(inode_number) - for local_path in local_dir.iterdir(): + for local_path in sorted(local_dir.iterdir()): name = local_path.name relative_file_path = join_b2_path(relative_dir_path, name) + if policies_manager.exclude_all_symlinks and local_path.is_symlink(): + if reporter is not None: + reporter.symlink_skipped(str(local_path)) + continue try: validate_b2_file_name(name) except ValueError as e: @@ -252,57 +244,39 @@ def _walk_relative_paths( reporter.invalid_name(str(local_path), str(e)) continue - # Skip broken symlinks or other inaccessible files - if not is_file_readable(str(local_path), reporter): - continue - - if policies_manager.exclude_all_symlinks and local_path.is_symlink(): - if reporter is not None: - reporter.symlink_skipped(str(local_path)) - continue - if local_path.is_dir(): - name += '/' if policies_manager.should_exclude_local_directory(str(relative_file_path)): - continue - - # remove the leading './' from the relative path to ensure backward compatibility - relative_file_path_str = str(relative_file_path) - if relative_file_path_str.startswith("./"): - relative_file_path_str = relative_file_path_str[2:] - names.append((name, local_path, relative_file_path_str)) - - # Yield all of the answers. - # - # Sorting the list of triples puts them in the right order because 'name', - # the sort key, is the first thing in the triple. - for (name, local_path, relative_file_path) in sorted(names): - if name.endswith('/'): + continue # Skip excluded directories + # Recurse into directories yield from self._walk_relative_paths( - local_path, - relative_file_path, - reporter, - policies_manager, - visited_symlinks, + local_path, relative_file_path, reporter, policies_manager, visited_symlinks ) else: - # Check that the file still exists and is accessible, since it can take a long time - # to iterate through large folders - if is_file_readable(str(local_path), reporter): + if policies_manager.should_exclude_relative_path(relative_file_path): + continue # Skip excluded files + try: file_mod_time = get_file_mtime(str(local_path)) file_size = local_path.stat().st_size + except OSError: + if reporter is not None: + reporter.local_access_error(str(local_path)) + continue - local_scan_path = LocalPath( - absolute_path=self.make_full_path(str(relative_file_path)), - relative_path=str(relative_file_path), - mod_time=file_mod_time, - size=file_size, - ) + local_scan_path = LocalPath( + absolute_path=self.make_full_path(str(relative_file_path)), + relative_path=str(relative_file_path), + mod_time=file_mod_time, + size=file_size + ) + if policies_manager.should_exclude_local_path(local_scan_path): + continue # Skip excluded files - if policies_manager.should_exclude_local_path(local_scan_path): + if not os.access(local_path, os.R_OK): + if reporter is not None: + reporter.local_permission_error(str(local_path)) continue - yield local_scan_path + yield local_scan_path @classmethod def _handle_non_unicode_file_name(cls, name): diff --git a/b2sdk/_internal/scan/policies.py b/b2sdk/_internal/scan/policies.py index 6befa857..93921cb5 100644 --- a/b2sdk/_internal/scan/policies.py +++ b/b2sdk/_internal/scan/policies.py @@ -184,7 +184,7 @@ def __init__( exclude_uploaded_before, exclude_uploaded_after ) - def _should_exclude_relative_path(self, relative_path: str): + def should_exclude_relative_path(self, relative_path: str): if self._include_file_set.matches(relative_path): return False return self._exclude_file_set.matches(relative_path) @@ -197,7 +197,7 @@ def should_exclude_local_path(self, local_path: LocalPath): """ if local_path.mod_time not in self._include_mod_time_range: return True - return self._should_exclude_relative_path(local_path.relative_path) + return self.should_exclude_relative_path(local_path.relative_path) def should_exclude_b2_file_version(self, file_version: FileVersion, relative_path: str): """ @@ -209,7 +209,7 @@ def should_exclude_b2_file_version(self, file_version: FileVersion, relative_pat return True if file_version.mod_time_millis not in self._include_mod_time_range: return True - return self._should_exclude_relative_path(relative_path) + return self.should_exclude_relative_path(relative_path) def should_exclude_b2_directory(self, dir_path: str): """ diff --git a/b2sdk/_internal/utils/__init__.py b/b2sdk/_internal/utils/__init__.py index a06f4efd..f4ae98ce 100644 --- a/b2sdk/_internal/utils/__init__.py +++ b/b2sdk/_internal/utils/__init__.py @@ -252,26 +252,6 @@ def validate_b2_file_name(name): raise ValueError("file names segments (between '/') can be at most 250 utf-8 bytes") -def is_file_readable(local_path, reporter=None): - """ - Check if the local file has read permissions. - - :param local_path: a file path - :type local_path: str - :param reporter: reporter object to put errors on - :rtype: bool - """ - if not os.path.exists(local_path): - if reporter is not None: - reporter.local_access_error(local_path) - return False - elif not os.access(local_path, os.R_OK): - if reporter is not None: - reporter.local_permission_error(local_path) - return False - return True - - def get_file_mtime(local_path): """ Get modification time of a file in milliseconds. diff --git a/b2sdk/v1/sync/scan_policies.py b/b2sdk/v1/sync/scan_policies.py index a287bfe0..d94c78c0 100644 --- a/b2sdk/v1/sync/scan_policies.py +++ b/b2sdk/v1/sync/scan_policies.py @@ -134,6 +134,9 @@ def __init__(self, scan_policies_manager: ScanPoliciesManager): def __repr__(self): return f"{self.__class__.__name__}({self.scan_policies_manager})" + def should_exclude_relative_path(self, relative_path: str): + self.scan_policies_manager.should_exclude_file(relative_path) + def should_exclude_local_path(self, local_path: v2.LocalSyncPath): if self.scan_policies_manager.should_exclude_file_version( _translate_local_path_to_file(local_path).latest_version() diff --git a/changelog.d/456.fixed.md b/changelog.d/456.fixed.md new file mode 100644 index 00000000..a88e1010 --- /dev/null +++ b/changelog.d/456.fixed.md @@ -0,0 +1 @@ +Move scan filters before a read on filesystem access attempt. This will prevent unnecessary warnings and IO operations on paths that are not relevant to the operation. \ No newline at end of file diff --git a/test/unit/scan/test_folder_traversal.py b/test/unit/scan/test_folder_traversal.py index bd52a8d4..d791dcd0 100644 --- a/test/unit/scan/test_folder_traversal.py +++ b/test/unit/scan/test_folder_traversal.py @@ -14,7 +14,7 @@ import platform import re import sys -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -654,3 +654,71 @@ def test_folder_all_files__dir_excluded_by_regex(self, tmp_path): assert absolute_paths == [ fix_windows_path_limit(str(d1_dir / "file1.txt")), ] + + def test_excluded_no_access_check(self, tmp_path): + """Test that a directory/file is not checked for access if it is excluded.""" + # Create directories and files + excluded_dir = tmp_path / "excluded_dir" + excluded_dir.mkdir() + excluded_file = excluded_dir / "excluded_file.txt" + excluded_file.touch() + included_dir = tmp_path / "included_dir" + included_dir.mkdir() + (included_dir / "excluded_file.txt").touch() + + # Setup exclusion regex that matches the excluded directory/file name + scan_policy = ScanPoliciesManager( + exclude_dir_regexes=[r"excluded_dir$"], exclude_file_regexes=[r'.*excluded_file.txt'] + ) + reporter = ProgressReport(sys.stdout, False) + + # Patch os.access to monitor if it is called on the excluded file + with patch('os.access', MagicMock(return_value=True)) as mocked_access: + folder = LocalFolder(str(tmp_path)) + list(folder.all_files(reporter=reporter, policies_manager=scan_policy)) + + # Verify os.access was not called for the excluded directory/file + mocked_access.assert_not_called() + + reporter.close() + + def test_excluded_without_permissions(self, tmp_path): + """Test that a excluded directory/file without permissions is not processed and no warning is issued.""" + excluded_dir = tmp_path / "excluded_dir" + excluded_dir.mkdir() + (excluded_dir / "file.txt").touch() + + included_dir = tmp_path / "included_dir" + included_dir.mkdir() + (included_dir / "excluded_file.txt").touch() + (included_dir / "included_file.txt").touch() + + # Modify directory permissions to simulate lack of access + (included_dir / "excluded_file.txt").chmod(0o000) + excluded_dir.chmod(0o000) + + scan_policy = ScanPoliciesManager( + exclude_dir_regexes=[r"excluded_dir$"], exclude_file_regexes=[r'.*excluded_file.txt'] + ) + reporter = ProgressReport(sys.stdout, False) + + folder = LocalFolder(str(tmp_path)) + local_paths = folder.all_files(reporter=reporter, policies_manager=scan_policy) + absolute_paths = [path.absolute_path for path in local_paths] + + # Restore directory permissions to clean up + (included_dir / "excluded_file.txt").chmod(0o755) + excluded_dir.chmod(0o755) + + # Check that only included_dir/included_file.txt was return + assert any('included_file.txt' in path for path in absolute_paths) + + # Check that no access warnings are issued for the excluded directory/file + assert not any( + re.match( + r"WARNING: .*excluded_.* could not be accessed \(no permissions to read\?\)", + warning, + ) for warning in reporter.warnings + ), "Access warning was issued for the excluded directory/file" + + reporter.close() diff --git a/test/unit/v0/test_sync.py b/test/unit/v0/test_sync.py index 273fccdf..0cc7009f 100644 --- a/test/unit/v0/test_sync.py +++ b/test/unit/v0/test_sync.py @@ -66,10 +66,10 @@ class TestFolder(TestSync): NAMES = [ '.dot_file', - 'hello.', os.path.join('hello', 'a', '1'), os.path.join('hello', 'a', '2'), os.path.join('hello', 'b'), + 'hello.', 'hello0', os.path.join('inner', 'a.bin'), os.path.join('inner', 'a.txt'), @@ -106,10 +106,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results): def test_exclusions(self): expected_list = [ '.dot_file', - 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', + 'hello.', 'hello0', 'inner/a.txt', 'inner/b.txt', @@ -129,10 +129,10 @@ def test_exclude_all(self): def test_exclusions_inclusions(self): expected_list = [ '.dot_file', - 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', + 'hello.', 'hello0', 'inner/a.bin', 'inner/a.txt', @@ -151,8 +151,8 @@ def test_exclusions_inclusions(self): def test_exclude_matches_prefix(self): expected_list = [ '.dot_file', - 'hello.', 'hello/b', + 'hello.', 'hello0', 'inner/b.bin', 'inner/b.txt', @@ -204,10 +204,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self): def test_exclusion_with_exact_match(self): expected_list = [ '.dot_file', - 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', + 'hello.', 'inner/a.bin', 'inner/a.txt', 'inner/b.bin', diff --git a/test/unit/v1/test_sync.py b/test/unit/v1/test_sync.py index 444f6b3f..9192de0f 100644 --- a/test/unit/v1/test_sync.py +++ b/test/unit/v1/test_sync.py @@ -69,10 +69,10 @@ class TestFolder(TestSync): NAMES = [ '.dot_file', - 'hello.', os.path.join('hello', 'a', '1'), os.path.join('hello', 'a', '2'), os.path.join('hello', 'b'), + 'hello.', 'hello0', os.path.join('inner', 'a.bin'), os.path.join('inner', 'a.txt'), @@ -109,10 +109,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results): def test_exclusions(self): expected_list = [ '.dot_file', - 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', + 'hello.', 'hello0', 'inner/a.txt', 'inner/b.txt', @@ -132,10 +132,10 @@ def test_exclude_all(self): def test_exclusions_inclusions(self): expected_list = [ '.dot_file', - 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', + 'hello.', 'hello0', 'inner/a.bin', 'inner/a.txt', @@ -154,8 +154,8 @@ def test_exclusions_inclusions(self): def test_exclude_matches_prefix(self): expected_list = [ '.dot_file', - 'hello.', 'hello/b', + 'hello.', 'hello0', 'inner/b.bin', 'inner/b.txt', @@ -207,10 +207,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self): def test_exclusion_with_exact_match(self): expected_list = [ '.dot_file', - 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', + 'hello.', 'inner/a.bin', 'inner/a.txt', 'inner/b.bin',