Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perform exclusions before accessing file/dir #501

Merged
merged 19 commits into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 32 additions & 58 deletions b2sdk/_internal/scan/folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pathlib import Path
from typing import Iterator

from ..utils import fix_windows_path_limit, get_file_mtime, is_file_readable, validate_b2_file_name
from ..utils import fix_windows_path_limit, get_file_mtime, validate_b2_file_name
from .exception import (
EmptyDirectory,
EnvironmentEncodingError,
Expand Down Expand Up @@ -219,90 +219,64 @@ def _walk_relative_paths(
# a0.txt
#
# This is because in Unicode '.' comes before '/', which comes before '0'.
names = [] # list of (name, local_path, relative_file_path)

visited_symlinks = visited_symlinks or set()

if local_dir.is_symlink():
real_path = local_dir.resolve()
inode_number = real_path.stat().st_ino

visited_symlinks_count = len(visited_symlinks)

# Add symlink to visited_symlinks to prevent infinite symlink loops
visited_symlinks.add(inode_number)

# Check if set size has changed, if not, symlink has already been visited
if len(visited_symlinks) == visited_symlinks_count:
# Infinite symlink loop detected, report warning and skip symlink
if reporter is not None:
inode_number = local_dir.resolve().stat().st_ino
if inode_number in visited_symlinks:
if reporter:
reporter.circular_symlink_skipped(str(local_dir))
return

return # Skip if symlink already visited
visited_symlinks.add(inode_number)

for local_path in local_dir.iterdir():
for local_path in sorted(local_dir.iterdir()):
name = local_path.name
relative_file_path = join_b2_path(relative_dir_path, name)

if policies_manager.exclude_all_symlinks and local_path.is_symlink():
if reporter is not None:
reporter.symlink_skipped(str(local_path))
continue
try:
validate_b2_file_name(name)
except ValueError as e:
if reporter is not None:
reporter.invalid_name(str(local_path), str(e))
continue

# Skip broken symlinks or other inaccessible files
if not is_file_readable(str(local_path), reporter):
continue

if policies_manager.exclude_all_symlinks and local_path.is_symlink():
if reporter is not None:
reporter.symlink_skipped(str(local_path))
continue

if local_path.is_dir():
name += '/'
if policies_manager.should_exclude_local_directory(str(relative_file_path)):
continue

# remove the leading './' from the relative path to ensure backward compatibility
relative_file_path_str = str(relative_file_path)
if relative_file_path_str.startswith("./"):
relative_file_path_str = relative_file_path_str[2:]
names.append((name, local_path, relative_file_path_str))

# Yield all of the answers.
#
# Sorting the list of triples puts them in the right order because 'name',
# the sort key, is the first thing in the triple.
for (name, local_path, relative_file_path) in sorted(names):
if name.endswith('/'):
continue # Skip excluded directories
# Recurse into directories
yield from self._walk_relative_paths(
local_path,
relative_file_path,
reporter,
policies_manager,
visited_symlinks,
local_path, relative_file_path, reporter, policies_manager, visited_symlinks
)
else:
# Check that the file still exists and is accessible, since it can take a long time
# to iterate through large folders
if is_file_readable(str(local_path), reporter):
if policies_manager.should_exclude_relative_path(relative_file_path):
continue # Skip excluded files
try:
file_mod_time = get_file_mtime(str(local_path))
file_size = local_path.stat().st_size
except OSError:
if reporter is not None:
reporter.local_access_error(str(local_path))
continue

local_scan_path = LocalPath(
absolute_path=self.make_full_path(str(relative_file_path)),
relative_path=str(relative_file_path),
mod_time=file_mod_time,
size=file_size,
)
local_scan_path = LocalPath(
absolute_path=self.make_full_path(str(relative_file_path)),
relative_path=str(relative_file_path),
mod_time=file_mod_time,
size=file_size
)
if policies_manager.should_exclude_local_path(local_scan_path):
continue # Skip excluded files

if policies_manager.should_exclude_local_path(local_scan_path):
if not os.access(local_path, os.R_OK):
if reporter is not None:
reporter.local_permission_error(str(local_path))
continue

yield local_scan_path
yield local_scan_path

@classmethod
def _handle_non_unicode_file_name(cls, name):
Expand Down
6 changes: 3 additions & 3 deletions b2sdk/_internal/scan/policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def __init__(
exclude_uploaded_before, exclude_uploaded_after
)

def _should_exclude_relative_path(self, relative_path: str):
def should_exclude_relative_path(self, relative_path: str):
if self._include_file_set.matches(relative_path):
return False
return self._exclude_file_set.matches(relative_path)
Expand All @@ -197,7 +197,7 @@ def should_exclude_local_path(self, local_path: LocalPath):
"""
if local_path.mod_time not in self._include_mod_time_range:
return True
return self._should_exclude_relative_path(local_path.relative_path)
return self.should_exclude_relative_path(local_path.relative_path)

def should_exclude_b2_file_version(self, file_version: FileVersion, relative_path: str):
"""
Expand All @@ -209,7 +209,7 @@ def should_exclude_b2_file_version(self, file_version: FileVersion, relative_pat
return True
if file_version.mod_time_millis not in self._include_mod_time_range:
return True
return self._should_exclude_relative_path(relative_path)
return self.should_exclude_relative_path(relative_path)

def should_exclude_b2_directory(self, dir_path: str):
"""
Expand Down
20 changes: 0 additions & 20 deletions b2sdk/_internal/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,26 +252,6 @@ def validate_b2_file_name(name):
raise ValueError("file names segments (between '/') can be at most 250 utf-8 bytes")


def is_file_readable(local_path, reporter=None):
"""
Check if the local file has read permissions.

:param local_path: a file path
:type local_path: str
:param reporter: reporter object to put errors on
:rtype: bool
"""
if not os.path.exists(local_path):
if reporter is not None:
reporter.local_access_error(local_path)
return False
elif not os.access(local_path, os.R_OK):
if reporter is not None:
reporter.local_permission_error(local_path)
return False
return True


def get_file_mtime(local_path):
"""
Get modification time of a file in milliseconds.
Expand Down
3 changes: 3 additions & 0 deletions b2sdk/v1/sync/scan_policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ def __init__(self, scan_policies_manager: ScanPoliciesManager):
def __repr__(self):
return f"{self.__class__.__name__}({self.scan_policies_manager})"

def should_exclude_relative_path(self, relative_path: str):
self.scan_policies_manager.should_exclude_file(relative_path)

def should_exclude_local_path(self, local_path: v2.LocalSyncPath):
if self.scan_policies_manager.should_exclude_file_version(
_translate_local_path_to_file(local_path).latest_version()
Expand Down
1 change: 1 addition & 0 deletions changelog.d/456.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Move scan filters before a read on filesystem access attempt. This will prevent unnecessary warnings and IO operations on paths that are not relevant to the operation.
70 changes: 69 additions & 1 deletion test/unit/scan/test_folder_traversal.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import platform
import re
import sys
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch

import pytest

Expand Down Expand Up @@ -654,3 +654,71 @@ def test_folder_all_files__dir_excluded_by_regex(self, tmp_path):
assert absolute_paths == [
fix_windows_path_limit(str(d1_dir / "file1.txt")),
]

def test_excluded_no_access_check(self, tmp_path):
"""Test that a directory/file is not checked for access if it is excluded."""
# Create directories and files
excluded_dir = tmp_path / "excluded_dir"
excluded_dir.mkdir()
excluded_file = excluded_dir / "excluded_file.txt"
excluded_file.touch()
included_dir = tmp_path / "included_dir"
included_dir.mkdir()
(included_dir / "excluded_file.txt").touch()

# Setup exclusion regex that matches the excluded directory/file name
scan_policy = ScanPoliciesManager(
exclude_dir_regexes=[r"excluded_dir$"], exclude_file_regexes=[r'.*excluded_file.txt']
)
reporter = ProgressReport(sys.stdout, False)

# Patch os.access to monitor if it is called on the excluded file
with patch('os.access', MagicMock(return_value=True)) as mocked_access:
folder = LocalFolder(str(tmp_path))
list(folder.all_files(reporter=reporter, policies_manager=scan_policy))

# Verify os.access was not called for the excluded directory/file
mocked_access.assert_not_called()

reporter.close()

def test_excluded_without_permissions(self, tmp_path):
"""Test that a excluded directory/file without permissions is not processed and no warning is issued."""
excluded_dir = tmp_path / "excluded_dir"
excluded_dir.mkdir()
(excluded_dir / "file.txt").touch()

included_dir = tmp_path / "included_dir"
included_dir.mkdir()
(included_dir / "excluded_file.txt").touch()
(included_dir / "included_file.txt").touch()

# Modify directory permissions to simulate lack of access
(included_dir / "excluded_file.txt").chmod(0o000)
excluded_dir.chmod(0o000)

scan_policy = ScanPoliciesManager(
exclude_dir_regexes=[r"excluded_dir$"], exclude_file_regexes=[r'.*excluded_file.txt']
)
reporter = ProgressReport(sys.stdout, False)

folder = LocalFolder(str(tmp_path))
local_paths = folder.all_files(reporter=reporter, policies_manager=scan_policy)
absolute_paths = [path.absolute_path for path in local_paths]

# Restore directory permissions to clean up
(included_dir / "excluded_file.txt").chmod(0o755)
excluded_dir.chmod(0o755)

# Check that only included_dir/included_file.txt was return
assert any('included_file.txt' in path for path in absolute_paths)

# Check that no access warnings are issued for the excluded directory/file
assert not any(
re.match(
r"WARNING: .*excluded_.* could not be accessed \(no permissions to read\?\)",
warning,
) for warning in reporter.warnings
), "Access warning was issued for the excluded directory/file"

reporter.close()
10 changes: 5 additions & 5 deletions test/unit/v0/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ class TestFolder(TestSync):

NAMES = [
'.dot_file',
'hello.',
os.path.join('hello', 'a', '1'),
os.path.join('hello', 'a', '2'),
os.path.join('hello', 'b'),
'hello.',
'hello0',
os.path.join('inner', 'a.bin'),
os.path.join('inner', 'a.txt'),
Expand Down Expand Up @@ -106,10 +106,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results):
def test_exclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.txt',
'inner/b.txt',
Expand All @@ -129,10 +129,10 @@ def test_exclude_all(self):
def test_exclusions_inclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.bin',
'inner/a.txt',
Expand All @@ -151,8 +151,8 @@ def test_exclusions_inclusions(self):
def test_exclude_matches_prefix(self):
expected_list = [
'.dot_file',
'hello.',
'hello/b',
'hello.',
'hello0',
'inner/b.bin',
'inner/b.txt',
Expand Down Expand Up @@ -204,10 +204,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self):
def test_exclusion_with_exact_match(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'inner/a.bin',
'inner/a.txt',
'inner/b.bin',
Expand Down
10 changes: 5 additions & 5 deletions test/unit/v1/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ class TestFolder(TestSync):

NAMES = [
'.dot_file',
'hello.',
os.path.join('hello', 'a', '1'),
os.path.join('hello', 'a', '2'),
os.path.join('hello', 'b'),
'hello.',
'hello0',
os.path.join('inner', 'a.bin'),
os.path.join('inner', 'a.txt'),
Expand Down Expand Up @@ -109,10 +109,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results):
def test_exclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.txt',
'inner/b.txt',
Expand All @@ -132,10 +132,10 @@ def test_exclude_all(self):
def test_exclusions_inclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.bin',
'inner/a.txt',
Expand All @@ -154,8 +154,8 @@ def test_exclusions_inclusions(self):
def test_exclude_matches_prefix(self):
expected_list = [
'.dot_file',
'hello.',
'hello/b',
'hello.',
'hello0',
'inner/b.bin',
'inner/b.txt',
Expand Down Expand Up @@ -207,10 +207,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self):
def test_exclusion_with_exact_match(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'inner/a.bin',
'inner/a.txt',
'inner/b.bin',
Expand Down
Loading