Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filters to Bucket.ls #468

Merged
merged 9 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions b2sdk/_v3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,6 @@
)
from b2sdk.session import B2Session
from b2sdk.utils.thread_pool import ThreadPoolMixin

# filter
from b2sdk.filter import FilterType, Filter
9 changes: 9 additions & 0 deletions b2sdk/bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import logging
import pathlib
from contextlib import suppress
from typing import Sequence

from .encryption.setting import EncryptionSetting, EncryptionSettingFactory
from .encryption.types import EncryptionMode
Expand All @@ -33,6 +34,7 @@
LegalHold,
)
from .file_version import DownloadVersion, FileVersion
from .filter import Filter, FilterMatcher
from .http_constants import LIST_FILE_NAMES_MAX_LIMIT
from .progress import AbstractProgressListener, DoNothingProgressListener
from .raw_api import LifecycleRule
Expand Down Expand Up @@ -369,6 +371,7 @@ def ls(
recursive: bool = False,
fetch_count: int | None = LIST_FILE_NAMES_MAX_LIMIT,
with_wildcard: bool = False,
filters: Sequence[Filter] = (),
):
"""
Pretend that folders exist and yields the information about the files in a folder.
Expand All @@ -390,6 +393,7 @@ def ls(
:param with_wildcard: Accepts "*", "?", "[]" and "[!]" in folder_to_list, similarly to what shell does.
As of 1.19.0 it can only be enabled when recursive is also enabled.
Also, in this mode, folder_to_list is considered to be a filename or a pattern.
:param filters: list of filters to apply to the files returned by the server.
:rtype: generator[tuple[b2sdk.v2.FileVersion, str]]
:returns: generator of (file_version, folder_name) tuples

Expand Down Expand Up @@ -445,6 +449,7 @@ def ls(
# "folder". If the first search doesn't produce enough results,
# then we keep calling list_file_names until we get all of the
# names in this "folder".
filter_matcher = FilterMatcher(filters)
current_dir = None
start_file_name = prefix
start_file_id = None
Expand All @@ -466,6 +471,10 @@ def ls(
):
# File doesn't match our wildcard rules
continue

if not filter_matcher.match(file_version.file_name):
continue

after_prefix = file_version.file_name[len(prefix):]
# In case of wildcards, we don't care about folders at all, and it's recursive by default.
if '/' not in after_prefix or recursive:
Expand Down
67 changes: 67 additions & 0 deletions b2sdk/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
######################################################################
#
# File: b2sdk/filter.py
#
# Copyright 2024 Backblaze Inc. All Rights Reserved.
#
# License https://www.backblaze.com/using_b2_code.html
#
######################################################################
from __future__ import annotations

import fnmatch
from dataclasses import dataclass
from enum import Enum
from typing import Sequence


class FilterType(Enum):
    """
    Kind of decision a filter makes for the names its pattern matches.
    """

    # A matching name is kept.
    INCLUDE = "include"
    # A matching name is dropped.
    EXCLUDE = "exclude"


@dataclass
class Filter:
    """
    A single filtering rule: a glob-style pattern paired with the decision
    (include or exclude) that applies to names matching it.

    Use :meth:`include` and :meth:`exclude` as shorthand constructors.
    """

    # Decision applied to names that match ``pattern``.
    type: FilterType
    # Glob-style pattern the name is tested against.
    pattern: str

    @classmethod
    def include(cls, pattern: str) -> Filter:
        """
        Build a filter that includes names matching ``pattern``.

        :param pattern: glob-style pattern
        :returns: a new filter of type ``FilterType.INCLUDE``
        """
        return cls(pattern=pattern, type=FilterType.INCLUDE)

    @classmethod
    def exclude(cls, pattern: str) -> Filter:
        """
        Build a filter that excludes names matching ``pattern``.

        :param pattern: glob-style pattern
        :returns: a new filter of type ``FilterType.EXCLUDE``
        """
        return cls(pattern=pattern, type=FilterType.EXCLUDE)


class FilterMatcher:
    """
    Holds a list of filters and matches a string (i.e. file name) against them.

    The order of filters matters. The *last* matching filter decides whether
    the string is included or excluded. If no filter matches, the string is
    included by default.

    If the given filters are all INCLUDE filters, then it is assumed that all
    files are excluded by default. In this case, an additional EXCLUDE filter
    matching everything (``"*"``) is prepended to the list.

    Patterns are evaluated with :func:`fnmatch.fnmatchcase`, so matching is
    case-sensitive and ``*`` is not stopped by ``/``.

    :param filters: filters to apply, in order. The input is copied into a new
                    list, so empty or single-pass iterables behave like the
                    equivalent list and later changes to the caller's
                    collection do not affect the matcher.
    """

    def __init__(self, filters: Sequence[Filter]):
        # Copy first: the checks below need a reliable emptiness test and a
        # second pass, neither of which a one-shot iterable can provide.
        filters = list(filters)

        if filters and all(filter_.type == FilterType.INCLUDE for filter_ in filters):
            # Only INCLUDE rules were given, so everything else is excluded.
            filters.insert(0, Filter(type=FilterType.EXCLUDE, pattern="*"))

        self.filters = filters

    def match(self, s: str) -> bool:
        """
        Decide whether ``s`` passes the configured filters.

        :param s: string (i.e. file name) to test
        :returns: ``True`` if the last filter whose pattern matches ``s`` is an
                  INCLUDE filter, or if no filter matches; ``False`` if that
                  filter is an EXCLUDE filter
        """
        # The last matching filter wins, so walk backwards and stop at the
        # first decisive match instead of evaluating every pattern.
        for filter_ in reversed(self.filters):
            if not fnmatch.fnmatchcase(s, filter_.pattern):
                continue
            if filter_.type == FilterType.INCLUDE:
                return True
            if filter_.type == FilterType.EXCLUDE:
                return False

        return True
1 change: 1 addition & 0 deletions changelog.d/+bucket_ls_filters.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for filters to `Bucket.ls()`.
241 changes: 241 additions & 0 deletions test/unit/bucket/test_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
FakeResponse,
FileRetentionSetting,
FileSimulator,
Filter,
InMemoryCache,
LargeFileUploadState,
LegalHold,
Expand Down Expand Up @@ -788,6 +789,246 @@ def test_matching_exact_filename(self):
]
self.assertEqual(expected, actual)

def test_filters_wildcard_matching(self):
    """A single ``*.txt`` include filter keeps only the text files listed under ``b/``."""
    payload = b'hello world'
    for name in (
        'a',
        'b/1/test-1.txt',
        'b/2/test-2.csv',
        'b/2/test-3.txt',
        'b/3/test-4.jpg',
        'b/3/test-4.txt',
        'b/3/test-5.txt',
    ):
        self.bucket.upload_bytes(payload, name)

    listing = self.bucket_ls(
        'b/',
        recursive=True,
        filters=[Filter.include("*.txt")],
    )
    actual = [(fv.file_name, fv.size, fv.action, folder) for fv, folder in listing]

    expected = [
        (name, len(payload), 'upload', None) for name in (
            'b/1/test-1.txt',
            'b/2/test-3.txt',
            'b/3/test-4.txt',
            'b/3/test-5.txt',
        )
    ]
    self.assertEqual(expected, actual)

def test_filters_wildcard_matching_including_root(self):
    """``*.txt`` include/exclude filters also apply to files at the bucket root."""
    payload = b'hello world'

    def rows(*names):
        # Expected listing entries for plain uploads outside any folder grouping.
        return [(name, len(payload), 'upload', None) for name in names]

    def listed(filters):
        # Recursive listing of the whole bucket with the given filters applied.
        return [
            (fv.file_name, fv.size, fv.action, folder)
            for fv, folder in self.bucket_ls(recursive=True, filters=filters)
        ]

    for name in (
        'b/1/test.csv',
        'b/1/test.txt',
        'b/2/test.tsv',
        'b/2/test.txt',
        'b/3/test.txt',
        'test.txt',
        'test.csv',
    ):
        self.bucket.upload_bytes(payload, name)

    self.assertEqual(
        rows('b/1/test.txt', 'b/2/test.txt', 'b/3/test.txt', 'test.txt'),
        listed([Filter.include('*.txt')]),
    )

    self.assertEqual(
        rows('b/1/test.csv', 'b/2/test.tsv', 'test.csv'),
        listed([Filter.exclude('*.txt')]),
    )

def test_filters_single_character_matching(self):
    """The ``?`` wildcard in a filter pattern stands for exactly one character."""
    payload = b'hello world'

    def rows(*names):
        # Expected listing entries for plain uploads.
        return [(name, len(payload), 'upload', None) for name in names]

    def listed(filters):
        # Recursive listing of the whole bucket with the given filters applied.
        return [
            (fv.file_name, fv.size, fv.action, folder)
            for fv, folder in self.bucket_ls(recursive=True, filters=filters)
        ]

    for name in ('a', 'b/2/test.csv', 'b/2/test.txt', 'b/2/test.tsv'):
        self.bucket.upload_bytes(payload, name)

    self.assertEqual(
        rows('b/2/test.csv', 'b/2/test.tsv'),
        listed([Filter.include('b/2/test.?sv')]),
    )

    self.assertEqual(
        rows('a', 'b/2/test.txt'),
        listed([Filter.exclude('b/2/test.?sv')]),
    )

def test_filters_sequence_matching(self):
    """A ``[tc]`` character set in a filter pattern matches any one of the listed characters."""
    payload = b'hello world'

    def rows(*names):
        # Expected listing entries for plain uploads.
        return [(name, len(payload), 'upload', None) for name in names]

    def listed(filters):
        # Recursive listing of the whole bucket with the given filters applied.
        return [
            (fv.file_name, fv.size, fv.action, folder)
            for fv, folder in self.bucket_ls(recursive=True, filters=filters)
        ]

    for name in ('a', 'b/2/test.csv', 'b/2/test.ksv', 'b/2/test.tsv'):
        self.bucket.upload_bytes(payload, name)

    self.assertEqual(
        rows('b/2/test.csv', 'b/2/test.tsv'),
        listed([Filter.include('b/2/test.[tc]sv')]),
    )

    self.assertEqual(
        rows('a', 'b/2/test.ksv'),
        listed([Filter.exclude('b/2/test.[tc]sv')]),
    )

def test_filters_negative_sequence_matching(self):
    """A ``[!ck]`` negated set in a filter pattern matches any character not listed."""
    payload = b'hello world'

    def rows(*names):
        # Expected listing entries for plain uploads.
        return [(name, len(payload), 'upload', None) for name in names]

    def listed(filters):
        # Recursive listing of the whole bucket with the given filters applied.
        return [
            (fv.file_name, fv.size, fv.action, folder)
            for fv, folder in self.bucket_ls(recursive=True, filters=filters)
        ]

    for name in ('a', 'b/2/test.csv', 'b/2/test.ksv', 'b/2/test.tsv'):
        self.bucket.upload_bytes(payload, name)

    self.assertEqual(
        rows('b/2/test.tsv'),
        listed([Filter.include('b/2/test.[!ck]sv')]),
    )

    self.assertEqual(
        rows('a', 'b/2/test.csv', 'b/2/test.ksv'),
        listed([Filter.exclude('b/2/test.[!ck]sv')]),
    )

def test_filters_matching_exact_filename(self):
    """A pattern without wildcards selects (or rejects) exactly the named file."""
    payload = b'hello world'

    def rows(*names):
        # Expected listing entries for plain uploads.
        return [(name, len(payload), 'upload', None) for name in names]

    def listed(filters):
        # Recursive listing of the whole bucket with the given filters applied.
        return [
            (fv.file_name, fv.size, fv.action, folder)
            for fv, folder in self.bucket_ls(recursive=True, filters=filters)
        ]

    for name in ('b/a.txt', 'b/b.txt'):
        self.bucket.upload_bytes(payload, name)

    self.assertEqual(rows('b/a.txt'), listed([Filter.include('b/a.txt')]))

    self.assertEqual(rows('b/b.txt'), listed([Filter.exclude('b/a.txt')]))

def test_filters_mixed_with_wildcards(self):
    """Filters are applied on top of the wildcard pattern passed as the folder to list."""
    payload = b'hello world'

    def rows(*names):
        # Expected listing entries for plain uploads.
        return [(name, len(payload), 'upload', None) for name in names]

    def listed(pattern, filters):
        # Recursive wildcard listing for ``pattern`` with the given filters applied.
        return [
            (fv.file_name, fv.size, fv.action, folder) for fv, folder in self.bucket_ls(
                pattern,
                recursive=True,
                with_wildcard=True,
                filters=filters,
            )
        ]

    for name in (
        'a.csv',
        'a.txt',
        'b/a-1.csv',
        'b/a-1.txt',
        'b/a-2.csv',
        'b/a-2.txt',
        'b/a-a.csv',
        'b/a-a.txt',
        'b/a.csv',
        'b/a.txt',
    ):
        self.bucket.upload_bytes(payload, name)

    self.assertEqual(
        rows('a.txt', 'b/a-1.txt', 'b/a-a.txt', 'b/a.txt'),
        listed('*.txt', [Filter.exclude('*-2.txt')]),
    )

    self.assertEqual(
        rows('b/a-1.csv', 'b/a-1.txt'),
        listed('b/?-[1234567890].*', [Filter.exclude('*-2.*')]),
    )

def test_filters_combination(self):
    """With mixed include/exclude filters, the last filter matching a name decides."""
    payload = b'hello world'
    for name in ('a.txt', 'b/a-1.csv', 'b/a-1.txt'):
        self.bucket.upload_bytes(payload, name)

    rules = [
        Filter.include('b/*'),
        Filter.exclude('*.txt'),
        Filter.include('a.txt'),
    ]
    listing = self.bucket_ls(recursive=True, filters=rules)
    actual = [(fv.file_name, fv.size, fv.action, folder) for fv, folder in listing]

    expected = [(name, len(payload), 'upload', None) for name in ('a.txt', 'b/a-1.csv')]
    self.assertEqual(expected, actual)


class TestGetFreshState(TestCaseWithBucket):
def test_ok(self):
Expand Down
10 changes: 10 additions & 0 deletions test/unit/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
######################################################################
#
# File: test/unit/filter/__init__.py
#
# Copyright 2024 Backblaze Inc. All Rights Reserved.
#
# License https://www.backblaze.com/using_b2_code.html
#
######################################################################
from __future__ import annotations
Loading
Loading