Skip to content

Commit

Permalink
Refactor should_save_extractor methods to accept overwrite parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
thedanbob committed Jan 21, 2021
1 parent 553c3ca commit 5420903
Show file tree
Hide file tree
Showing 14 changed files with 56 additions and 50 deletions.
2 changes: 1 addition & 1 deletion archivebox/extractors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
if method_name not in link.history:
link.history[method_name] = []

- if should_run(link, out_dir) or overwrite:
+ if should_run(link, out_dir, overwrite):
log_archive_method_started(method_name)

result = method_function(link=link, out_dir=out_dir)
Expand Down
6 changes: 3 additions & 3 deletions archivebox/extractors/archive_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@


@enforce_types
- def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None) -> bool:
- out_dir = out_dir or Path(link.link_dir)
+ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

- if (out_dir / "archive.org.txt").exists():
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'archive.org.txt').exists():
# if open(path, 'r').read().strip() != 'None':
return False

Expand Down
10 changes: 5 additions & 5 deletions archivebox/extractors/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@


@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or Path(link.link_dir)
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

if (out_dir / 'output.html').exists():

out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'output.html').exists():
return False

return SAVE_DOM

@enforce_types
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
Expand Down
8 changes: 4 additions & 4 deletions archivebox/extractors/favicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@


@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if (Path(out_dir) / 'favicon.ico').exists():
def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'favicon.ico').exists():
return False

return SAVE_FAVICON

@enforce_types
def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
Expand Down
6 changes: 3 additions & 3 deletions archivebox/extractors/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@


@enforce_types
- def should_save_git(link: Link, out_dir: Optional[Path]=None) -> bool:
- out_dir = out_dir or link.link_dir
+ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

- if (out_dir / "git").exists():
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'git').exists():
return False

is_clonable_url = (
Expand Down
9 changes: 5 additions & 4 deletions archivebox/extractors/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@
from ..logging_util import TimedProgress

@enforce_types
- def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
+ def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'headers.json').exists():
+ return False

- output = Path(out_dir or link.link_dir) / 'headers.json'
- return not output.exists() and SAVE_HEADERS
+ return SAVE_HEADERS


@enforce_types
Expand Down
7 changes: 3 additions & 4 deletions archivebox/extractors/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,12 @@


@enforce_types
- def should_save_media(link: Link, out_dir: Optional[Path]=None) -> bool:
- out_dir = out_dir or link.link_dir
-
+ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

- if (out_dir / "media").exists():
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'media').exists():
return False

return SAVE_MEDIA
Expand Down
10 changes: 6 additions & 4 deletions archivebox/extractors/mercury.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@ def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> Archi


@enforce_types
- def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
+ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

- output = Path(out_dir or link.link_dir) / 'mercury'
- return SAVE_MERCURY and MERCURY_VERSION and (not output.exists())
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'mercury').exists():
+ return False
+
+ return SAVE_MERCURY


@enforce_types
Expand Down
8 changes: 4 additions & 4 deletions archivebox/extractors/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@


@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or Path(link.link_dir)
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

if (out_dir / "output.pdf").exists():

out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'output.pdf').exists():
return False

return SAVE_PDF
Expand Down
10 changes: 6 additions & 4 deletions archivebox/extractors/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ def get_html(link: Link, path: Path) -> str:
return document

@enforce_types
- def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
- out_dir = out_dir or link.link_dir
+ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

- output = Path(out_dir or link.link_dir) / 'readability'
- return SAVE_READABILITY and READABILITY_VERSION and (not output.exists())
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'readability').exists():
+ return False
+
+ return SAVE_READABILITY


@enforce_types
Expand Down
8 changes: 4 additions & 4 deletions archivebox/extractors/screenshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@


@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or Path(link.link_dir)
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

if (out_dir / "screenshot.png").exists():

out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'screenshot.png').exists():
return False

return SAVE_SCREENSHOT
Expand Down
10 changes: 6 additions & 4 deletions archivebox/extractors/singlefile.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@


@enforce_types
- def should_save_singlefile(link: Link, out_dir: Optional[Path]=None) -> bool:
- out_dir = out_dir or Path(link.link_dir)
+ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False

- output = out_dir / 'singlefile.html'
- return SAVE_SINGLEFILE and SINGLEFILE_VERSION and (not output.exists())
+ out_dir = out_dir or Path(link.link_dir)
+ if not overwrite and (out_dir / 'singlefile.html').exists():
+ return False
+
+ return SAVE_SINGLEFILE


@enforce_types
Expand Down
8 changes: 4 additions & 4 deletions archivebox/extractors/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ def handle_endtag(self, tag):


@enforce_types
- def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
- # if link already has valid title, skip it
- if link.title and not link.title.lower().startswith('http'):
+ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+ if is_static_file(link.url):
return False

- if is_static_file(link.url):
+ # if link already has valid title, skip it
+ if not overwrite and link.title and not link.title.lower().startswith('http'):
return False

return SAVE_TITLE
Expand Down
4 changes: 2 additions & 2 deletions archivebox/extractors/wget.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@


@enforce_types
- def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool:
+ def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
output_path = wget_output_path(link)
out_dir = out_dir or Path(link.link_dir)
- if output_path and (out_dir / output_path).exists():
+ if not overwrite and output_path and (out_dir / output_path).exists():
return False

return SAVE_WGET
Expand Down

0 comments on commit 5420903

Please sign in to comment.