Skip to content

Commit

Permalink
check for non html files from wget
Browse files Browse the repository at this point in the history
  • Loading branch information
pirate committed Jan 25, 2021
1 parent c6f0b8e commit 9764a8e
Showing 1 changed file with 11 additions and 0 deletions.
11 changes: 11 additions & 0 deletions archivebox/extractors/wget.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,22 @@ def wget_output_path(link: Link) -> Optional[str]:
if html_files:
return str(html_files[0].relative_to(link.link_dir))

# sometimes wget'd URLs have no extension and return non-HTML content,
# e.g. /some/example/rss/all -> (some RSS XML content)
#      /some/other/url.o4g -> (some binary file with an unrecognized ext)
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in os.listdir(search_dir):
if file_present == last_part_of_url:
return os.path.join(path_from_link_dir, file_present)

# Move up one directory level
search_dir = search_dir.parent

if str(search_dir) == link.link_dir:
break



search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
if not search_dir.is_dir():
Expand Down

0 comments on commit 9764a8e

Please sign in to comment.