Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion charon.spec
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ Requires: python%{python3_pkgversion}-importlib-metadata
Requires: python%{python3_pkgversion}-zipp
Requires: python%{python3_pkgversion}-attrs
Requires: python%{python3_pkgversion}-pyrsistent
Requires: python%{python3_pkgversion}-beautifulsoup4

%description
Simple Python tool with command line interface for charon init,
Expand Down
30 changes: 18 additions & 12 deletions charon/pkgs/checksum_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"""
from charon.utils.files import digest
from typing import Tuple, List, Dict
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import tempfile
import os
import logging
Expand Down Expand Up @@ -224,24 +224,30 @@ def _list_folder_content(folder_url: str, folder_path: str) -> List[str]:
contentType = r.headers.get('Content-Type')
if contentType and "text/html" in contentType:
pageContent = r.text
return _parseContent(pageContent, folder_path)
p = _IndexParser()
p.feed(pageContent)
return p.get_content(folder_path)
else:
logger.warning("%s is not a folder!", folder_url)
except Exception as e:
logger.error("Can not list folder %s. The error is %s", folder_url, e)
return []


def _parseContent(pageContent: str, parent: str) -> List[str]:
    """Extract link targets from an HTML index page.

    Parses *pageContent* with BeautifulSoup, finds the <ul id="contents">
    element and collects the href of every <a> inside it, skipping empty
    hrefs and the parent-directory link ('../').

    :param pageContent: raw HTML of the directory-index page
    :param parent: path prefix joined onto each extracted href
    :return: list of paths, each os.path.join(parent, href)
    """
    items = []
    soup = BeautifulSoup(pageContent, "html.parser")
    # NOTE(review): .find(...) returns None when the <ul id="contents">
    # element is absent, which would raise AttributeError here — presumably
    # the caller's try/except absorbs that; confirm against the caller.
    contents = soup.find("ul", id="contents").find_all("a")
    for c in contents:
        item = c["href"]
        # Skip empty hrefs and the '../' parent-directory entry.
        if not item or item.strip() == '../':
            continue
        items.append(os.path.join(parent, item))
    return items
class _IndexParser(HTMLParser):
def __init__(self):
super().__init__()
self.reset()
self.__content = []

def handle_starttag(self, tag, attrs):
if tag == "a":
for name, link in attrs:
if name == "href" and link.strip() not in ['../', '']:
self.__content.append(link)

def get_content(self, parent):
return [os.path.join(parent, i) for i in self.__content]


def _read_remote_file_content(remote_file_url: str) -> str:
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@ PyYAML==6.0.1
defusedxml==0.7.1
subresource-integrity==0.2
jsonschema==4.19.0
beautifulsoup4==4.11.1
urllib3==1.26.18