Skip to content
This repository has been archived by the owner on Jun 3, 2024. It is now read-only.

Refactor sitemap generation code #264

Merged
merged 10 commits into from
Jul 27, 2023
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ ADD \

ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar

# Workaround for build issue https://github.com/pyproj4/pyproj/issues/1321: install pyproj from the upstream main branch instead of the pinned 3.4.1 release
RUN pip install --upgrade pip
# RUN python3 -m pip install 'cython<3'
# RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1
RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@main

COPY . $APP_DIR/

Expand Down
129 changes: 67 additions & 62 deletions ckanext/geodatagov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,9 @@ def datagovs3():


class Sitemap:
"""Sitemap object

Accepts file_num, start, page_size
"""

def __init__(self, file_num: str, start: int, page_size: int) -> None:
self.file_num = file_num
self.filename_s3 = f"sitemap/sitemap-{file_num}.xml"
self.start = start
self.page_size = page_size
self.xml = ""
Expand All @@ -64,9 +59,22 @@ def write_xml(self, some_xml, add_newline=True) -> None:
else:
self.xml += some_xml

def write_sitemap_header(self) -> None:
def to_json(self) -> str:
return json.dumps(self, default=lambda o: o.__dict__)

def write_sitemap_header(self, index=False) -> None:
self.write_xml('<?xml version="1.0" encoding="UTF-8"?>')
self.write_xml('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
if index:
self.write_xml('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
else:
self.write_xml('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')


class SitemapData(Sitemap):

def __init__(self, file_num: str, start: int, page_size: int) -> None:
super().__init__(file_num, start, page_size)
self.filename_s3 = f"sitemap/sitemap-{file_num}.xml"

def write_pkgs(self, package_query: GeoPackageSearchQuery) -> None:

Expand All @@ -86,8 +94,26 @@ def write_pkgs(self, package_query: GeoPackageSearchQuery) -> None:
def write_sitemap_footer(self) -> None:
self.write_xml("</urlset>")

def to_json(self) -> str:
return json.dumps(self, default=lambda o: o.__dict__)

class SitemapIndex(Sitemap):
    """Sitemap index document that lists the individual data sitemap files.

    The index is stored under the fixed key ``sitemap.xml``.
    """

    def __init__(self, file_num: str, start: int, page_size: int) -> None:
        super().__init__(file_num, start, page_size)
        self.filename_s3 = "sitemap.xml"

    def write_table_of_contents(self, number_of_sitemaps: int) -> None:
        """Write one ``<sitemap>`` entry per data sitemap file and close the index.

        Args:
            number_of_sitemaps: total number of data sitemap files. Entries
                are written for files numbered ``1`` through
                ``number_of_sitemaps`` inclusive.
        """
        current_time = datetime.datetime.now().strftime("%Y-%m-%d")
        # The site URL does not change between entries, so look it up once.
        site_url = config.get('ckan.site_url')

        log.info("Creating sitemap index...")

        # Data sitemap files are numbered starting at 1 (sitemap_to_s3 creates
        # them with range(1, num_of_pages + 1)), so the index must use the
        # same 1-based numbering; a 0-based range would reference a
        # non-existent sitemap-0.xml and omit the last file.
        for file_num in range(1, number_of_sitemaps + 1):
            # add sitemaps to sitemap index file
            self.write_xml("<sitemap>")
            loc = f"{site_url}/sitemap/sitemap-{file_num}.xml"
            self.write_xml(f"<loc>{loc}</loc>")
            self.write_xml(f"<lastmod>{current_time}</lastmod>")
            self.write_xml("</sitemap>")
        self.write_xml("</sitemapindex>")


def get_s3() -> None:
Expand Down Expand Up @@ -174,49 +200,20 @@ def upload_to_key(upload_str: str, filename_on_s3: str) -> None:
else:
log.error(f"File {filename_on_s3} upload failed. Error: {resp_metadata}")

del temp_file

def upload_sitemap_index(sitemaps: list) -> None:
"""Creates and uploads sitemap index xml file"""

current_time = datetime.datetime.now().strftime("%Y-%m-%d")
sitemap_index = Sitemap("index", 0, 0)
sitemap_index.filename_s3 = "sitemap.xml"

log.info("Creating sitemap index...")
# write sitemap index
sitemap_index.write_xml('<?xml version="1.0" encoding="UTF-8"?>')
sitemap_index.write_xml(
'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
)

for sitemap in sitemaps:
# add sitemaps to sitemap index file
sitemap_index.write_xml("<sitemap>")
loc = f"{CKAN_SITE_URL}/{sitemap.filename_s3}"
sitemap_index.write_xml(f"<loc>{loc}</loc>")
sitemap_index.write_xml(f"<lastmod>{current_time}</lastmod>")
sitemap_index.write_xml("</sitemap>")
sitemap_index.write_xml("</sitemapindex>")
def upload_sitemap_file(sitemap: list) -> None:
"""Handles uploading sitemap files to s3"""

upload_to_key(sitemap_index.xml, sitemap_index.filename_s3)
log.info("Uploading sitemap file...")
upload_to_key(sitemap.xml, sitemap.filename_s3)
log.info(
f"Sitemap index upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap_index.filename_s3}"
f"Sitemap file {sitemap.filename_s3} upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap.filename_s3}"
)


def upload_sitemap_files(sitemaps: list) -> None:
"""Handles uploading sitemap files to s3"""

log.info(f"Uploading {len(sitemaps)} sitemap files...")
for sitemap in sitemaps:
upload_to_key(sitemap.xml, sitemap.filename_s3)
log.info(
f"Sitemap file {sitemap.filename_s3} upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap.filename_s3}"
)


@geodatagov.command()
@click.option("--upload_to_s3", default=UPLOAD_TO_S3, type=click.BOOL)
@click.option("--page_size", default=PAGE_SIZE, type=click.INT)
Expand All @@ -233,12 +230,25 @@ def sitemap_to_s3(upload_to_s3: bool, page_size: int, max_per_page: int):
return

start = 0
file_num = 1
sitemaps = []

paginations = (count // page_size) + 1
for _ in range(paginations):
sitemap = Sitemap(str(file_num), start, page_size)
num_of_pages = (count // page_size) + 1

# Create + Upload Sitemap Index File
sitemap_index = SitemapIndex("index", 0, 0)
sitemap_index.write_sitemap_header(index=True)
sitemap_index.write_table_of_contents(num_of_pages)

if upload_to_s3:
# set global S3 object and vars
get_s3()
upload_to_key(sitemap_index.xml, sitemap_index.filename_s3)
log.info(
f"Sitemap index upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap_index.filename_s3}"
)

for file_num in range(1, num_of_pages + 1):
sitemap = SitemapData(str(file_num), start, page_size)
sitemap.write_sitemap_header()
sitemap.write_pkgs(package_query)
sitemap.write_sitemap_footer()
Expand All @@ -253,22 +263,17 @@ def sitemap_to_s3(upload_to_s3: bool, page_size: int, max_per_page: int):
# 597610699434bde9415a48ed0b1085bfa0e9720f/ckanext/geodatagov/cli.py#L183

log.info(f"done with {sitemap.filename_s3}.")
sitemaps.append(sitemap)

start += page_size
file_num += 1

if upload_to_s3:
log.info("Starting S3 uploads...")
# set global S3 object and vars
get_s3()
if upload_to_s3:
log.info(f"Uploading {sitemap.filename_s3}...")
upload_sitemap_file(sitemap)
else:
log.info(f"Skip upload and return local copy of sitemap {file_num}.")
print(json.dumps(sitemap.to_json(), indent=4))

upload_sitemap_index(sitemaps)
upload_sitemap_files(sitemaps)
else:
log.info("Skip upload and finish.")
dump = [sitemap.to_json() for sitemap in sitemaps]
print(f"Done locally: Sitemap list\n{json.dumps(dump, indent=4)}")
del sitemap


def _normalize_type(_type):
Expand Down
14 changes: 4 additions & 10 deletions ckanext/geodatagov/tests/test_sitemap_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,14 @@ def test_cli_output(cli_result: Result) -> None:
# the example output I have only has one element in it,
# this and _handle_cli_output will need to be updated for examples with more elements
# checks only one list element in output string
assert cli_result.output.count("[") == 1
assert cli_result.output.count("]") == 1
assert cli_result.output.count("file_num") == 1

@staticmethod
def _handle_cli_output(cli_result: Result) -> list:
"""Parses cli output Result to an iterable file_list"""

file_list = [
eval(
cli_result.output[
cli_result.output.index("[") + 1: cli_result.output.index("]") - 1
].strip()
)
]
file_list = cli_result.output.split("}\"\n")
file_list = list(set([f + "}\"" for f in file_list]) - {'}\"'})

return file_list

Expand All @@ -79,7 +73,7 @@ def test_create_sitemap(self, cli_result):
datasets = 0
for site_file in file_list:
# site_file is dumped as string
site_file = eval(site_file)
site_file = eval(eval(site_file))

files += 1
""" expected something like
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ pyOpenSSL>22.10 #pinning to fix error with crypto (https://levelup.gitconnected.
# ckantoolkit # included as dep of ckanext-harvest
GeoAlchemy2==0.5.0
Shapely>=1.2.13
pyproj==3.4.1
OWSLib==0.28.1
lxml>=2.3
argparse
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="ckanext-geodatagov",
version="0.2.0",
version="0.2.1",
description="",
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down