diff --git a/discovery-provider/integration_tests/queries/test_get_sitemap.py b/discovery-provider/integration_tests/queries/test_get_sitemap.py
new file mode 100644
index 00000000000..61fa8aec8a6
--- /dev/null
+++ b/discovery-provider/integration_tests/queries/test_get_sitemap.py
@@ -0,0 +1,122 @@
+import logging
+from unittest import mock
+
+from integration_tests.utils import populate_mock_db
+from src.queries.get_sitemap import (
+ build_default,
+ get_playlist_page,
+ get_playlist_root,
+ get_track_page,
+ get_track_root,
+ get_user_page,
+ get_user_root,
+)
+from src.utils.db_session import get_db
+
+logger = logging.getLogger(__name__)
+
+
+@mock.patch("src.queries.get_sitemap.get_client_base_url")
+@mock.patch("src.queries.get_sitemap.set_base_url")
+def test_get_sitemaps(mock_set_base_url, mock_get_client_base_url, app):
+ """Tests that get sitemap works"""
+ with app.app_context():
+ db = get_db()
+
+ mock_set_base_url.return_value = "https://discoveryprovider.audius.co"
+ mock_get_client_base_url.return_value = "https://discoveryprovider.audius.co"
+
+ test_entities = {
+ "playlists": [
+ {
+ "playlist_id": i,
+ "playlist_owner_id": i,
+ "playlist_name": f"p_name_{i}",
+ "is_album": i % 2 == 0,
+ }
+ for i in range(10)
+ ],
+ "tracks": [{"track_id": i, "owner_id": i} for i in range(10)],
+ "track_routes": [
+ {
+ "track_id": i,
+ "owner_id": i,
+ "slug": f"slug_{i}",
+ "title_slug": f"title_slug_{i}",
+ }
+ for i in range(10)
+ ],
+ "users": [{"user_id": i, "handle": f"user_{i}"} for i in range(20)],
+ }
+
+ populate_mock_db(db, test_entities)
+
+ with db.scoped_session() as session:
+ default_sitemap = build_default()
+ assert (
+ default_sitemap
+ == b'\n \n https://discoveryprovider.audius.co/legal/privacy-policy\n \n \n https://discoveryprovider.audius.co/legal/terms-of-use\n \n \n https://discoveryprovider.audius.co/download\n \n \n https://discoveryprovider.audius.co/feed\n \n \n https://discoveryprovider.audius.co/trending\n \n \n https://discoveryprovider.audius.co/explore\n \n \n https://discoveryprovider.audius.co/explore/playlists\n \n \n https://discoveryprovider.audius.co/explore/underground\n \n \n https://discoveryprovider.audius.co/explore/top-albums\n \n \n https://discoveryprovider.audius.co/explore/remixables\n \n \n https://discoveryprovider.audius.co/explore/feeling-lucky\n \n \n https://discoveryprovider.audius.co/explore/chill\n \n \n https://discoveryprovider.audius.co/explore/upbeat\n \n \n https://discoveryprovider.audius.co/explore/intense\n \n \n https://discoveryprovider.audius.co/explore/provoking\n \n \n https://discoveryprovider.audius.co/explore/intimate\n \n \n https://discoveryprovider.audius.co/signup\n \n \n https://discoveryprovider.audius.co/signin\n \n\n'
+ )
+
+ # Validate that there are 4 track sitemaps - ceil(10 total tracks / 3 tracks per sitemap) = 4
+ track_root = get_track_root(session, 3)
+ assert (
+ track_root
+ == b'\n \n https://discoveryprovider.audius.co/sitemaps/track/1.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/track/2.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/track/3.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/track/4.xml\n \n\n'
+ )
+
+ # Validate that there are 5 playlist sitemaps - 10 total playlists / 2 playlists per sitemap = 5
+ playlist_root = get_playlist_root(session, 2)
+ assert (
+ playlist_root
+ == b'\n \n https://discoveryprovider.audius.co/sitemaps/playlist/1.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/playlist/2.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/playlist/3.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/playlist/4.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/playlist/5.xml\n \n\n'
+ )
+
+ # Validate that there are 2 user sitemaps - ceil(20 total users / 12 users per sitemap) = 2
+ user_root = get_user_root(session, 12)
+ assert (
+ user_root
+ == b'\n \n https://discoveryprovider.audius.co/sitemaps/user/1.xml\n \n \n https://discoveryprovider.audius.co/sitemaps/user/2.xml\n \n\n'
+ )
+
+ # Validate that page 1 returns the first 6 track slugs
+ track_page_1 = get_track_page(session, 1, 6)
+ assert (
+ track_page_1
+ == b'\n \n https://discoveryprovider.audius.co/user_0/slug_0\n \n \n https://discoveryprovider.audius.co/user_1/slug_1\n \n \n https://discoveryprovider.audius.co/user_2/slug_2\n \n \n https://discoveryprovider.audius.co/user_3/slug_3\n \n \n https://discoveryprovider.audius.co/user_4/slug_4\n \n \n https://discoveryprovider.audius.co/user_5/slug_5\n \n\n'
+ )
+
+ # Validate that page 2 returns the remaining 4 track slugs, starting at track 6
+ track_page_2 = get_track_page(session, 2, 6)
+ assert (
+ track_page_2
+ == b'\n \n https://discoveryprovider.audius.co/user_6/slug_6\n \n \n https://discoveryprovider.audius.co/user_7/slug_7\n \n \n https://discoveryprovider.audius.co/user_8/slug_8\n \n \n https://discoveryprovider.audius.co/user_9/slug_9\n \n\n'
+ )
+
+ # Validate that one page returns all playlists (total of 10)
+ playlist_page_1 = get_playlist_page(session, 1, 100)
+ assert (
+ playlist_page_1
+ == b'\n \n https://discoveryprovider.audius.co/user_0/album/p_name_0-0\n \n \n https://discoveryprovider.audius.co/user_1/playlist/p_name_1-1\n \n \n https://discoveryprovider.audius.co/user_2/album/p_name_2-2\n \n \n https://discoveryprovider.audius.co/user_3/playlist/p_name_3-3\n \n \n https://discoveryprovider.audius.co/user_4/album/p_name_4-4\n \n \n https://discoveryprovider.audius.co/user_5/playlist/p_name_5-5\n \n \n https://discoveryprovider.audius.co/user_6/album/p_name_6-6\n \n \n https://discoveryprovider.audius.co/user_7/playlist/p_name_7-7\n \n \n https://discoveryprovider.audius.co/user_8/album/p_name_8-8\n \n \n https://discoveryprovider.audius.co/user_9/playlist/p_name_9-9\n \n\n'
+ )
+
+ # Validate that page 1 starts at user 0 and has 8 user slugs
+ user_page_1 = get_user_page(session, 1, 8)
+ assert (
+ user_page_1
+ == b'\n \n https://discoveryprovider.audius.co/user_0\n \n \n https://discoveryprovider.audius.co/user_1\n \n \n https://discoveryprovider.audius.co/user_2\n \n \n https://discoveryprovider.audius.co/user_3\n \n \n https://discoveryprovider.audius.co/user_4\n \n \n https://discoveryprovider.audius.co/user_5\n \n \n https://discoveryprovider.audius.co/user_6\n \n \n https://discoveryprovider.audius.co/user_7\n \n\n'
+ )
+
+ # Validate that page 2 starts at user 8*1=8 and has 8 user slugs
+ user_page_2 = get_user_page(session, 2, 8)
+ assert (
+ user_page_2
+ == b'\n \n https://discoveryprovider.audius.co/user_8\n \n \n https://discoveryprovider.audius.co/user_9\n \n \n https://discoveryprovider.audius.co/user_10\n \n \n https://discoveryprovider.audius.co/user_11\n \n \n https://discoveryprovider.audius.co/user_12\n \n \n https://discoveryprovider.audius.co/user_13\n \n \n https://discoveryprovider.audius.co/user_14\n \n \n https://discoveryprovider.audius.co/user_15\n \n\n'
+ )
+
+ # Validate that page 3 starts at user 8*2=16 and has only 4 user slugs (the remainder, with 8 max)
+ user_page_3 = get_user_page(session, 3, 8)
+ assert (
+ user_page_3
+ == b'\n \n https://discoveryprovider.audius.co/user_16\n \n \n https://discoveryprovider.audius.co/user_17\n \n \n https://discoveryprovider.audius.co/user_18\n \n \n https://discoveryprovider.audius.co/user_19\n \n\n'
+ )
diff --git a/discovery-provider/requirements.txt b/discovery-provider/requirements.txt
index 785f7fb316e..d0a8e21b8d9 100644
--- a/discovery-provider/requirements.txt
+++ b/discovery-provider/requirements.txt
@@ -49,6 +49,7 @@ opentelemetry-instrumentation-logging==0.33b0
opentelemetry-sdk==1.12.0
opentelemetry-semantic-conventions==0.33b0
opentelemetry-util-http==0.33b0
+lxml==4.9.1
# Solana support
base58==2.1.0
diff --git a/discovery-provider/src/queries/get_sitemap.py b/discovery-provider/src/queries/get_sitemap.py
new file mode 100644
index 00000000000..6338dddd9a1
--- /dev/null
+++ b/discovery-provider/src/queries/get_sitemap.py
@@ -0,0 +1,224 @@
+import logging
+from typing import List, Tuple
+
+from lxml import etree
+from sqlalchemy import asc, func
+from sqlalchemy.orm.session import Session
+from src.models.playlists.playlist import Playlist
+from src.models.tracks.track import Track
+from src.models.tracks.track_route import TrackRoute
+from src.models.users.user import User
+from src.utils.get_all_other_nodes import get_node_endpoint
+
+logger = logging.getLogger(__name__)
+
+
+root_site_maps_routes = [  # entry points served under /sitemaps/ by the query routes
+    "default.xml",
+    "track/index.xml",
+    "playlist/index.xml",
+    "user/index.xml",
+]
+
+
+def get_client_base_url():  # client base URL that create_client_url() prefixes to routes
+ return "https://audius.co"
+
+
+def set_base_url():
+    """Return this node's endpoint from get_node_endpoint() (Optional, so maybe None)."""
+    return get_node_endpoint()
+
+
+def create_client_url(route):
+    """Return `route` appended to the client base URL, separated by one "/"."""
+    return f"{get_client_base_url()}/{route}"
+
+
+def create_xml_url(route):
+    """Return `route` appended to set_base_url()'s endpoint, separated by one "/"."""
+    return f"{set_base_url()}/{route}"  # NOTE(review): a None endpoint renders as "None/..."
+
+
+default_routes = [  # client routes that build_default() lists in the default sitemap
+ # static
+ "legal/privacy-policy",
+ "legal/terms-of-use",
+ "download",
+ # app
+ "feed",
+ "trending",
+ "explore",
+ "explore/playlists",
+ "explore/underground",
+ "explore/top-albums",
+ "explore/remixables",
+ "explore/feeling-lucky",
+ "explore/chill",
+ "explore/upbeat",
+ "explore/intense",
+ "explore/provoking",
+ "explore/intimate",
+ "signup",
+ "signin",
+]
+
+# Max number of URLs in a single sitemap; default `limit` of every root/page builder below
+LIMIT = 50_000
+
+
+def build_default():
+    """Serialize the static client routes in `default_routes` as sitemap XML bytes."""
+    doc = etree.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
+    # NOTE(review): the sitemap protocol puts `url` children under `urlset` (as
+    # get_entity_page does); `sitemap` children belong to `sitemapindex` - confirm.
+    for route in default_routes:
+        entry = etree.SubElement(doc, "sitemap")
+        etree.SubElement(entry, "loc").text = create_client_url(route)
+
+    # pretty_print=True yields indented, newline-terminated output
+    return etree.tostring(doc, pretty_print=True)
+
+
+def get_max_track_count(session: Session) -> int:
+    """Return the number of current tracks whose `stem_of` is NULL."""
+    # `== True` / `== None` are SQLAlchemy column comparisons; `is` would not build SQL.
+    count_query = session.query(func.count(Track.track_id)).filter(
+        Track.is_current == True, Track.stem_of == None
+    )
+    return count_query.one()[0]
+
+
+def get_max_user_count(session: Session) -> int:
+    """Count the users get_user_slugs can list: current, active and with a handle."""
+    # Same filters as get_user_slugs, so the page count matches what the pages hold.
+    count_query = session.query(func.count(User.user_id)).filter(
+        User.is_current == True, User.is_deactivated == False, User.handle_lc != None
+    )
+    return count_query.one()[0]
+
+
+def get_max_playlist_count(session: Session) -> int:
+    """Return the number of current playlists that are neither private nor deleted."""
+    # `== True` / `== False` are SQLAlchemy column comparisons, not Python identity.
+    visible = (
+        Playlist.is_current == True,
+        Playlist.is_private == False,
+        Playlist.is_delete == False,
+    )
+    count_query = session.query(func.count(Playlist.playlist_id)).filter(*visible)
+    row = count_query.one()
+    return row[0]
+
+
+def get_dynamic_root(max: int, base_route: str, limit: int = LIMIT):
+    """Build the index XML linking pages 1..N at `sitemaps/{base_route}/{n}.xml`."""
+    # `max` is the total row count; one page per `limit` rows, rounded up.
+    full_pages, leftover = divmod(max, limit)
+    page_total = full_pages + 1 if leftover else full_pages
+    # NOTE(review): sitemap index files normally use a `sitemapindex` root - confirm.
+    index = etree.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
+    for page in range(1, page_total + 1):
+        loc = etree.SubElement(etree.SubElement(index, "sitemap"), "loc")
+        loc.text = create_xml_url(f"sitemaps/{base_route}/{page}.xml")
+    return etree.tostring(index, pretty_print=True)
+
+
+def get_entity_page(slugs: List[str]):
+    """Render `slugs` as a `urlset` of client URLs and return the XML bytes."""
+    doc = etree.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
+    for slug in slugs:
+        # Each slug becomes one `url` element wrapping a `loc` with the full client URL.
+        url_el = etree.SubElement(doc, "url")
+        loc_el = etree.SubElement(url_el, "loc")
+        loc_el.text = create_client_url(slug)
+    return etree.tostring(doc, pretty_print=True)
+
+
+def get_track_slugs(session: Session, limit: int, offset: int):
+    """Return `handle_lc/slug` paths for one page of tracks, ordered by track id."""
+    # Only current tracks with a NULL `stem_of` and a current owner row are listed.
+    listed = (Track.is_current == True, Track.stem_of == None, User.is_current == True)
+    query = (
+        session.query(User.handle_lc, TrackRoute.slug)
+        .join(Track, TrackRoute.track_id == Track.track_id)
+        .join(User, TrackRoute.owner_id == User.user_id)
+        .filter(*listed)
+        .order_by(asc(Track.track_id))
+    )
+    # Ordering by track id gives limit/offset a fixed sequence to page through.
+    rows: List[Tuple[str, str]] = query.limit(limit).offset(offset).all()
+    # Each row is (owner handle, track slug).
+    return [f"{handle}/{slug}" for handle, slug in rows]
+
+
+def get_playlist_slugs(session: Session, limit: int, offset: int):
+    """Return `handle_lc/(album|playlist)/name-id` paths for one page of playlists."""
+    query = (
+        session.query(
+            User.handle_lc,
+            Playlist.playlist_name,
+            Playlist.playlist_id,
+            Playlist.is_album,
+        )
+        .join(User, User.user_id == Playlist.playlist_owner_id)
+        .filter(Playlist.is_current == True, Playlist.is_private == False)
+        # As elsewhere here: skip deleted playlists and non-current owner rows.
+        .filter(Playlist.is_delete == False, User.is_current == True)
+        .order_by(asc(Playlist.playlist_id))
+    )
+    rows: List[Tuple[str, str, int, bool]] = query.limit(limit).offset(offset).all()
+    return [
+        f"{handle}/{'album' if is_album else 'playlist'}/{name}-{playlist_id}"
+        for handle, name, playlist_id, is_album in rows
+    ]
+
+
+def get_user_slugs(session: Session, limit: int, offset: int):
+    """Return one page of `handle_lc` values for current, non-deactivated users."""
+    page = (
+        session.query(User.handle_lc)
+        .filter(User.is_current == True, User.is_deactivated == False)
+        # Rows with a NULL handle are dropped; the handle is the whole slug.
+        .filter(User.handle_lc != None)
+        # Ordering by user id gives limit/offset a fixed sequence to page through.
+        .order_by(asc(User.user_id))
+        .limit(limit)
+        .offset(offset)
+    )
+
+    # Every result row is a 1-tuple holding the handle.
+    return [handle for (handle,) in page.all()]
+
+
+def get_track_root(session: Session, limit: int = LIMIT):
+    """Return the track sitemap index: one entry per `limit` counted tracks."""
+    return get_dynamic_root(get_max_track_count(session), "track", limit)
+
+
+def get_playlist_root(session: Session, limit: int = LIMIT):
+    """Return the playlist sitemap index: one entry per `limit` counted playlists."""
+    return get_dynamic_root(get_max_playlist_count(session), "playlist", limit)
+
+
+def get_user_root(session: Session, limit: int = LIMIT):
+    """Return the user sitemap index: one entry per `limit` counted users."""
+    return get_dynamic_root(get_max_user_count(session), "user", limit)
+
+
+def get_track_page(session: Session, page: int, limit: int = LIMIT):
+    """Return sitemap XML for the 1-based `page` of up to `limit` track URLs."""
+    first_row = limit * (page - 1)  # page 1 starts at offset 0
+    return get_entity_page(get_track_slugs(session, limit, first_row))
+
+
+def get_playlist_page(session: Session, page: int, limit: int = LIMIT):
+    """Return sitemap XML for the 1-based `page` of up to `limit` playlist URLs."""
+    first_row = limit * (page - 1)  # page 1 starts at offset 0
+    return get_entity_page(get_playlist_slugs(session, limit, first_row))
+
+
+def get_user_page(session: Session, page: int, limit: int = LIMIT):
+    """Return sitemap XML for the 1-based `page` of up to `limit` user URLs."""
+    first_row = limit * (page - 1)  # page 1 starts at offset 0
+    return get_entity_page(get_user_slugs(session, limit, first_row))
diff --git a/discovery-provider/src/queries/queries.py b/discovery-provider/src/queries/queries.py
index 14407b46641..78ff68024a8 100644
--- a/discovery-provider/src/queries/queries.py
+++ b/discovery-provider/src/queries/queries.py
@@ -1,6 +1,7 @@
import logging # pylint: disable=C0302
+import re
-from flask import Blueprint, request
+from flask import Blueprint, Response, request
from src import api_helpers, exceptions
from src.queries.get_cid_source import get_cid_source
from src.queries.get_feed import get_feed
@@ -24,6 +25,15 @@
from src.queries.get_savers_for_playlist import get_savers_for_playlist
from src.queries.get_savers_for_track import get_savers_for_track
from src.queries.get_saves import get_saves
+from src.queries.get_sitemap import (
+ build_default,
+ get_playlist_page,
+ get_playlist_root,
+ get_track_page,
+ get_track_root,
+ get_user_page,
+ get_user_root,
+)
from src.queries.get_sol_plays import (
get_sol_play,
get_total_aggregate_plays,
@@ -44,6 +54,7 @@
from src.queries.get_users import get_users
from src.queries.get_users_account import get_users_account
from src.queries.query_helpers import get_current_user_id, get_pagination_vars
+from src.utils.db_session import get_db_read_replica
from src.utils.redis_metrics import record_metrics
logger = logging.getLogger(__name__)
@@ -645,3 +656,62 @@ def get_user_history_route(user_id):
return api_helpers.success_response(user_history)
except exceptions.ArgumentError as e:
return api_helpers.error_response(str(e), 400)
+
+
+@bp.route("/sitemaps/default.xml", methods=("GET",))
+def get_base_sitemap():
+    """Serve the default sitemap built by build_default() as `text/xml`."""
+    try:
+        return Response(build_default(), mimetype="text/xml")
+    except exceptions.ArgumentError as e:
+        return api_helpers.error_response(str(e), 400)
+
+
+@bp.route("/sitemaps/<type>/index.xml", methods=("GET",))
+def get_type_base_sitemap(type):
+    """Serve the sitemap index for `type`, which must be playlist, track or user."""
+    # The route needs the `type` URL variable to fill the handler's parameter.
+    root_builders = {
+        "playlist": get_playlist_root,
+        "track": get_track_root,
+        "user": get_user_root,
+    }
+    try:
+        # Reject unknown types before opening a database session.
+        if type not in root_builders:
+            return api_helpers.error_response(
+                f"Invalid sitemap type {type}, should be one of playlist, track, user",
+                400,
+            )
+        with get_db_read_replica().scoped_session() as session:
+            return Response(root_builders[type](session), mimetype="text/xml")
+    except exceptions.ArgumentError as e:
+        return api_helpers.error_response(str(e), 400)
+
+
+@bp.route("/sitemaps/<type>/<file_name>", methods=("GET",))
+def get_type_sitemap_page(type: str, file_name: str):
+    """Serve page N (N >= 1) of the `type` sitemap; `file_name` must be `N.xml`."""
+    pages = {
+        "playlist": get_playlist_page,
+        "track": get_track_page,
+        "user": get_user_page,
+    }
+    try:
+        # Raw string keeps `\d` a regex escape; fullmatch rejects e.g. "x1.xml".
+        number = re.fullmatch(r"(\d+)\.xml", file_name)
+        page_number = int(number.group(1)) if number else 0
+        # Page 0 is refused as well: it would become a negative SQL OFFSET.
+        if page_number < 1:
+            return api_helpers.error_response(
+                f"Invalid filepath {file_name}, should be of format .xml", 400
+            )
+        if type not in pages:
+            return api_helpers.error_response(
+                f"Invalid sitemap type {type}, should be one of playlist, track, user",
+                400,
+            )
+        with get_db_read_replica().scoped_session() as session:
+            return Response(pages[type](session, page_number), mimetype="text/xml")
+    except exceptions.ArgumentError as e:
+        return api_helpers.error_response(str(e), 400)
diff --git a/discovery-provider/src/utils/get_all_other_nodes.py b/discovery-provider/src/utils/get_all_other_nodes.py
index d67385379fb..2ac34841d7b 100644
--- a/discovery-provider/src/utils/get_all_other_nodes.py
+++ b/discovery-provider/src/utils/get_all_other_nodes.py
@@ -61,7 +61,7 @@ def get_node_endpoint() -> Optional[str]:
try:
node_info = future.result()
wallet = node_info[3]
- if wallet == shared_config["delegate"]["owner_wallet"]:
+ if wallet.lower() == shared_config["delegate"]["owner_wallet"].lower():
endpoint = node_info[1]
break
except Exception as e: