From f873142572472c1f61e06e97b6ecfd35a540c3d2 Mon Sep 17 00:00:00 2001
From: Joseph Lee
Date: Mon, 3 Oct 2022 18:17:54 -0400
Subject: [PATCH] Add Discovery Sitemap Endpoint (#3973)

* Add sitemap methods to discovery

* Fix sitemap urls and add tests

* rename route for sitemap to be more consistent

* Fix double set

* Fix integration test

* Review fixes: emit <url> entries in the default <urlset> and a
  <sitemapindex> root for index files, align the playlist page filters
  with the playlist count, match page file names strictly and reject
  page 0
---
 .../queries/test_get_sitemap.py               | 146 +++++++++++
 discovery-provider/requirements.txt           |   1 +
 discovery-provider/src/queries/get_sitemap.py | 253 ++++++++++++++++++
 discovery-provider/src/queries/queries.py     |  81 +++++-
 .../src/utils/get_all_other_nodes.py          |   2 +-
 5 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 discovery-provider/integration_tests/queries/test_get_sitemap.py
 create mode 100644 discovery-provider/src/queries/get_sitemap.py

diff --git a/discovery-provider/integration_tests/queries/test_get_sitemap.py b/discovery-provider/integration_tests/queries/test_get_sitemap.py
new file mode 100644
index 00000000000..61fa8aec8a6
--- /dev/null
+++ b/discovery-provider/integration_tests/queries/test_get_sitemap.py
@@ -0,0 +1,146 @@
+import logging
+from unittest import mock
+
+from integration_tests.utils import populate_mock_db
+from src.queries.get_sitemap import (
+    build_default,
+    get_playlist_page,
+    get_playlist_root,
+    get_track_page,
+    get_track_root,
+    get_user_page,
+    get_user_root,
+)
+from src.utils.db_session import get_db
+
+logger = logging.getLogger(__name__)
+
+BASE_URL = "https://discoveryprovider.audius.co"
+SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
+
+# Expected static client routes, pinned here independently of get_sitemap.py
+DEFAULT_ROUTES = [
+    "legal/privacy-policy",
+    "legal/terms-of-use",
+    "download",
+    "feed",
+    "trending",
+    "explore",
+    "explore/playlists",
+    "explore/underground",
+    "explore/top-albums",
+    "explore/remixables",
+    "explore/feeling-lucky",
+    "explore/chill",
+    "explore/upbeat",
+    "explore/intense",
+    "explore/provoking",
+    "explore/intimate",
+    "signup",
+    "signin",
+]
+
+
+def expected_xml(root_tag, entry_tag, routes):
+    """Return the bytes lxml pretty-prints for a sitemap document of routes."""
+    entries = "".join(
+        f"  <{entry_tag}>\n    <loc>{BASE_URL}/{route}</loc>\n  </{entry_tag}>\n"
+        for route in routes
+    )
+    return f'<{root_tag} xmlns="{SITEMAP_NS}">\n{entries}</{root_tag}>\n'.encode()
+
+
+@mock.patch("src.queries.get_sitemap.get_client_base_url")
+@mock.patch("src.queries.get_sitemap.set_base_url")
+def test_get_sitemaps(mock_set_base_url, mock_get_client_base_url, app):
+    """Tests that get sitemap works"""
+    with app.app_context():
+        db = get_db()
+
+    mock_set_base_url.return_value = BASE_URL
+    mock_get_client_base_url.return_value = BASE_URL
+
+    test_entities = {
+        "playlists": [
+            {
+                "playlist_id": i,
+                "playlist_owner_id": i,
+                "playlist_name": f"p_name_{i}",
+                "is_album": i % 2 == 0,
+            }
+            for i in range(10)
+        ],
+        "tracks": [{"track_id": i, "owner_id": i} for i in range(10)],
+        "track_routes": [
+            {
+                "track_id": i,
+                "owner_id": i,
+                "slug": f"slug_{i}",
+                "title_slug": f"title_slug_{i}",
+            }
+            for i in range(10)
+        ],
+        "users": [{"user_id": i, "handle": f"user_{i}"} for i in range(20)],
+    }
+
+    populate_mock_db(db, test_entities)
+
+    with db.scoped_session() as session:
+        # Static client routes are <url> entries of a <urlset>
+        assert build_default() == expected_xml("urlset", "url", DEFAULT_ROUTES)
+
+        # 10 tracks at 3 per sitemap -> ceil(10 / 3) = 4 track sitemaps
+        assert get_track_root(session, 3) == expected_xml(
+            "sitemapindex",
+            "sitemap",
+            [f"sitemaps/track/{page}.xml" for page in range(1, 5)],
+        )
+
+        # 10 playlists at 2 per sitemap -> 5 playlist sitemaps
+        assert get_playlist_root(session, 2) == expected_xml(
+            "sitemapindex",
+            "sitemap",
+            [f"sitemaps/playlist/{page}.xml" for page in range(1, 6)],
+        )
+
+        # 20 users at 12 per sitemap -> ceil(20 / 12) = 2 user sitemaps
+        assert get_user_root(session, 12) == expected_xml(
+            "sitemapindex",
+            "sitemap",
+            [f"sitemaps/user/{page}.xml" for page in range(1, 3)],
+        )
+
+        # Page 1 of 6 per page returns the first 6 track slugs
+        assert get_track_page(session, 1, 6) == expected_xml(
+            "urlset", "url", [f"user_{i}/slug_{i}" for i in range(6)]
+        )
+
+        # Page 2 returns the remaining 4 track slugs, starting at track 6
+        assert get_track_page(session, 2, 6) == expected_xml(
+            "urlset", "url", [f"user_{i}/slug_{i}" for i in range(6, 10)]
+        )
+
+        # A page larger than the table returns all 10 playlists; even ids are albums
+        assert get_playlist_page(session, 1, 100) == expected_xml(
+            "urlset",
+            "url",
+            [
+                f"user_{i}/{'album' if i % 2 == 0 else 'playlist'}/p_name_{i}-{i}"
+                for i in range(10)
+            ],
+        )
+
+        # Page 1 of 8 per page starts at user 0
+        assert get_user_page(session, 1, 8) == expected_xml(
+            "urlset", "url", [f"user_{i}" for i in range(8)]
+        )
+
+        # Page 2 starts at user 8 * 1 = 8
+        assert get_user_page(session, 2, 8) == expected_xml(
+            "urlset", "url", [f"user_{i}" for i in range(8, 16)]
+        )
+
+        # Page 3 starts at user 8 * 2 = 16 and holds only the remaining 4 users
+        assert get_user_page(session, 3, 8) == expected_xml(
+            "urlset", "url", [f"user_{i}" for i in range(16, 20)]
+        )
diff --git a/discovery-provider/requirements.txt b/discovery-provider/requirements.txt
index 785f7fb316e..d0a8e21b8d9 100644
--- a/discovery-provider/requirements.txt
+++ b/discovery-provider/requirements.txt
@@ -49,6 +49,7 @@ opentelemetry-instrumentation-logging==0.33b0
 opentelemetry-sdk==1.12.0
 opentelemetry-semantic-conventions==0.33b0
 opentelemetry-util-http==0.33b0
+lxml==4.9.1
 
 # Solana support
 base58==2.1.0
diff --git a/discovery-provider/src/queries/get_sitemap.py b/discovery-provider/src/queries/get_sitemap.py
new file mode 100644
index 00000000000..6338dddd9a1
--- /dev/null
+++ b/discovery-provider/src/queries/get_sitemap.py
@@ -0,0 +1,253 @@
+import logging
+from typing import List, Tuple
+
+from lxml import etree
+from sqlalchemy import asc, func
+from sqlalchemy.orm.session import Session
+from src.models.playlists.playlist import Playlist
+from src.models.tracks.track import Track
+from src.models.tracks.track_route import TrackRoute
+from src.models.users.user import User
+from src.utils.get_all_other_nodes import get_node_endpoint
+
+logger = logging.getLogger(__name__)
+
+# XML namespace required on the root element by the sitemap protocol
+SITEMAP_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9"
+
+# NOTE(review): nothing in this change reads this list, and its entries do not
+# match the routes registered in queries.py (sitemaps/default.xml and
+# sitemaps/<type>/index.xml) -- confirm the intended consumer.
+root_site_maps_routes = [
+    "defaults.xml",
+    "tracks/index.xml",
+    "collections/index.xml",
+    "users/index.xml",
+]
+
+
+def get_client_base_url():
+    """Return the base URL of the web client that entity URLs point to."""
+    return "https://audius.co"
+
+
+def set_base_url():
+    """Return this node's endpoint (a getter, despite the name).
+
+    NOTE(review): get_node_endpoint() is typed Optional[str]; a None result would
+    be formatted into sitemap URLs as the text "None" -- confirm it cannot occur.
+    """
+    return get_node_endpoint()
+
+
+def create_client_url(route):
+    """Return the absolute client URL for a client route such as "feed"."""
+    return f"{get_client_base_url()}/{route}"
+
+
+def create_xml_url(route):
+    """Return the absolute URL of a sitemap file served by this node."""
+    return f"{set_base_url()}/{route}"
+
+
+default_routes = [
+    # static
+    "legal/privacy-policy",
+    "legal/terms-of-use",
+    "download",
+    # app
+    "feed",
+    "trending",
+    "explore",
+    "explore/playlists",
+    "explore/underground",
+    "explore/top-albums",
+    "explore/remixables",
+    "explore/feeling-lucky",
+    "explore/chill",
+    "explore/upbeat",
+    "explore/intense",
+    "explore/provoking",
+    "explore/intimate",
+    "signup",
+    "signin",
+]
+
+# The max number of urls that can be in a single sitemap
+LIMIT = 50_000
+
+
+def _build_document(root_tag: str, entry_tag: str, urls: List[str]) -> bytes:
+    """Serialize urls as <entry_tag><loc>url</loc></entry_tag> children of root_tag."""
+    root = etree.Element(root_tag, xmlns=SITEMAP_NAMESPACE)
+    for url in urls:
+        entry = etree.SubElement(root, entry_tag)
+        etree.SubElement(entry, "loc").text = url
+    return etree.tostring(root, pretty_print=True)
+
+
+def build_default():
+    """Return the <urlset> sitemap of the static client routes."""
+    # The sitemap protocol requires <url> (not <sitemap>) entries inside <urlset>
+    return _build_document(
+        "urlset", "url", [create_client_url(route) for route in default_routes]
+    )
+
+
+def get_max_track_count(session: Session) -> int:
+    """Count current tracks that are not stems."""
+    row = (
+        session.query(func.count(Track.track_id))
+        .filter(Track.is_current == True, Track.stem_of == None)
+        .one()
+    )
+    return row[0]
+
+
+def get_max_user_count(session: Session) -> int:
+    """Count current users that are not deactivated."""
+    row = (
+        session.query(func.count(User.user_id))
+        .filter(User.is_current == True, User.is_deactivated == False)
+        .one()
+    )
+    return row[0]
+
+
+def get_max_playlist_count(session: Session) -> int:
+    """Count current playlists that are neither private nor deleted."""
+    row = (
+        session.query(func.count(Playlist.playlist_id))
+        .filter(
+            Playlist.is_current == True,
+            Playlist.is_private == False,
+            Playlist.is_delete == False,
+        )
+        .one()
+    )
+    return row[0]
+
+
+def get_dynamic_root(max: int, base_route: str, limit: int = LIMIT):
+    """Return the <sitemapindex> listing every page file of base_route.
+
+    max is the total entity count (the name shadows the builtin but is part of
+    the signature); limit is the number of entities per page file.
+    """
+    # Ceiling division on ints, avoiding the float round-trip of int(max / limit)
+    num_pages = -(-max // limit)
+    # A document that lists other sitemap files must be a <sitemapindex>
+    return _build_document(
+        "sitemapindex",
+        "sitemap",
+        [
+            create_xml_url(f"sitemaps/{base_route}/{page}.xml")
+            for page in range(1, num_pages + 1)
+        ],
+    )
+
+
+def get_entity_page(slugs: List[str]):
+    """Return the <urlset> sitemap page for the given client route slugs."""
+    return _build_document(
+        "urlset", "url", [create_client_url(slug) for slug in slugs]
+    )
+
+
+def get_track_slugs(session: Session, limit: int, offset: int):
+    """Return "<handle_lc>/<slug>" for one page of tracks ordered by track id."""
+    rows: List[Tuple[str, str]] = (
+        session.query(User.handle_lc, TrackRoute.slug)
+        .join(Track, TrackRoute.track_id == Track.track_id)
+        .join(User, TrackRoute.owner_id == User.user_id)
+        .filter(
+            Track.is_current == True, Track.stem_of == None, User.is_current == True
+        )
+        .order_by(asc(Track.track_id))
+        .limit(limit)
+        .offset(offset)
+        .all()
+    )
+
+    return [f"{handle}/{slug}" for handle, slug in rows]
+
+
+def get_playlist_slugs(session: Session, limit: int, offset: int):
+    """Return "<handle_lc>/<album|playlist>/<name>-<id>" for one page of playlists."""
+    playlists: List[Tuple[str, str, int, bool]] = (
+        session.query(
+            User.handle_lc,
+            Playlist.playlist_name,
+            Playlist.playlist_id,
+            Playlist.is_album,
+        )
+        .join(User, User.user_id == Playlist.playlist_owner_id)
+        .filter(
+            Playlist.is_current == True,
+            Playlist.is_private == False,
+            # Same rows get_max_playlist_count counts, so pages match the index
+            Playlist.is_delete == False,
+            # Join only the current user row, as get_track_slugs does
+            User.is_current == True,
+        )
+        .order_by(asc(Playlist.playlist_id))
+        .limit(limit)
+        .offset(offset)
+        .all()
+    )
+
+    return [
+        f"{handle}/{'album' if is_album else 'playlist'}/{name}-{playlist_id}"
+        for handle, name, playlist_id, is_album in playlists
+    ]
+
+
+def get_user_slugs(session: Session, limit: int, offset: int):
+    """Return the handle_lc of one page of users ordered by user id."""
+    rows = (
+        session.query(User.handle_lc)
+        .filter(
+            User.is_current == True,
+            User.is_deactivated == False,
+            User.handle_lc != None,
+        )
+        .order_by(asc(User.user_id))
+        .limit(limit)
+        .offset(offset)
+        .all()
+    )
+
+    return [row[0] for row in rows]
+
+
+def get_track_root(session: Session, limit: int = LIMIT):
+    """Return the sitemap index of track pages holding limit tracks each."""
+    return get_dynamic_root(get_max_track_count(session), "track", limit)
+
+
+def get_playlist_root(session: Session, limit: int = LIMIT):
+    """Return the sitemap index of playlist pages holding limit playlists each."""
+    return get_dynamic_root(get_max_playlist_count(session), "playlist", limit)
+
+
+def get_user_root(session: Session, limit: int = LIMIT):
+    """Return the sitemap index of user pages holding limit users each."""
+    return get_dynamic_root(get_max_user_count(session), "user", limit)
+
+
+def get_track_page(session: Session, page: int, limit: int = LIMIT):
+    """Return sitemap page number page (1-based) of track URLs."""
+    offset = (page - 1) * limit
+    return get_entity_page(get_track_slugs(session, limit, offset))
+
+
+def get_playlist_page(session: Session, page: int, limit: int = LIMIT):
+    """Return sitemap page number page (1-based) of playlist URLs."""
+    offset = (page - 1) * limit
+    return get_entity_page(get_playlist_slugs(session, limit, offset))
+
+
+def get_user_page(session: Session, page: int, limit: int = LIMIT):
+    """Return sitemap page number page (1-based) of user URLs."""
+    offset = (page - 1) * limit
+    return get_entity_page(get_user_slugs(session, limit, offset))
diff --git a/discovery-provider/src/queries/queries.py b/discovery-provider/src/queries/queries.py
index 14407b46641..78ff68024a8 100644
--- a/discovery-provider/src/queries/queries.py
+++ b/discovery-provider/src/queries/queries.py
@@ -1,6 +1,7 @@
 import logging  # pylint: disable=C0302
+import re
 
-from flask import Blueprint, request
+from flask import Blueprint, Response, request
 from src import api_helpers, exceptions
 from src.queries.get_cid_source import get_cid_source
 from src.queries.get_feed import get_feed
@@ -24,6 +25,15 @@
 from src.queries.get_savers_for_playlist import get_savers_for_playlist
 from src.queries.get_savers_for_track import get_savers_for_track
 from src.queries.get_saves import get_saves
+from src.queries.get_sitemap import (
+    build_default,
+    get_playlist_page,
+    get_playlist_root,
+    get_track_page,
+    get_track_root,
+    get_user_page,
+    get_user_root,
+)
 from src.queries.get_sol_plays import (
     get_sol_play,
     get_total_aggregate_plays,
@@ -44,6 +54,7 @@
 from src.queries.get_users import get_users
 from src.queries.get_users_account import get_users_account
 from src.queries.query_helpers import get_current_user_id, get_pagination_vars
+from src.utils.db_session import get_db_read_replica
 from src.utils.redis_metrics import record_metrics
 
 logger = logging.getLogger(__name__)
@@ -645,3 +656,71 @@ def get_user_history_route(user_id):
         return api_helpers.success_response(user_history)
     except exceptions.ArgumentError as e:
         return api_helpers.error_response(str(e), 400)
+
+
+@bp.route("/sitemaps/default.xml", methods=("GET",))
+def get_base_sitemap():
+    """Serve the sitemap of static client routes."""
+    try:
+        default_sitemap = build_default()
+        return Response(default_sitemap, mimetype="text/xml")
+    except exceptions.ArgumentError as e:
+        return api_helpers.error_response(str(e), 400)
+
+
+@bp.route("/sitemaps/<type>/index.xml", methods=("GET",))
+def get_type_base_sitemap(type):
+    """Serve the sitemap index for type: playlist, track or user."""
+    try:
+        db = get_db_read_replica()
+        with db.scoped_session() as session:
+            xml = ""
+            if type == "playlist":
+                xml = get_playlist_root(session)
+            elif type == "track":
+                xml = get_track_root(session)
+            elif type == "user":
+                xml = get_user_root(session)
+            else:
+                return api_helpers.error_response(
+                    f"Invalid sitemap type {type}, should be one of playlist, track, user",
+                    400,
+                )
+            return Response(xml, mimetype="text/xml")
+    except exceptions.ArgumentError as e:
+        return api_helpers.error_response(str(e), 400)
+
+
+@bp.route("/sitemaps/<type>/<file_name>", methods=("GET",))
+def get_type_sitemap_page(type: str, file_name: str):
+    """Serve page N of the playlist, track or user sitemap for file_name "N.xml"."""
+    try:
+        # fullmatch rejects names like "abc12.xml" that a "$"-only search accepted
+        number = re.fullmatch(r"(\d+)\.xml", file_name)
+        if not number:
+            return api_helpers.error_response(
+                f"Invalid filepath {file_name}, should be of format <int>.xml", 400
+            )
+        page_number = int(number.group(1))
+        if page_number < 1:
+            # Pages are 1-based; page 0 would become a negative query offset
+            return api_helpers.error_response(
+                f"Invalid page number {page_number}, should be 1 or greater", 400
+            )
+        db = get_db_read_replica()
+        with db.scoped_session() as session:
+            xml = ""
+            if type == "playlist":
+                xml = get_playlist_page(session, page_number)
+            elif type == "track":
+                xml = get_track_page(session, page_number)
+            elif type == "user":
+                xml = get_user_page(session, page_number)
+            else:
+                return api_helpers.error_response(
+                    f"Invalid sitemap type {type}, should be one of playlist, track, user",
+                    400,
+                )
+            return Response(xml, mimetype="text/xml")
+    except exceptions.ArgumentError as e:
+        return api_helpers.error_response(str(e), 400)
diff --git a/discovery-provider/src/utils/get_all_other_nodes.py b/discovery-provider/src/utils/get_all_other_nodes.py
index d67385379fb..2ac34841d7b 100644
--- a/discovery-provider/src/utils/get_all_other_nodes.py
+++ b/discovery-provider/src/utils/get_all_other_nodes.py
@@ -61,7 +61,7 @@ def get_node_endpoint() -> Optional[str]:
             try:
                 node_info = future.result()
                 wallet = node_info[3]
-                if wallet == shared_config["delegate"]["owner_wallet"]:
+                if wallet.lower() == shared_config["delegate"]["owner_wallet"].lower():
                     endpoint = node_info[1]
                     break
             except Exception as e: