Skip to content

Commit

Permalink
Add Discovery Sitemap Endpoint (#3973)
Browse files Browse the repository at this point in the history
* Add sitemap methods to discovery

* Fix sitemap urls and add tests

* rename route for sitemap to be more consistent

* Fix double set

* Fix integration test
  • Loading branch information
jowlee committed Oct 4, 2022
1 parent 29dafcd commit f873142
Show file tree
Hide file tree
Showing 5 changed files with 419 additions and 2 deletions.
122 changes: 122 additions & 0 deletions discovery-provider/integration_tests/queries/test_get_sitemap.py
@@ -0,0 +1,122 @@
import logging
from unittest import mock

from integration_tests.utils import populate_mock_db
from src.queries.get_sitemap import (
build_default,
get_playlist_page,
get_playlist_root,
get_track_page,
get_track_root,
get_user_page,
get_user_root,
)
from src.utils.db_session import get_db

# Module-level logger named after this test module.
logger = logging.getLogger(__name__)


# Decorators are applied bottom-up, so the `set_base_url` mock is passed first
# and the `get_client_base_url` mock second.
@mock.patch("src.queries.get_sitemap.get_client_base_url")
@mock.patch("src.queries.get_sitemap.set_base_url")
def test_get_sitemaps(mock_set_base_url, mock_get_client_base_url, app):
    """Check the exact XML bytes produced by every sitemap builder.

    Both base-URL helpers of ``src.queries.get_sitemap`` are patched to return
    ``https://discoveryprovider.audius.co``. The mock database is populated
    with 10 playlists, 10 tracks (each with a track route) and 20 users, then
    the default sitemap, the three root sitemaps and the per-entity pages are
    compared against literal expected documents.

    ``app`` is supplied by the test harness (presumably a pytest fixture; it
    is not defined in this file).
    """
    with app.app_context():
        db = get_db()

    mock_set_base_url.return_value = "https://discoveryprovider.audius.co"
    mock_get_client_base_url.return_value = "https://discoveryprovider.audius.co"

    test_entities = {
        "playlists": [
            {
                "playlist_id": i,
                "playlist_owner_id": i,
                "playlist_name": f"p_name_{i}",
                "is_album": i % 2 == 0,
            }
            for i in range(10)
        ],
        "tracks": [{"track_id": i, "owner_id": i} for i in range(10)],
        "track_routes": [
            {
                "track_id": i,
                "owner_id": i,
                "slug": f"slug_{i}",
                "title_slug": f"title_slug_{i}",
            }
            for i in range(10)
        ],
        "users": [{"user_id": i, "handle": f"user_{i}"} for i in range(20)],
    }

    populate_mock_db(db, test_entities)

    with db.scoped_session() as session:
        # The default sitemap lists every static/app route under the (mocked) client base URL
        default_sitemap = build_default()
        assert (
            default_sitemap
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <sitemap>\n <loc>https://discoveryprovider.audius.co/legal/privacy-policy</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/legal/terms-of-use</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/download</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/feed</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/trending</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/playlists</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/underground</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/top-albums</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/remixables</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/feeling-lucky</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/chill</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/upbeat</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/intense</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/provoking</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/explore/intimate</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/signup</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/signin</loc>\n </sitemap>\n</urlset>\n'
        )

        # Validate that there are 4 track sitemaps - 10 total tracks / 3 tracks per sitemap, rounded up = 4
        track_root = get_track_root(session, 3)
        assert (
            track_root
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/track/1.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/track/2.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/track/3.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/track/4.xml</loc>\n </sitemap>\n</urlset>\n'
        )

        # Validate that there are 5 playlist sitemaps - 10 total playlists / 2 playlists per sitemap = 5
        playlist_root = get_playlist_root(session, 2)
        assert (
            playlist_root
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/playlist/1.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/playlist/2.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/playlist/3.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/playlist/4.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/playlist/5.xml</loc>\n </sitemap>\n</urlset>\n'
        )

        # Validate that there are 2 user sitemaps - 20 total users / 12 users per sitemap, rounded up = 2
        user_root = get_user_root(session, 12)
        assert (
            user_root
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/user/1.xml</loc>\n </sitemap>\n <sitemap>\n <loc>https://discoveryprovider.audius.co/sitemaps/user/2.xml</loc>\n </sitemap>\n</urlset>\n'
        )

        # Validate that page 1 with a page size of 6 returns the first 6 track slugs
        track_page_1 = get_track_page(session, 1, 6)
        assert (
            track_page_1
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <url>\n <loc>https://discoveryprovider.audius.co/user_0/slug_0</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_1/slug_1</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_2/slug_2</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_3/slug_3</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_4/slug_4</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_5/slug_5</loc>\n </url>\n</urlset>\n'
        )

        # Validate that page 2 returns the remaining 4 track slugs, starting at track 6
        track_page_2 = get_track_page(session, 2, 6)
        assert (
            track_page_2
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <url>\n <loc>https://discoveryprovider.audius.co/user_6/slug_6</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_7/slug_7</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_8/slug_8</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_9/slug_9</loc>\n </url>\n</urlset>\n'
        )

        # Validate that a page size of 100 returns all playlists (10 in total);
        # even ids are albums, odd ids are playlists
        playlist_page_1 = get_playlist_page(session, 1, 100)
        assert (
            playlist_page_1
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <url>\n <loc>https://discoveryprovider.audius.co/user_0/album/p_name_0-0</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_1/playlist/p_name_1-1</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_2/album/p_name_2-2</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_3/playlist/p_name_3-3</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_4/album/p_name_4-4</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_5/playlist/p_name_5-5</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_6/album/p_name_6-6</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_7/playlist/p_name_7-7</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_8/album/p_name_8-8</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_9/playlist/p_name_9-9</loc>\n </url>\n</urlset>\n'
        )

        # Validate that page 1 starts at user 0 and returns 8 user slugs
        user_page_1 = get_user_page(session, 1, 8)
        assert (
            user_page_1
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <url>\n <loc>https://discoveryprovider.audius.co/user_0</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_1</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_2</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_3</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_4</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_5</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_6</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_7</loc>\n </url>\n</urlset>\n'
        )

        # Validate that page 2 starts at user 8*1=8 and returns 8 user slugs
        user_page_2 = get_user_page(session, 2, 8)
        assert (
            user_page_2
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <url>\n <loc>https://discoveryprovider.audius.co/user_8</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_9</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_10</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_11</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_12</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_13</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_14</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_15</loc>\n </url>\n</urlset>\n'
        )

        # Validate that page 3 starts at user 8*2=16 and returns only 4 user slugs (the remainder with 8 max)
        user_page_3 = get_user_page(session, 3, 8)
        assert (
            user_page_3
            == b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n <url>\n <loc>https://discoveryprovider.audius.co/user_16</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_17</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_18</loc>\n </url>\n <url>\n <loc>https://discoveryprovider.audius.co/user_19</loc>\n </url>\n</urlset>\n'
        )
1 change: 1 addition & 0 deletions discovery-provider/requirements.txt
Expand Up @@ -49,6 +49,7 @@ opentelemetry-instrumentation-logging==0.33b0
opentelemetry-sdk==1.12.0
opentelemetry-semantic-conventions==0.33b0
opentelemetry-util-http==0.33b0
lxml==4.9.1

# Solana support
base58==2.1.0
Expand Down
224 changes: 224 additions & 0 deletions discovery-provider/src/queries/get_sitemap.py
@@ -0,0 +1,224 @@
import logging
from typing import List, Tuple

from lxml import etree
from sqlalchemy import asc, func
from sqlalchemy.orm.session import Session
from src.models.playlists.playlist import Playlist
from src.models.tracks.track import Track
from src.models.tracks.track_route import TrackRoute
from src.models.users.user import User
from src.utils.get_all_other_nodes import get_node_endpoint

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Relative paths of the top-level sitemap documents.
# NOTE(review): nothing in the visible part of this module reads this list;
# presumably it is consumed by the route layer - confirm against callers.
root_site_maps_routes = [
    "defaults.xml",
    "tracks/index.xml",
    "collections/index.xml",
    "users/index.xml",
]


def get_client_base_url():
    """Return the base URL under which client-facing pages are published.

    Every page URL written into a sitemap is rooted at this origin; it is
    exposed as a function so tests can patch it.
    """
    client_origin = "https://audius.co"
    return client_origin


def set_base_url():
    """Return this node's endpoint as reported by ``get_node_endpoint``.

    Despite the ``set_`` prefix nothing is stored: the endpoint is looked up
    again on every call and handed straight back to the caller.
    """
    return get_node_endpoint()


def create_client_url(route):
    """Build an absolute client URL for ``route``.

    ``route`` is appended to the client base URL after a single ``/``, so it
    should be passed without a leading slash.
    """
    return "{}/{}".format(get_client_base_url(), route)


def create_xml_url(route):
    """Build an absolute URL on this node for the sitemap document ``route``.

    ``route`` is appended to the value of ``set_base_url()`` after a single
    ``/``, so it should be passed without a leading slash.
    """
    return "{}/{}".format(set_base_url(), route)


# Fixed client routes listed by `build_default`; each entry is joined onto the
# client base URL, so none of them carries a leading slash.
default_routes = [
    # static
    "legal/privacy-policy",
    "legal/terms-of-use",
    "download",
    # app
    "feed",
    "trending",
    "explore",
    "explore/playlists",
    "explore/underground",
    "explore/top-albums",
    "explore/remixables",
    "explore/feeling-lucky",
    "explore/chill",
    "explore/upbeat",
    "explore/intense",
    "explore/provoking",
    "explore/intimate",
    "signup",
    "signin",
]

# The maximum number of URLs that can be in a single sitemap. Used as the
# default `limit` (page size) by the root and page builders in this module.
LIMIT = 50_000


def build_default():
    """Serialize the sitemap that lists the fixed client routes.

    One entry is emitted per item of ``default_routes``, with its ``<loc>``
    resolved against the client base URL.

    Returns:
        The pretty-printed XML document as ``bytes``.
    """
    urlset = etree.Element(
        "urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    )
    # NOTE(review): the sitemaps.org protocol wraps page entries of a <urlset>
    # in <url>; the <sitemap> wrapper is kept because the integration test
    # asserts on this exact output - confirm whether that is intended.
    for route in default_routes:
        entry = etree.SubElement(urlset, "sitemap")
        etree.SubElement(entry, "loc").text = create_client_url(route)

    return etree.tostring(urlset, pretty_print=True)


def get_max_track_count(session: Session) -> int:
    """Count the tracks eligible for the track sitemaps.

    Only rows that are current and are not stems (``stem_of`` is NULL) are
    counted.

    Args:
        session: Open SQLAlchemy session used to run the query.

    Returns:
        The number of matching track rows.
    """
    # SQLAlchemy builds SQL from these `==` comparisons, so they must not be
    # rewritten with `is` / `is None`.
    count_query = session.query(func.count(Track.track_id)).filter(
        Track.is_current == True, Track.stem_of == None
    )
    (track_count,) = count_query.one()
    return track_count


def get_max_user_count(session: Session) -> int:
    """Count the users eligible for the user sitemaps.

    Only rows that are current and not deactivated are counted.

    Args:
        session: Open SQLAlchemy session used to run the query.

    Returns:
        The number of matching user rows.
    """
    # SQLAlchemy builds SQL from these `==` comparisons, so they must not be
    # rewritten with `is` / `not`.
    count_query = session.query(func.count(User.user_id)).filter(
        User.is_current == True, User.is_deactivated == False
    )
    (user_count,) = count_query.one()
    return user_count


def get_max_playlist_count(session: Session) -> int:
    """Count the playlists eligible for the playlist sitemaps.

    Only rows that are current, public and not deleted are counted.

    Args:
        session: Open SQLAlchemy session used to run the query.

    Returns:
        The number of matching playlist rows.
    """
    # SQLAlchemy builds SQL from these `==` comparisons, so they must not be
    # rewritten with `is` / `not`.
    visible_playlists = (
        Playlist.is_current == True,
        Playlist.is_private == False,
        Playlist.is_delete == False,
    )
    count_query = session.query(func.count(Playlist.playlist_id)).filter(
        *visible_playlists
    )
    (playlist_count,) = count_query.one()
    return playlist_count


def get_dynamic_root(max: int, base_route: str, limit: int = LIMIT):
    """Serialize the root sitemap that links to every page of one entity type.

    The pages are numbered from 1 and addressed on this node as
    ``sitemaps/<base_route>/<page>.xml``.

    Args:
        max: Total number of entities to spread over the pages. (The name
            shadows the builtin inside this function; it is kept because it is
            part of the existing signature.)
        base_route: Path segment identifying the entity type, e.g. ``"track"``.
        limit: Maximum number of entities per page.

    Returns:
        The pretty-printed XML document as ``bytes``. A ``max`` of 0 yields a
        document without entries.

    Raises:
        ZeroDivisionError: If ``limit`` is 0.
    """
    # Ceiling division done purely on integers: a partially filled last page
    # still needs its own sitemap, and no float division is involved.
    full_pages, remainder = divmod(max, limit)
    num_pages = full_pages + 1 if remainder else full_pages

    root = etree.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    # NOTE(review): the sitemaps.org protocol uses a <sitemapindex> root for a
    # list of <sitemap> entries; <urlset> is kept because the integration test
    # asserts on this exact output - confirm whether that is intended.
    for page_number in range(1, num_pages + 1):
        sitemap_el = etree.SubElement(root, "sitemap")
        loc = etree.SubElement(sitemap_el, "loc")
        loc.text = create_xml_url(f"sitemaps/{base_route}/{page_number}.xml")

    return etree.tostring(root, pretty_print=True)


def get_entity_page(slugs: List[str]):
    """Serialize one sitemap page containing a ``<url>`` entry per slug.

    Args:
        slugs: Client-relative paths; each is resolved against the client base
            URL and written as the entry's ``<loc>``.

    Returns:
        The pretty-printed XML document as ``bytes``.
    """
    urlset = etree.Element(
        "urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    )
    for path in slugs:
        entry = etree.SubElement(urlset, "url")
        etree.SubElement(entry, "loc").text = create_client_url(path)
    return etree.tostring(urlset, pretty_print=True)


def get_track_slugs(session: Session, limit: int, offset: int):
    """Return one page of track paths in the form ``<handle_lc>/<slug>``.

    Track routes are joined to their track and to the owning user; only
    current, non-stem tracks with a current user row are included, ordered by
    ascending track id.

    Args:
        session: Open SQLAlchemy session used to run the query.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip before the page starts.

    Returns:
        A list of path strings, one per returned row.
    """
    # SQLAlchemy builds SQL from these `==` comparisons, so they must not be
    # rewritten with `is` / `is None`.
    eligible = (
        Track.is_current == True,
        Track.stem_of == None,
        User.is_current == True,
    )
    query = (
        session.query(User.handle_lc, TrackRoute.slug)
        .join(Track, TrackRoute.track_id == Track.track_id)
        .join(User, TrackRoute.owner_id == User.user_id)
        .filter(*eligible)
        .order_by(asc(Track.track_id))
        .limit(limit)
        .offset(offset)
    )
    rows: List[Tuple[str, str]] = query.all()

    return [f"{handle}/{route_slug}" for handle, route_slug in rows]


def get_playlist_slugs(session: Session, limit: int, offset: int):
    """Return one page of playlist/album paths for the playlist sitemaps.

    Each path has the form ``<handle_lc>/<album|playlist>/<name>-<id>``, where
    the middle segment is ``album`` when the row's ``is_album`` flag is truthy.
    Rows are ordered by ascending playlist id.

    The filter matches ``get_max_playlist_count`` (current, public, not
    deleted) so the number of pages advertised by the root sitemap agrees with
    the rows served here. The joined user row must also be current, as in
    ``get_track_slugs``.

    Args:
        session: Open SQLAlchemy session used to run the query.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip before the page starts.

    Returns:
        A list of path strings, one per returned row.
    """
    # SQLAlchemy builds SQL from these `==` comparisons, so they must not be
    # rewritten with `is` / `not`.
    playlists: List[Tuple[str, str, int, bool]] = (
        session.query(
            User.handle_lc,
            Playlist.playlist_name,
            Playlist.playlist_id,
            Playlist.is_album,
        )
        .join(User, User.user_id == Playlist.playlist_owner_id)
        .filter(
            Playlist.is_current == True,
            Playlist.is_private == False,
            # Deleted playlists are excluded from the count, so exclude them here too.
            Playlist.is_delete == False,
            # Only join against the owner's current row.
            User.is_current == True,
        )
        .order_by(asc(Playlist.playlist_id))
        .limit(limit)
        .offset(offset)
        .all()
    )

    return [
        f"{handle}/{'album' if is_album else 'playlist'}/{name}-{playlist_id}"
        for handle, name, playlist_id, is_album in playlists
    ]


def get_user_slugs(session: Session, limit: int, offset: int):
    """Return one page of user paths, each being the user's ``handle_lc``.

    Only current, non-deactivated users with a non-NULL ``handle_lc`` are
    included, ordered by ascending user id.

    Args:
        session: Open SQLAlchemy session used to run the query.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip before the page starts.

    Returns:
        A list of handle strings, one per returned row.
    """
    # SQLAlchemy builds SQL from these comparisons, so they must not be
    # rewritten with `is` / `is not None`.
    listed_users = (
        User.is_current == True,
        User.is_deactivated == False,
        User.handle_lc != None,
    )
    query = (
        session.query(User.handle_lc)
        .filter(*listed_users)
        .order_by(asc(User.user_id))
        .limit(limit)
        .offset(offset)
    )

    # Each row is a 1-tuple holding the handle.
    return [handle for (handle,) in query.all()]


def get_track_root(session: Session, limit: int = LIMIT):
    """Build the root sitemap linking to every track sitemap page.

    Args:
        session: Open SQLAlchemy session used to count the tracks.
        limit: Number of tracks per page.

    Returns:
        The XML document produced by ``get_dynamic_root`` for route ``track``.
    """
    return get_dynamic_root(get_max_track_count(session), "track", limit)


def get_playlist_root(session: Session, limit: int = LIMIT):
    """Build the root sitemap linking to every playlist sitemap page.

    Args:
        session: Open SQLAlchemy session used to count the playlists.
        limit: Number of playlists per page.

    Returns:
        The XML document produced by ``get_dynamic_root`` for route
        ``playlist``.
    """
    return get_dynamic_root(get_max_playlist_count(session), "playlist", limit)


def get_user_root(session: Session, limit: int = LIMIT):
    """Build the root sitemap linking to every user sitemap page.

    Args:
        session: Open SQLAlchemy session used to count the users.
        limit: Number of users per page.

    Returns:
        The XML document produced by ``get_dynamic_root`` for route ``user``.
    """
    return get_dynamic_root(get_max_user_count(session), "user", limit)


def get_track_page(session: Session, page: int, limit: int = LIMIT):
    """Build the sitemap document for one page of tracks.

    Args:
        session: Open SQLAlchemy session used to fetch the track slugs.
        page: 1-based page number; page 1 starts at the first row.
        limit: Number of tracks per page.

    Returns:
        The XML document produced by ``get_entity_page`` for that page.
    """
    rows_to_skip = limit * (page - 1)
    return get_entity_page(get_track_slugs(session, limit, rows_to_skip))


def get_playlist_page(session: Session, page: int, limit: int = LIMIT):
    """Build the sitemap document for one page of playlists.

    Args:
        session: Open SQLAlchemy session used to fetch the playlist slugs.
        page: 1-based page number; page 1 starts at the first row.
        limit: Number of playlists per page.

    Returns:
        The XML document produced by ``get_entity_page`` for that page.
    """
    rows_to_skip = limit * (page - 1)
    return get_entity_page(get_playlist_slugs(session, limit, rows_to_skip))


def get_user_page(session: Session, page: int, limit: int = LIMIT):
    """Build the sitemap document for one page of users.

    Args:
        session: Open SQLAlchemy session used to fetch the user slugs.
        page: 1-based page number; page 1 starts at the first row.
        limit: Number of users per page.

    Returns:
        The XML document produced by ``get_entity_page`` for that page.
    """
    rows_to_skip = limit * (page - 1)
    return get_entity_page(get_user_slugs(session, limit, rows_to_skip))

0 comments on commit f873142

Please sign in to comment.