In [ ]:
import os
import sys

current_dir = os.getcwd()

# Put the parent of the notebook's working directory on the import path so the
# `playlist_etl` imports that follow can resolve (presumably the package lives there).
# The membership check keeps the cell idempotent: re-running it must not keep
# appending the same entry to sys.path.
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from playlist_etl.config import *
from playlist_etl.extract import *
from playlist_etl.mongo_db_client import *
from playlist_etl.transform_playlist import *
from playlist_etl.utils import *
from playlist_etl.view_count import *

# NOTE(review): presumably loads environment secrets needed by the cells below
# (a later cell's log output mentions an env file path) — confirm against its definition.
set_secrets()

In [None]:
class Aggregate:
    """Find tracks that appear on more than one playlist within the same genre.

    Tracks are identified by ISRC. The pipeline is:
    playlist documents -> candidates grouped by genre and ISRC -> (service, rank)
    sources per ISRC -> only ISRCs with more than one source -> sources ordered
    by service priority.
    """

    def __init__(self, mongo_client):
        """Store the client used to load playlist documents when none are passed in."""
        self.mongo_client = mongo_client

    def aggregate(self, track_playlist=None):
        """Run the whole pipeline and return the ranked matches.

        Args:
            track_playlist: optional playlist document, or iterable of playlist
                documents (see `_group_by_genre` for the accepted shapes). When
                omitted, the documents are read through `self.mongo_client`.

        Returns:
            dict: ``{genre: {isrc: {"isrc": isrc, "sources": [(service, rank), ...]}}}``
            containing only ISRCs that have more than one source in that genre.
        """
        candidates_by_genre = self._group_by_genre(track_playlist)
        aggregate_by_isrc = self._group_by_isrc_within_genre(candidates_by_genre)
        matches = self._get_matches(aggregate_by_isrc)
        return self._rank_matches(matches)

    @staticmethod
    def _parse_playlist_document(document):
        """Return ``(service_name, genre_name, tracks)`` for one playlist document.

        Two shapes are understood:
          * ``{"key": "<service>_<genre>", "value": [track, ...]}``
          * ``{"service_name": ..., "genre_name": ..., "tracks": [track, ...]}``
        """
        if "key" in document:
            # Split on the first underscore only.
            # NOTE(review): assumes service names never contain "_" — confirm.
            service_name, _, genre_name = document["key"].partition("_")
            tracks = document.get("value")
        else:
            service_name = document["service_name"]
            genre_name = document["genre_name"]
            tracks = document.get("tracks")
        return service_name, genre_name, tracks or []

    def _group_by_genre(self, track_playlist=None) -> dict:
        """Collect every (service, rank) candidate per genre and ISRC.

        Each ISRC maps to a *list* of candidates so that the same ISRC coming
        from a second service is kept instead of overwriting the first one.
        Tracks without an ISRC are skipped because they cannot be matched.

        Args:
            track_playlist: a playlist document, an iterable of playlist
                documents, or None to load them via
                ``self.mongo_client.read_data(TRACK_PLAYLIST_COLLECTION)``.

        Eg. output
        {
            "dance": {
                "23490580": [
                    {"rank": 1, "service_name": "SoundCloud"},
                    {"rank": 2, "service_name": "Spotify"}
                ]
            }
        }
        """
        if track_playlist is None:
            # NOTE(review): relies on the client exposing `read_data`, as the
            # inspection cell at the bottom of this notebook uses it — confirm.
            track_playlist = self.mongo_client.read_data(TRACK_PLAYLIST_COLLECTION)
        elif isinstance(track_playlist, dict):
            # A single document was passed; treat it as a one-element collection.
            track_playlist = [track_playlist]

        candidates_by_genre = {}
        for document in track_playlist:
            service_name, genre_name, tracks = self._parse_playlist_document(document)
            candidates = candidates_by_genre.setdefault(genre_name, {})
            for track in tracks:
                isrc = track.get("isrc")
                if not isrc:
                    continue
                candidates.setdefault(isrc, []).append(
                    {"rank": track["rank"], "service_name": service_name}
                )
        return candidates_by_genre

    def _group_by_isrc_within_genre(self, candidate_by_genre):
        """Flatten the candidates of each ISRC into ``(service_name, rank)`` tuples.

        Accepts, per ISRC, either a list of candidate dicts (what
        `_group_by_genre` produces) or a single candidate dict (the older
        one-candidate-per-ISRC shape).

        Eg. output
        {
            "dance": {
                "23490580": [("SoundCloud", 1)],
                "23434580": [("SoundCloud", 1), ("Spotify", 2)]
            },
            "pop": {
                "23490523": [("SoundCloud", 1), ("Spotify", 2)],
                "23490580": [("SoundCloud", 1)]
            }
        }
        """
        aggregate_by_isrc = {}
        for genre, candidates in candidate_by_genre.items():
            sources_by_isrc = aggregate_by_isrc.setdefault(genre, {})
            for isrc, entries in candidates.items():
                if isinstance(entries, dict):
                    entries = [entries]
                sources_by_isrc.setdefault(isrc, []).extend(
                    (entry["service_name"], entry["rank"]) for entry in entries
                )
        return aggregate_by_isrc

    def _get_matches(self, aggregate_by_isrc):
        """Keep only the ISRCs that have more than one source within a genre.

        Eg. input
        {
            "pop": {
                "23490523": [("SoundCloud", 1), ("Spotify", 2)],
                "23490580": [("SoundCloud", 1)]
            }
        }
        Eg. output
        {
            "pop": {
                "23490523": [("SoundCloud", 1), ("Spotify", 2)]
            }
        }
        """
        matches = {}
        for genre, isrcs in aggregate_by_isrc.items():
            matches[genre] = {isrc: sources for isrc, sources in isrcs.items() if len(sources) > 1}
        return matches

    def _rank_matches(self, matches):
        """Order each match's sources by service priority.

        `matches` is updated in place and also returned. Services that are not
        listed in the priority order sort after the listed ones instead of
        raising; the sort is stable, so ties keep their incoming order.

        Eg. input
        {
            "pop": {
                "23490523": [("Spotify", 2), ("SoundCloud", 1)]
            }
        }
        Eg. output
        {
            "pop": {
                "23490523": {
                    "isrc": "23490523",
                    "sources": [("SoundCloud", 1), ("Spotify", 2)]
                }
            }
        }
        """
        rank_priority = ["AppleMusic", "SoundCloud", "Spotify"]
        priority_of = {name: position for position, name in enumerate(rank_priority)}
        unlisted = len(rank_priority)
        for matches_by_genre in matches.values():
            # Snapshot the items because the values are replaced while looping.
            for isrc, sources in list(matches_by_genre.items()):
                ranked_sources = sorted(
                    sources, key=lambda source: priority_of.get(source[0], unlisted)
                )
                matches_by_genre[isrc] = {"isrc": isrc, "sources": ranked_sources}

        return matches

In [20]:
# NOTE(review): `_get_track_data_from_mongo` is not defined on the `Aggregate` class shown in
# this notebook, so on a fresh kernel this call would raise AttributeError. The cell looks
# stale relative to the class definition — confirm which method was intended.
aggregate = Aggregate(MongoDBClient())
a, b = aggregate._get_track_data_from_mongo()

11-19 16:16:35 - INFO - set_secrets() - env_path/Users/eli/github/tunemeld/.env.dev


In [16]:
# NOTE(review): `mongo_client` is not assigned in any cell visible here, so this cell depends
# on leftover kernel state; assign it explicitly in an earlier cell so that
# Restart & Run All works. The recorded output of this cell is a TypeError.
aggregate = Aggregate(mongo_client)
aggregate.aggregate()

TypeError: 'Database' object is not callable

In [10]:
# Inspect the raw playlist documents. In the recorded output each document has a "key"
# such as 'AppleMusic_country' and a "value" list of {'isrc': ..., 'rank': ...} entries.
# NOTE(review): this displays the whole collection; consider slicing the result to keep
# the saved notebook output small.
mongo_client.read_data(TRACK_PLAYLIST_COLLECTION)

[{'_id': ObjectId('673bd1cf7fc0f4ca6df0f081'),
  'key': 'AppleMusic_country',
  'value': [{'isrc': 'US6XF2400213', 'rank': 1},
   {'isrc': 'USUM72408405', 'rank': 2},
   {'isrc': 'USUG12406387', 'rank': 3},
   {'isrc': 'USUM72405599', 'rank': 4},
   {'isrc': 'QMRSZ2400822', 'rank': 5},
   {'isrc': 'USUYG1580113', 'rank': 6},
   {'isrc': 'QM24S2401097', 'rank': 7},
   {'isrc': 'QMDA62488992', 'rank': 8},
   {'isrc': 'USUM72410290', 'rank': 9},
   {'isrc': 'QMCQK1900370', 'rank': 10},
   {'isrc': 'USSM12403681', 'rank': 11},
   {'isrc': 'USRN12400011', 'rank': 12},
   {'isrc': 'QZRD92405271', 'rank': 13},
   {'isrc': 'USAN20000251', 'rank': 14},
   {'isrc': 'USUG12405562', 'rank': 15},
   {'isrc': 'USRN12300107', 'rank': 16},
   {'isrc': 'USAT22402805', 'rank': 17},
   {'isrc': 'USUG12404551', 'rank': 18},
   {'isrc': 'USWB12403709', 'rank': 19},
   {'isrc': 'USWB12402402', 'rank': 20},
   {'isrc': 'USAT22408980', 'rank': 21},
   {'isrc': 'USWB12404018', 'rank': 22},
   {'isrc': 'QMRSZ24