In [1]:
from py2neo import Graph
import pandas as pd
import sys
import numpy as np
import boto3
import psycopg2
import db_utils
import tqdm
import matplotlib

Strategy based on such article: https://medium.com/@crimy/one-shot-learning-siamese-networks-and-triplet-loss-with-keras-2885ed022352

# 1. Extract data of loaded images on S3

In [2]:
def query_results_to_df(query_results):
    if len(query_results) == 0:
        return False
    cols = ["instance_id","stat","track_id","win","ini","fin","rows","cols","date"]
    return pd.DataFrame(query_results, columns=cols)

In [3]:
query_status = """ 
SELECT * FROM status_specto
"""
ENDPOINT="tracksurl.czjs6btlvfgd.eu-west-2.rds.amazonaws.com"
PORT="5432"
USR="david"
REGION="eu-west-2"
DBNAME="postgres"
PSSWD=["qrks","jfut","iv","uf","1"]

conn = psycopg2.connect(host=ENDPOINT, port=PORT, database=DBNAME, user=USR, password=''.join(PSSWD))
cur = conn.cursor()
cur.execute(query_status)
query_results = cur.fetchall()
df_status = query_results_to_df(query_results)
conn.close()

## 1.1 Filter

### 1.1.1 Images that are not well-shaped

In [4]:
df = df_status[df_status["cols"].isin({937, 938})]

In [5]:
df.head()

Unnamed: 0,instance_id,stat,track_id,win,ini,fin,rows,cols,date
0,i-046cdfe3763e0b089,1,000xQL6tZNLJzIrtIgxqSl,0,0,30,256,937,2020-07-05 21:47:21
1,i-046cdfe3763e0b089,1,000xQL6tZNLJzIrtIgxqSl,1,20,50,256,937,2020-07-05 21:47:21
2,i-046cdfe3763e0b089,1,000xQL6tZNLJzIrtIgxqSl,3,60,90,256,937,2020-07-05 21:47:21
3,i-046cdfe3763e0b089,1,000xQL6tZNLJzIrtIgxqSl,2,40,70,256,937,2020-07-05 21:47:21
4,i-046cdfe3763e0b089,1,000xQL6tZNLJzIrtIgxqSl,4,80,110,256,937,2020-07-05 21:47:21


In [95]:
df.shape

NameError: name 'df' is not defined

In [7]:
set_tracks = set(df.track_id)

In [8]:
print(len(set_tracks))

56222


# 2. Dataset Creation (train/test) and Label (based on Neo4j Graph)

In [9]:
import db_utils

In [10]:
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [11]:
# Neo4j
graph = Graph(bolt = True, host = "localhost", name = "Spotify", user = "neo4j", password = "qrks")

## 2.1 Path 1) Songs from top genres

This is like a PoC, we will:
- take the most popular genres, top 5 genres, drop the undefined genre
- look for the artists for each genre
- get all the songs for that artist
- check if they are in our database
- do triplets of songs: select one anchor ( song A1 from artist A), then select as the postive pair of the triplet another song from that artist (A2). Then choose one song from a related artist of A (i.e artist B). Take as the negative pair of the triplet the B1 song for example. Hence the triplet will be formet by A1-A2-B1. This poses a much more difficult task since related artists will tend to share features in their songs so that will be more challenging rather than taking as the negative pair any random song.

### a) SELECT top genres

In [12]:
query = """
SELECT * FROM (
select genre, count(distinct artist_id) as num_art from master_genre GROUP BY genre) x1
WHERE num_art > 100
ORDER BY num_art DESC;
"""

In [13]:
df_genres = db_utils.select_table(cursor, direct_query=query,
                                 sel_cols=["track_id", "num_art"])
df_genres.drop_duplicates(inplace=True)
conn.close()

In [14]:
# Remove the undefined genre
df_genres = df_genres[df_genres["track_id"] != "undefined"]

# Sort genres by the number of artists
df_genres.sort_values("num_art", ascending = False)

# Take the top 5
top_genres = df_genres.head(5)

In [15]:
top_genres

Unnamed: 0,track_id,num_art
1,pop,479
2,dance pop,393
3,latin,264
4,rap,239
5,tropical house,233


In [25]:
cypher_query_triplets = """


MATCH (g1:Genre)-[:GEN_ART]->(a1:Artist)-[:REL_ART]->(a2:Artist)<-[:GEN_ART]-(g2:Genre)
WHERE g1.genre_id IN ["pop", "dance pop", "latin", "rap", "tropical house"]
    AND g2.genre_id IN ["pop", "dance pop", "latin", "rap", "tropical house"]
    WHERE g1.genre_id <> g2.genre_id
WITH a1, a2, g1, g2
MATCH  (t1:Track)<-[:ART_TR]-(a1:Artist),
       (t2:Track)<-[:ART_TR]-(a1:Artist),
       (t3:Track)<-[:ART_TR]-(a2:Artist)
WHERE t1.track_id <> t2.track_id AND t1.track_id <> t3.track_id AND t2.track_id <> t3.track_id
RETURN t1.track_id, t2.track_id, a1.artist_id, t3.track_id, a2.artist_id, t1.track_name, t2.track_name,
        a1.artist_name, t3.track_name, a2.artist_name, g1.genre_id, g2.genre_id

"""


In [28]:
cursor_neo4j = graph.run(cypher_query_triplets)

In [26]:
df_triplets = pd.DataFrame.from_records(cursor_neo4j, columns=cursor_neo4j.keys())

AttributeError: 'psycopg2.extensions.cursor' object has no attribute 'keys'

### b) SELECT top artists

In [2]:
query_top_artists = """
MATCH (a1:Artist)-[:ART_TR]->(t1:Track)
WHERE t1.yt_views > 0
RETURN a1.artist_id, a1.artist_name
ORDER BY t1.yt_views DESC
LIMIT 100
"""

In [78]:
query_top_artists = """


CALL {

    CALL {
        MATCH (a1:Artist)-[:ART_TR]->(t1:Track)
        WHERE t1.yt_views > 0
        RETURN a1
        ORDER BY t1.yt_views DESC
        LIMIT 100
        }

    RETURN COLLECT(a1.artist_id) as top_art

}

MATCH  (t1:Track)<-[:ART_TR]-(a2:Artist),
       (t2:Track)<-[:ART_TR]-(a2:Artist),
       (t3:Track)<-[:ART_TR]-(a3:Artist)
WHERE t1.track_id <> t2.track_id AND t1.track_id <> t3.track_id AND t2.track_id <> t3.track_id
     AND a2.artist_id IN top_art AND a3.artist_id IN top_art AND a2.artist_id <> a3.artist_id
RETURN t1.track_id, t2.track_id, a2.artist_id, t3.track_id, a3.artist_id, t1.track_name, t2.track_name,
        a2.artist_name, t3.track_name, a3.artist_name

"""

In [79]:
cursor_neo4j = graph.run(query_top_artists)
df_topart = pd.DataFrame.from_records(cursor_neo4j, columns=cursor_neo4j.keys())

KeyboardInterrupt: 

In [42]:
df_topart["a1.artist_name"].unique()

array(['Daddy Yankee', 'Luis Fonsi', 'Pinkfong', 'Ed Sheeran',
       'Major Lazer', 'Nyla', 'Kranium', 'Charlie Puth', 'Wiz Khalifa',
       'Bruno Mars', 'Mark Ronson', 'Justin Bieber', 'Maroon 5',
       'Katy Perry', 'OneRepublic', 'Taylor Swift', 'MØ', 'Farruko',
       'J Balvin', 'DJ Snake', 'Juicy J', 'Alan Walker', 'Passenger',
       'Beyoncé', 'Willy William', 'Cardi B', 'Adele', 'Maluma',
       'Shakira', 'Halsey', 'The Chainsmokers', 'Selena Gomez',
       'Meghan Trainor', 'Clean Bandit', 'Sean Paul', 'Anne-Marie',
       'Rihanna', 'Calvin Harris', 'Fifth Harmony', 'Ty Dolla $ign',
       'Sia', 'Ludacris', 'Casper Magico', 'Nicky Jam', 'Nio Garcia',
       'Ozuna', 'Bad Bunny', 'Darell', 'Dua Lipa', 'Ellie Goulding',
       'Natti Natasha', 'Twenty One Pilots', 'Demi Lovato', 'Kafu Banton',
       'El Micha', 'Pedro Capó', 'Snow', 'Shawn Mendes', 'GoonRock',
       'Kid Ink', 'Becky G', 'MAGIC!', 'Coldplay', 'Ariana Grande',
       'Nicki Minaj', 'Marshmello', 'Daft Pu

## 2.2 Path 2) Artist sharing

# 3. Manually creation of the dataset

We have seen that pairing songs with a query can be very difficult, hence, we will design a pipeline to pair the triplets, two songs for the same artists with (two paired songs can be the same song - remind that there are different windows for each song)

In [3]:
import db_utils
from collections import defaultdict
import itertools
import random

In [4]:
def query_results_to_df(query_results, cols = ["instance_id","stat","track_id","win","ini","fin","rows","cols","date"]):
    if len(query_results) == 0:
        return False
    return pd.DataFrame(query_results, columns=cols)

In [6]:
# Neo4j
graph = Graph(bolt = True, host = "localhost", name = "Spotify", user = "neo4j", password = "qrks")

## 3.1 List of selected artists

In [8]:
query_top_artists = """
MATCH (a1:Artist)-[:ART_TR]->(t1:Track)
WHERE t1.yt_views > 0
RETURN a1.artist_id, a1.artist_name
ORDER BY t1.yt_views DESC
LIMIT 100
"""
cursor_neo4j = graph.run(query_top_artists)
df_topart = pd.DataFrame.from_records(cursor_neo4j, columns=cursor_neo4j.keys())
top_artists = set(df_topart["a1.artist_id"])

In [9]:
len(top_artists)

68

## 3.2 Creation of dictionaries

### 3.2.1 Tracks for each artist

In [10]:
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [11]:
query_art_tr = """
SELECT * FROM rel_artist_track
"""
cols = ["artist_id", "track_id"]

cursor.execute(query_art_tr)
query_results = cursor.fetchall()
df_all_art_tr = query_results_to_df(query_results, cols = cols)
cursor.close()

# Select only the top artists
df_top_art_tr = df_all_art_tr[df_all_art_tr["artist_id"].isin(top_artists)]

# Get the set of all artists
set_all_art = set(df_all_art_tr["artist_id"])

# Create a dictionary for all the tracks of top artist: a1
a1_tracks = defaultdict(set)
for i, row in df_top_art_tr.iterrows():
    a1_tracks[row.artist_id].add(row.track_id)

# Create a dictionary for all the tracks of all artist: a2
a2_tracks = defaultdict(set)
for i, row in tqdm.tqdm_notebook(df_all_art_tr.iterrows()):
    a2_tracks[row.artist_id].add(row.track_id)

### 3.2.2 Related artists

In [17]:
query_rel_art = """
SELECT DISTINCT query, rel_art FROM rel_artist_artist
"""
cols = ["query", "rel_art"]

In [18]:
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [19]:
cursor.execute(query_rel_art)
query_results = cursor.fetchall()
df_rel_art_art = query_results_to_df(query_results, cols = cols)
cursor.close()

In [20]:
df_rel_art_art.head()

Unnamed: 0,query,rel_art
0,66ko2UINS5X1TRK48kdsyh,5oNWzcU0mYK1zDUxBGHIaG
1,5xd2Tg7Zo8755eCy8Gxkp8,1dJyh390MvfYPuNbhnbSDs
2,4c5YsU2iQX3LvAKdPa0A8P,14bt7qAPOSPw76Fc1WFGM6
3,0gCGZZ1Ibo5QsOnll977PD,1TlOcOaTgKTolKfAUeMHgM
4,3SRes7eoE3xRodLopFKXWb,52wHLsSLdkVMLYo2ZXP8y6


In [21]:
# Create a dictionary to pair each artist with its related one
# Relationships are bidirectional
art_rels = defaultdict(set)
for i, row in tqdm.tqdm_notebook(df_rel_art_art.iterrows()):
    art_rels[row.query].add(row.rel_art)
    art_rels[row.rel_art].add(row.query)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## 3.3 Pairing algorithm

In [22]:
list_top_artist = list(top_artists)

In [34]:
a1 = list_top_artist[1]

In [88]:
def create_triplet(a1):
    
    
    # -------------------------------------------- #
    #     List all tracks for that artist a1
    # -------------------------------------------- #
    tr_a1 = a1_tracks[a1]
    tr_a1_list = list(tr_a1)

    # -------------------------------------------- #
    #        Track pairs for a1
    # -------------------------------------------- #
    
    # Make all possible pairs (admiting track repetition) of tracks (l1)
    l1_pairs = list(itertools.product(tr_a1_list, repeat=2))

    # -------------------------------------------- #
    #        Related artists
    # -------------------------------------------- #
    
    # Select all related artists
    if a1 in art_rels:
        rel_art_a1 = art_rels[a1]
    else:
        # --------------------------- #
        #  If has NOT related artists
        # --------------------------- #
        # Select 20 random artists different than a1
        rel_art_a1 = set()

        while len(rel_art_a1) < 20:
            
            # Get randomly a different artist than a1
            set_diff_art_than_a1 = set_all_art - set(a1)
            art_iter = random.sample(set_diff_art_than_a1, 1)[0]

            # Add it to the rel_art_a1 until we have 10 artists
            rel_art_a1.add(art_iter)

            
            
    # -------------------------------------------- #
    #      Loop over related artists
    # -------------------------------------------- #            
            
    # Get all the songs for each of the rel_art_a1
    rel_art_a1 = list(rel_art_a1)
    tr_a2_list = list()

    # For each related artist (a2) of a1
    for aa2 in rel_art_a1:
        
        # -------------------------------------------- #
        #    Tracks of a2
        # -------------------------------------------- #    

        # If we have tracks for that artist
        if aa2 in a2_tracks:

            # List such tracks
            set_aa2_tracks = a2_tracks[aa2]
            aa2_tracks = list(set_aa2_tracks)

            
             # ----------------------------------------------------------------------- #
            #   Avoid picking a feature song shared for a1 and a2 as the negative pair
            # ----------------------------------------------------------------------- #  
            # Select only songs that are not in the set of songs of a1 (tr_a1)
            sel_tracks_aa2 = set_aa2_tracks - tr_a1

            # Append them to the list
            tr_a2_list.extend(list(sel_tracks_aa2))

    # ----------------------------------------------------------------- #
    #  Coincide pairs of tracks of a1 length with tracks of rel_artists
    # ----------------------------------------------------------------- #    
    
    # Make each pair of l1_pairs to coincide with tracks for the related artists of a1 (tr_a2_list)
    len_l1_pairs = len(l1_pairs)
    len_tr_a2_list = len(tr_a2_list)

    # Create the negative list of tracks for rel_artist of the same length as the pairs of tracks
    negative_list = random.choices(tr_a2_list, k = len_l1_pairs)

    # pair all together
    triplets = zip(l1_pairs, negative_list)


    return triplets

In [89]:
%%time
triplets = create_triplet(list_top_artist[6])

CPU times: user 2.77 ms, sys: 50 µs, total: 2.82 ms
Wall time: 2.83 ms


In [90]:
tr = tuple(triplets)

In [94]:
query = cursor.executemany("""insert into REL_ARTIST_ARTIST VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING; """ , values)
conn.commit()

(('2iCOdoyDmWG3hWkbtH1KB2', '58q2HKrzhC3ozto2nDdN4z'),
 '1AVu7Kc2MRrLfOG1RCEf07')

# DEV

In [35]:
# List all tracks for that artist a1
tr_a1 = a1_tracks[a1]
tr_a1_list = list(tr_a1)

In [36]:
# Make all possible pairs (admiting track repetition) of tracks (l1)
l1_pairs = list(itertools.product(tr_a1_list, repeat=2))

In [37]:
# Select all related artists
if a1 in art_rels:
    rel_art_a1 = art_rels[a1]
else:
    # Select 20 random artists different than a1
    rel_art_a1 = set()
    
    while len(rel_art_a1) < 20:
        # Get randomly a different artist than a1
        set_diff_art_than_a1 = set_all_art - set(a1)
        art_iter = random.sample(set_diff_art_than_a1, 1)[0]
        
        # Add it to the rel_art_a1 until we have 10 artists
        rel_art_a1.add(art_iter)

In [74]:
# Get all the songs for each of the rel_art_a1
rel_art_a1 = list(rel_art_a1)
tr_a2_list = list()

# For each related artist (a2) of a1
for aa2 in rel_art_a1:
    
    # If we have tracks for that artist
    if aa2 in a2_tracks:
        
        # List such tracks
        set_aa2_tracks = a2_tracks[aa2]
        aa2_tracks = list(set_aa2_tracks)
        
        # Select only songs that are not in the set of songs of a1 (tr_a1)
        sel_tracks_aa2 = set_aa2_tracks - tr_a1
        
        # Append them to the list
        tr_a2_list.extend(list(sel_tracks_aa2))

In [75]:
# Make each pair of l1_pairs to coincide with tracks for the related artists of a1 (tr_a2_list)
len_l1_pairs = len(l1_pairs)
len_tr_a2_list = len(tr_a2_list)

# Create the negative list of tracks for rel_artist of the same length as the pairs of tracks
negative_list = random.choices(tr_a2_list, k = len_l1_pairs)

# pair all together
triplets = tuple(zip(l1_pairs, negative_list))

In [76]:
triplets


((('4aWmUDTfIPGksMNLV2rQP2', '4aWmUDTfIPGksMNLV2rQP2'),
  '38I8OemgIPYHdP5uBNY8BO'),
 (('4aWmUDTfIPGksMNLV2rQP2', '1AkTW13ysu0AJrwuM6UY0I'),
  '5YzA563GXTuwQaRq24z1k5'),
 (('4aWmUDTfIPGksMNLV2rQP2', '6nqAD8ch0ZUUFojMFf2xdT'),
  '782JmkLy6uVD8MW0EJrih9'),
 (('4aWmUDTfIPGksMNLV2rQP2', '4YW2Fr25fHwUt6qgvM1iE1'),
  '3iT5FD6ZtGffWLyFgdEO65'),
 (('4aWmUDTfIPGksMNLV2rQP2', '58IL315gMSTD37DOZPJ2hf'),
  '0YbXuCsQwrsh5x0hgnED6z'),
 (('4aWmUDTfIPGksMNLV2rQP2', '27bP0pI331DN68c70THzso'),
  '6041gM9Th1ViQcnzkIxnuv'),
 (('4aWmUDTfIPGksMNLV2rQP2', '4vlbjBSMqycZk6t4HVRpnC'),
  '6C1RD7YQVvt3YQj0CmuTeu'),
 (('4aWmUDTfIPGksMNLV2rQP2', '7lEhOiFtcLXY4P7cuTKuQ8'),
  '0wsNs9SklQsrdepDTYYMom'),
 (('4aWmUDTfIPGksMNLV2rQP2', '6habFhsOp2NvshLv26DqMb'),
  '5GVvn3XvDtcXJpfN0GmtUp'),
 (('4aWmUDTfIPGksMNLV2rQP2', '5Hiu520lWG4YPrMugnNnkP'),
  '67hB1EzLTxvAGDFkbx5AVi'),
 (('4aWmUDTfIPGksMNLV2rQP2', '1c0hsvHLELX6y8qymnpLKL'),
  '6P00nbm7byCND2pgWkNopd'),
 (('4aWmUDTfIPGksMNLV2rQP2', '4okba5wu9mMLXx79DXLKi3'),
  '1MZq8v