In [None]:
import db_utils
from collections import defaultdict
import itertools
import random
from py2neo import Graph
import pandas as pd
import sys
import numpy as np
import boto3
import psycopg2
import db_utils
import tqdm
import matplotlib

In [7]:
# Neo4j
# NOTE(review): the database password is hardcoded in this cell; consider reading it
# from an environment variable or an interactive prompt before sharing the notebook.
graph = Graph(bolt = True, host = "localhost", name = "Spotify", user = "neo4j", password = "qrks")

# 1. Extract data of the spectrograms loaded on S3

In [2]:
def query_results_to_df(query_results):
    """
    Wrap fetched rows in a DataFrame labelled with the nine status columns.

    Returns False (not an empty DataFrame) when `query_results` has no rows;
    otherwise a DataFrame with one row per fetched record.
    """
    column_names = ["instance_id","stat","track_id","win","ini","fin","rows","cols","date"]
    has_rows = len(query_results) != 0
    return pd.DataFrame(query_results, columns=column_names) if has_rows else False

In [3]:
query_status = """ 
SELECT * FROM status_specto
"""
# Connection settings for the remote PostgreSQL instance queried below.
# NOTE(review): the credentials are hardcoded (the password is only split into pieces);
# move them to environment variables or a secrets store before sharing this notebook.
ENDPOINT="tracksurl.czjs6btlvfgd.eu-west-2.rds.amazonaws.com"
PORT="5432"
USR="david"
REGION="eu-west-2"
DBNAME="postgres"
PSSWD=["qrks","jfut","iv","uf","1"]

conn = psycopg2.connect(host=ENDPOINT, port=PORT, database=DBNAME, user=USR, password=''.join(PSSWD))
try:
    cur = conn.cursor()
    cur.execute(query_status)
    query_results = cur.fetchall()
finally:
    # Always release the connection, even when the query or the fetch fails
    conn.close()

# One row per fetched status record (False when the table is empty)
df_status = query_results_to_df(query_results)

## 1.1 Filter

### 1.1.1 Images that are not well-shaped

In [4]:
# Keep only the rows whose "cols" value is 937 or 938
# NOTE(review): presumably the expected spectrogram width in pixels — confirm
df = df_status[df_status["cols"].isin({937, 938})]

In [5]:
df.shape

(644047, 9)

In [6]:
# Unique track ids that have at least one row left after the shape filter
set_tracks = set(df.track_id)

# 2. Manual creation of the dataset

We have seen that pairing songs with a query can be very difficult; hence, we will design a pipeline to build the triplets ourselves: two songs by the same artist (the two paired songs can be the same song - recall that there are different windows for each song) plus one song by a different artist.

## 3.1 List of selected artists

In [8]:
# NOTE: the query returns one row per (artist, track) pair ordered by the track's views,
# so LIMIT 100 caps the number of rows, not the number of artists. The set built below
# can therefore hold fewer than 100 ids when an artist owns several of the top tracks.
query_top_artists = """
MATCH (a1:Artist)-[:ART_TR]->(t1:Track)
WHERE t1.yt_views > 0
RETURN a1.artist_id, a1.artist_name
ORDER BY t1.yt_views DESC
LIMIT 100
"""
cursor_neo4j = graph.run(query_top_artists)
# Column names are taken from the keys of the query result
df_topart = pd.DataFrame.from_records(cursor_neo4j, columns=cursor_neo4j.keys())
# De-duplicate the artist ids
top_artists = set(df_topart["a1.artist_id"])

## 3.2 Creation of dictionaries

### 3.2.1 Tracks for each artist

In [19]:
# Connect to the "spotify" PostgreSQL database (no host is given in the connection string)
# and open the cursor used by the artist/track query below
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

Create dictionaries to store the relationships of tracks per artist:


In [15]:
def query_results_to_df(query_results, cols = ["instance_id","stat","track_id","win","ini","fin","rows","cols","date"]):
    """
    Turn fetched rows into a DataFrame labelled with `cols`.

    query_results: sequence of row tuples (e.g. the output of cursor.fetchall())
    cols: column names, one per field of a row; defaults to the nine status columns

    Returns False when there are no rows, otherwise the DataFrame.
    """
    is_empty = len(query_results) == 0
    if not is_empty:
        return pd.DataFrame(query_results, columns=cols)
    return False

In [20]:
# Fetch every row of the artist/track relation table
query_art_tr = """
SELECT * FROM rel_artist_track
"""
# Labels applied to the fetched rows
# (assumes the table has exactly these two columns, in this order — TODO confirm)
cols = ["artist_id", "track_id"]

cursor.execute(query_art_tr)
query_results = cursor.fetchall()
df_all_art_tr = query_results_to_df(query_results, cols = cols)
# Only the cursor is closed here; `conn` itself stays open
cursor.close()

# IMPOSE that the track id is in set_tracks (songs that we know that have been downloaded and with spectrogram)
df_all_art_tr = df_all_art_tr[df_all_art_tr["track_id"].isin(set_tracks)]

In [22]:
# Select only the top artists
df_top_art_tr = df_all_art_tr[df_all_art_tr["artist_id"].isin(top_artists)]

# Get the set of all artists
set_all_art = set(df_all_art_tr["artist_id"])

# Create a dictionary for all the tracks of top artist: a1
# (iterating over the two columns yields the same values as iterrows()
#  without building a Series object for every row)
a1_tracks = defaultdict(set)
for artist_id, track_id in zip(df_top_art_tr["artist_id"], df_top_art_tr["track_id"]):
    a1_tracks[artist_id].add(track_id)

# Create a dictionary for all the tracks of all artist: a2
# `total` is passed explicitly because a zip object has no length for the progress bar
a2_tracks = defaultdict(set)
all_art_tr_pairs = zip(df_all_art_tr["artist_id"], df_all_art_tr["track_id"])
for artist_id, track_id in tqdm.tqdm_notebook(all_art_tr_pairs, total=len(df_all_art_tr)):
    a2_tracks[artist_id].add(track_id)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### 3.2.2 Related artists

In [23]:
# DISTINCT drops repeated (query, rel_art) rows on the database side
query_rel_art = """
SELECT DISTINCT query, rel_art FROM rel_artist_artist
"""
# Labels for the two selected columns, in SELECT order
cols = ["query", "rel_art"]

In [24]:
# Open a fresh connection and cursor: the cursor used for the artist/track query was closed.
# NOTE(review): the connection opened for that query is never closed; rebinding `conn`
# here drops the only reference to it.
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [25]:
# Run the related-artists query and load all rows into a DataFrame
# (query_results_to_df returns False instead of a DataFrame when no rows come back)
cursor.execute(query_rel_art)
query_results = cursor.fetchall()
df_rel_art_art = query_results_to_df(query_results, cols = cols)
# Only the cursor is closed here; `conn` itself stays open
cursor.close()

In [26]:
df_rel_art_art.head()

Unnamed: 0,query,rel_art
0,66ko2UINS5X1TRK48kdsyh,5oNWzcU0mYK1zDUxBGHIaG
1,5xd2Tg7Zo8755eCy8Gxkp8,1dJyh390MvfYPuNbhnbSDs
2,4c5YsU2iQX3LvAKdPa0A8P,14bt7qAPOSPw76Fc1WFGM6
3,0gCGZZ1Ibo5QsOnll977PD,1TlOcOaTgKTolKfAUeMHgM
4,3SRes7eoE3xRodLopFKXWb,52wHLsSLdkVMLYo2ZXP8y6


In [27]:
# Create a dictionary to pair each artist with its related one
# Relationships are bidirectional
art_rels = defaultdict(set)

# Iterate over the two columns directly (much cheaper than iterrows()).
# The column is accessed as ["query"] because DataFrame.query is a method.
# `total` is passed explicitly because a zip object has no length for the progress bar.
rel_art_pairs = zip(df_rel_art_art["query"], df_rel_art_art["rel_art"])
for query_art, related_art in tqdm.tqdm_notebook(rel_art_pairs, total=len(df_rel_art_art)):
    art_rels[query_art].add(related_art)
    art_rels[related_art].add(query_art)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### 3.2.3 Get the window and sizes for each track

Create a dictionary that stores, for each track, which windows are available together with the starting and ending time of each window. Each entry is a tuple (win, ini, fin), which identifies the jpg image of the spectrogram to be downloaded from the S3 bucket.

In [34]:
# Only keep the windows whose spectrogram has the expected number of columns (the same
# 937/938 filter used to build `set_tracks`), so that a badly shaped image can never be
# selected as one of the windows of a triplet
well_shaped_status = df_status[df_status["cols"].isin({937, 938})]
tracks_windows = well_shaped_status[["track_id", "win", "ini", "fin"]].drop_duplicates()

In [35]:
# track_id -> list of (win, ini, fin) tuples, one per available window of the track
dict_tracks_windows = defaultdict(list)

# Iterate over the four columns directly (much cheaper than iterrows()).
# `total` is passed explicitly because a zip object has no length for the progress bar.
window_rows = zip(tracks_windows["track_id"], tracks_windows["win"],
                  tracks_windows["ini"], tracks_windows["fin"])
for track_id, win, ini, fin in tqdm.tqdm_notebook(window_rows, total=len(tracks_windows)):
    dict_tracks_windows[track_id].append((win, ini, fin))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## 3.3 Pairing algorithm

In [86]:
# Connection and cursor used by the pairing loop to insert rows into `triplets`.
# NOTE(review): the connection opened for the related-artists query is never closed;
# rebinding `conn` here drops the only reference to it.
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [39]:
# Materialise the set as a list so it can be iterated with a position index.
# NOTE: the order comes from set iteration and is therefore arbitrary.
list_top_artist = list(top_artists)

In [122]:
def create_triplet(a1, n_random_artists = 20):
    """
    Build the (anchor, positive, negative) track triplets for the artist `a1`.

    Relies on the notebook-level containers `a1_tracks`, `a2_tracks`, `art_rels`
    and `set_all_art`:
    t1: track from a1 (anchor)
    t2: track from a1 (positive sample; t1 = t2 is allowed since another window can be chosen)
    t3: track from another artist a2 (negative sample), never one of a1's own tracks

    The idea is to learn that the embeddings for t1 vs. t2 should be much closer
    than the embeddings t1 vs. t3.

    Parameters
    ----------
    a1 : artist id whose tracks provide the anchor and the positive sample.
    n_random_artists : number of random artists used as the source of negative
        tracks when `a1` has no related artists (at most all the other artists).

    Returns
    -------
    tuple of ((t1, t2), t3) items, one per ordered pair of a1's tracks.
    An empty tuple is returned when a1 has no tracks or when no negative
    track is available.
    """

    # -------------------------------------------- #
    #     List all tracks for that artist a1
    # -------------------------------------------- #
    tr_a1 = a1_tracks[a1]

    # -------------------------------------------- #
    #        Track pairs for a1
    # -------------------------------------------- #

    # Make all possible pairs (admitting track repetition) of tracks (l1)
    l1_pairs = list(itertools.product(list(tr_a1), repeat=2))
    if not l1_pairs:
        return ()

    # -------------------------------------------- #
    #        Related artists
    # -------------------------------------------- #

    # .get() avoids inserting an empty entry when art_rels is a defaultdict
    rel_art_a1 = art_rels.get(a1)
    if not rel_art_a1:
        # --------------------------- #
        #  If has NOT related artists
        # --------------------------- #
        # Select random artists different than a1.
        # `{a1}` (not `set(a1)`, which would split the id into characters) removes a1 itself;
        # random.sample needs a sequence and draws without repetition, so the selection
        # always terminates even when fewer than n_random_artists artists exist.
        other_artists = list(set_all_art - {a1})
        n_selected = min(n_random_artists, len(other_artists))
        rel_art_a1 = set(random.sample(other_artists, n_selected))

    # -------------------------------------------- #
    #      Loop over related artists
    # -------------------------------------------- #

    # Collect the candidate negative tracks of every related artist (a2) of a1
    tr_a2_list = []
    for aa2 in rel_art_a1:

        # Only artists for which we have tracks
        if aa2 in a2_tracks:

            # Avoid picking a song shared by a1 and a2 (e.g. a featuring) as the negative sample
            tr_a2_list.extend(a2_tracks[aa2] - tr_a1)

    # random.choices raises IndexError on an empty population
    if not tr_a2_list:
        return ()

    # ----------------------------------------------------------------- #
    #  Coincide pairs of tracks of a1 length with tracks of rel_artists
    # ----------------------------------------------------------------- #

    # One negative track (drawn with replacement) for each pair of a1's tracks
    negative_list = random.choices(tr_a2_list, k = len(l1_pairs))

    # pair all together
    return tuple(zip(l1_pairs, negative_list))


def upload_tripl_window_combinations(tr, values = None, artist_id = None, n_samples = 100):

    """
    Expand a track triplet into rows that combine one window per track.

    Parameters
    ----------
    tr : triplet shaped as ((tr1, tr2), tr3), i.e.
        (('6WMYFEd4MJDIjJARHnOxoN', '6WMYFEd4MJDIjJARHnOxoN'), '1TDwnS2MBA4jENbaqdgJWf')
    values : list the rows are appended to; a new list is created when omitted.
    artist_id : artist stored in the first field of every row. When omitted, the
        notebook-level variable `a1` (the artist currently processed) is used.
    n_samples : maximum number of window combinations generated for the triplet.

    Returns
    -------
    `values`, extended with tuples
    (artist, tr1, win1, ini1, fin1, tr2, win2, ini2, fin2, tr3, win3, ini3, fin3).
    Nothing is appended when one of the tracks has no window.
    """

    if values is None:
        values = []

    # Keep working for callers that rely on the global `a1`
    artist = a1 if artist_id is None else artist_id

    tr1 = tr[0][0]
    tr2 = tr[0][1]
    tr3 = tr[1]

    # Windows of each track (.get avoids inserting empty entries for unknown tracks)
    win_tr1 = dict_tracks_windows.get(tr1, [])
    win_tr2 = dict_tracks_windows.get(tr2, [])
    win_tr3 = dict_tracks_windows.get(tr3, [])

    # Pair them (make all possible windows combinations)
    all_comb = itertools.product(win_tr1, win_tr2, win_tr3)
    if tr1 == tr2:
        # Same track as anchor and positive: the two samples must come from different windows
        all_comb = (comb for comb in all_comb if comb[0] != comb[1])
    all_comb = list(all_comb)

    # Select at most n_samples distinct combinations (sampling without repetition)
    n_selected = min(n_samples, len(all_comb))
    windows_triplets_selected = random.sample(all_comb, n_selected)

    for win1, win2, win3 in windows_triplets_selected:

        # Each track is stored with its OWN (win, ini, fin) window
        values.append((artist,
                       tr1, int(win1[0]), int(win1[1]), int(win1[2]),
                       tr2, int(win2[0]), int(win2[1]), int(win2[2]),
                       tr3, int(win3[0]), int(win3[1]), int(win3[2])))

    return values

In [None]:
# Cap on the number of track triplets kept per artist
MAX_TRIPLETS_PER_ARTIST = 500

# tqdm wraps the list itself (not the enumerate object) so the progress bar knows the total.
# NOTE: the loop variable must be named `a1`: upload_tripl_window_combinations reads it
# as a global to fill the artist field of every row.
for iia, a1 in enumerate(tqdm.tqdm_notebook(list_top_artist)):
    
    print(iia, "      ", a1)

    # Create all the possible triplets
    trip_a1 = create_triplet(a1)

    # Sample such triplets to have at most MAX_TRIPLETS_PER_ARTIST triplets per artist
    if len(trip_a1) > MAX_TRIPLETS_PER_ARTIST:
        trip_a1 = random.sample(trip_a1, MAX_TRIPLETS_PER_ARTIST)

    # Create a empty list to store the values to keep uploading to the database
    values_db = [] 

    # make for each triplet at most 100 different windows combinations
    for tr in trip_a1:
        upload_tripl_window_combinations(tr, values_db)
        
    # ----------------------------------------------------------- #
    #    Upload all the windows combinations for that artist
    # ----------------------------------------------------------- #    
    try:
        cursor.executemany("""insert into triplets VALUES 
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING; 
                """ , values_db)
        conn.commit()
    except Exception:
        # Leave the connection usable (not stuck in an aborted transaction) before re-raising
        conn.rollback()
        raise

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0        1TtXnWcUs0FCkaZDPGYHdf
1        1uNFoZAHBGtllmzznpCI3s
2        1i8SpTcr7yvPOmcqrbnVXY
3        5RLb16s3zfrdWdRF0l7xij
4        4RSyJzf7ef6Iu2rnLdabNq
5        64KEffDW9EtZ1y2vBYgq8T
6        738wLrAtLtCtFOLvQBXOXp
7        4obzFoKoKRHIphyHzJ35G3
8        06HL4z0CvFAxyc27GXpf02
9        5WUlDfRSoLAfcVSX1WnrxN


# Cypher query to check results

For:

```python
art1 = '6S2OmqARrzebs0tKUEyXyp'
tr1 = '69W9wnBPbt38vDYNVSLRQW'
tr2 = '25n4NECujqOxQHdXVIVc9I'
tr3 = '3CidPLqWXqgesJlwYVJZdy'
```

Run:

```cypher
MATCH (t3:Track)<-[:ART_TR]-(a2:Artist)-[:REL_ART]-(a1:Artist)-[:ART_TR]->(t1:Track)
WHERE a1.artist_id = '6S2OmqARrzebs0tKUEyXyp' AND
      t1.track_id IN ['69W9wnBPbt38vDYNVSLRQW', '25n4NECujqOxQHdXVIVc9I'] AND
      t3.track_id = '3CidPLqWXqgesJlwYVJZdy'
RETURN a1, t1, t3, a2

```