## Connect to MongoDB, get tracks

In [7]:
from pymongo import MongoClient

# REPLACE CREDENTIALS
client = MongoClient("...")

# Get the database and tracks
db = client["tune-playlists"]
tracks_collection = db["Neo4J Staging Tracks"]

tracks = tracks_collection.find()

## Connect to Neo4j

In [7]:
from py2neo import Graph, Node, Relationship

# REPLACE CREDENTIALS
uri = "..."
username = "..."
password = "..."

graph = Graph(uri, auth=(username, password))

## Create node for each track

In [8]:
def upsert_track_nodes(tracks):
    for track in tracks:
        graph.run("""
            MERGE (t:Track {id: $id})
            ON CREATE SET
                t.genre = $genre,
                t.artist_ids = $artist_ids,
                t.album_id = $album_id,
                t.duration_ms = $duration_ms,
                t.explicit = $explicit,
                t.popularity = $popularity,
                t.danceability = $danceability,
                t.energy = $energy,
                t.key = $key,
                t.loudness = $loudness,
                t.mode = $mode,
                t.speechiness = $speechiness,
                t.acousticness = $acousticness,
                t.liveness = $liveness,
                t.valence = $valence,
                t.tempo = $tempo
            ON MATCH SET
                t.genre = $genre,
                t.artist_ids = $artist_ids,
                t.album_id = $album_id,
                t.duration_ms = $duration_ms,
                t.explicit = $explicit,
                t.popularity = $popularity,
                t.danceability = $danceability,
                t.energy = $energy,
                t.key = $key,
                t.loudness = $loudness,
                t.mode = $mode,
                t.speechiness = $speechiness,
                t.acousticness = $acousticness,
                t.liveness = $liveness,
                t.valence = $valence,
                t.tempo = $tempo
        """, id=track['id'],
            genre=track['genre'],
            artist_ids=track['artist_ids'],
            album_id=track['album_id'],
            duration_ms=track['duration_ms'],
            explicit=track['explicit'],
            popularity=track['popularity'],
            danceability=track['audio_features']['danceability'],
            energy=track['audio_features']['energy'],
            key=track['audio_features']['key'],
            loudness=track['audio_features']['loudness'],
            mode=track['audio_features']['mode'],
            speechiness=track['audio_features']['speechiness'],
            acousticness=track['audio_features']['acousticness'],
            liveness=track['audio_features']['liveness'],
            valence=track['audio_features']['valence'],
            tempo=track['audio_features']['tempo']
        )

upsert_track_nodes(tracks)

KeyError: 'genre'

## Connect to MongoDB, get playlists

In [6]:
playlists_collection = db["Neo4J Staging Playlists"]
playlists = playlists_collection.find()

## Create node for each playlist

In [8]:
def upsert_playlist_nodes(playlists):
    for playlist in playlists:
        graph.run("""
            MERGE (p:Playlist {playlist_id: $playlist_id})
            ON CREATE SET p.tracklist = $tracklist
            ON MATCH SET p.tracklist = $tracklist
            RETURN p
        """, tracklist = playlist["tracklist"],
            playlist_id = str(playlist['_id'])
        )       

upsert_playlist_nodes(playlists)

# Relationships

In [2]:
artist_query = """
MATCH (t1:Track), (t2:Track)
WHERE t1.id <> t2.id AND ANY(artist_id IN t1.artist_ids WHERE artist_id IN t2.artist_ids)
MERGE (t1)-[:SHARED_ARTIST]->(t2)
"""

album_query = """
MATCH (t1:Track), (t2:Track)
WHERE t1.id <> t2.id AND t1.album_id = t2.album_id
MERGE (t1)-[:SHARED_ALBUM]->(t2)
"""

genre_query = """
CALL apoc.periodic.iterate(
  "MATCH (t1:Track) RETURN t1",
  "MATCH (t2:Track) WHERE t1.id <> t2.id AND ANY(genre IN t1.genre WHERE genre IN t2.genre)
   MERGE (t1)-[:SHARED_GENRE]->(t2)",
  {batchSize: 5000, parallel: false}
)
"""

contains_query = """
MATCH (p:Playlist), (t:Track)
WHERE t.id IN p.tracklist
MERGE (p)-[:CONTAINS]->(t)
"""

In [None]:
graph.run(artist_query)

In [None]:
graph.run(album_query)

In [None]:
graph.run(genre_query)

In [None]:
graph.run(contains_query)

## Cosine Similarity

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

query = """
MATCH (t:Track)
RETURN t
"""

results = graph.run(query)
audio_attributes = ['acousticness', "danceability", "energy", "liveness", "loudness", "speechiness", "tempo", "valence"]

def get_audio_attributes(track_node):
    attributes = []
    for attribute_name in audio_attributes:
        attribute_value = track_node.get(attribute_name)
        attributes.append(attribute_value)
    return attributes
  
tracks = [record.get('t') for record in results]

# Normalize the audio attributes
scaler = StandardScaler()
X = np.array([get_audio_attributes(track) for track in tracks])
tracks_normalized = scaler.fit_transform(X)

for i in range(len(tracks)):
    for j in range(i+1, len(tracks)):
        if tracks[i] != tracks[j]: # and similarities[i][1] != tracks[i]
            cos_sim = cosine_similarity(np.array([tracks_normalized[i]]), np.array([tracks_normalized[j]]))[0][0]
            if(cos_sim > 0.975):
                graph.run(
                    """
                    MATCH (t1:Track {id: $track_id_1}), (t2:Track {id: $track_id_2})
                    WHERE NOT (t1)-[:COSINE_SIMILARITY]-(t2)
                    CREATE (t1)-[sim:COSINE_SIMILARITY {value: $cosine_similarity}]->(t2)
                    """,
                    track_id_1=tracks[i].get('id'),
                    track_id_2=tracks[j].get('id'),
                    cosine_similarity=float(cos_sim.item())
                )