In [1]:
#Loading Tracks and Playlists

from py2neo import Graph, Node, Relationship
import numpy as np
from collections import defaultdict

uri = "..."
username = "..."
password = "..."
graph = Graph(uri, auth=(username, password))

# Retrieve track nodes and their properties
track_query = '''
MATCH (t:Track)
RETURN t.acousticness, t.album_id, t.artist_ids, t.danceability, t.duration_ms, t.energy, t.explicit,
       t.genre, t.id, t.key, t.liveness, t.loudness, t.mode, t.popularity, t.speechiness, t.tempo, t.valence
'''

track_result = graph.run(track_query).data()

# Create dict for tracks
track_dict = {row['t.id']: {
    'index': idx,
    'acousticness': row['t.acousticness'],
    'album_id': row['t.album_id'],
    'artist_ids': row['t.artist_ids'],
    'danceability': row['t.danceability'],
    'duration_ms': row['t.duration_ms'],
    'energy': row['t.energy'],
    'explicit': row['t.explicit'],
    'genre': row['t.genre'],
    'liveness': row['t.liveness'],
    'loudness': row['t.loudness'],
    'popularity': row['t.popularity'],
    'speechiness': row['t.speechiness'],
    'tempo': row['t.tempo'],
    'valence': row['t.valence'],
} for idx, row in enumerate(track_result)}


# Retrieve playlist nodes and their properties
playlist_query = '''
MATCH (p:Playlist)-[:CONTAINS]->(t:Track)
WITH p.playlist_id AS playlist_id, collect(t.id) AS tracklist
RETURN playlist_id, tracklist
'''

playlist_result = graph.run(playlist_query).data()

# Create playlists dictionary
playlist_dict = {row['playlist_id']: {
    'index': idx,
    'track_ids': row['tracklist']
} for idx, row in enumerate(playlist_result)}

# Normalize Continuous Features

from sklearn.preprocessing import MinMaxScaler

continuous_features = ["acousticness", "danceability", "energy", "liveness", "loudness", "popularity", "speechiness", "tempo", "valence"]
scalers = {}

for feature in continuous_features:
    scaler = MinMaxScaler()
    feature_values = np.array([track[feature] for track in track_dict.values()]).reshape(-1, 1)
    scaler.fit(feature_values)
    scalers[feature] = scaler

    for track in track_dict.values():
        track[feature] = scaler.transform([[track[feature]]])[0, 0]

In [2]:
# Grabbing Relationship Data

n_playlists = len(playlist_dict)
n_tracks = len(track_dict)

# CONTAINS relationship (playlist-track)
contains_query = '''
MATCH (p:Playlist)-[:CONTAINS]->(t:Track)
RETURN p.playlist_id AS playlist_id, t.id AS track_id
'''
contains_result = graph.run(contains_query).data()

# Create CONTAINS list (Should be playlist_id and track_id pairs)
contains_list = [(record['playlist_id'], record['track_id']) for record in contains_result]

# SHARED_ALBUM, SHARED_ARTIST, and SHARED_GENRE lists (tuple array should be track1_id and track2_id)
album_query = '''
    MATCH (t1:Track)-[:SHARED_ALBUM]->(t2:Track)
    RETURN t1.id AS track1_id, t2.id AS track2_id
    '''
album_result = graph.run(album_query).data()
album_list = [(record['track1_id'], record['track2_id']) for record in album_result]

artist_query = '''
    MATCH (t1:Track)-[:SHARED_ARTIST]->(t2:Track)
    RETURN t1.id AS track1_id, t2.id AS track2_id
    '''
artist_result = graph.run(album_query).data()
artist_list = [(record['track1_id'], record['track2_id']) for record in artist_result]

genre_query = '''
    MATCH (t1:Track)-[:SHARED_GENRE]->(t2:Track)
    RETURN t1.id AS track1_id, t2.id AS track2_id
    '''
genre_result = graph.run(album_query).data()
genre_list = [(record['track1_id'], record['track2_id']) for record in genre_result]


# COSINE_SIMILARITY relationship (track-track)
# tuple array will be (track1_id, track2_id, similarity_value)
cosine_similarity_query = '''
MATCH (t1:Track)-[r:COSINE_SIMILARITY]->(t2:Track)
RETURN t1.id AS track1_id, t2.id AS track2_id, r.value AS similarity_value
'''
cosine_similarity_result = graph.run(cosine_similarity_query).data()
cosine_list = [(record['track1_id'], record['track2_id'], record['similarity_value']) for record in cosine_similarity_result]

In [3]:
track_similarity_matrix = np.zeros((n_tracks, n_tracks))

for track1_id, track2_id, similarity_value in cosine_list:
    track1_idx = track_dict[track1_id]['index']
    track2_idx = track_dict[track2_id]['index']
    track_similarity_matrix[track1_idx][track2_idx] = similarity_value*0.8
    track_similarity_matrix[track2_idx][track1_idx] = similarity_value*0.8
    
for track1_id, track2_id in album_list:
    track1_idx = track_dict[track1_id]['index']
    track2_idx = track_dict[track2_id]['index']
    track_similarity_matrix[track1_idx][track2_idx] += 0.50
    track_similarity_matrix[track2_idx][track1_idx] += 0.50
    
for track1_id, track2_id in artist_list:
    track1_idx = track_dict[track1_id]['index']
    track2_idx = track_dict[track2_id]['index']
    track_similarity_matrix[track1_idx][track2_idx] += 0.30
    track_similarity_matrix[track2_idx][track1_idx] += 0.30
    
for track1_id, track2_id in genre_list:
    track1_idx = track_dict[track1_id]['index']
    track2_idx = track_dict[track2_id]['index']
    track_similarity_matrix[track1_idx][track2_idx] += 0.75
    track_similarity_matrix[track2_idx][track1_idx] += 0.75

In [70]:
import heapq
from collections import defaultdict
from sklearn.model_selection import train_test_split
import random

def playlist_vector(playlist_id):
    track_ids = playlist_dict[playlist_id]['track_ids']
    return np.mean([track_similarity_matrix[track_dict[track_id]['index']] for track_id in track_ids], axis=0)

def find_similar_playlists(playlist_id, k=10):
    track_ids = playlist_dict[playlist_id]['track_ids']
    random.shuffle(track_ids)
    track_ids_train, track_ids_test = train_test_split(track_ids,test_size=0.5)
    train_playlist_vector = np.mean([track_similarity_matrix[track_dict[track_id]['index']] for track_id in track_ids_train], axis=0)
    test_playlist_vector = np.mean([track_similarity_matrix[track_dict[track_id]['index']] for track_id in track_ids_test], axis=0)
    
    similarities = []
    
    train_test_similarity = np.dot(train_playlist_vector, test_playlist_vector)
    similarities.append(("OTHER_HALF", train_test_similarity))
                        
    for other_playlist_id, other_playlist_data in playlist_dict.items():
        if other_playlist_id == playlist_id: # EDGE CASE THEY'RE THE SAME
            continue

        other_playlist_vector = playlist_vector(other_playlist_id)
        similarity = np.dot(train_playlist_vector, other_playlist_vector)
        similarities.append((other_playlist_id, similarity))

    similar_playlists = heapq.nlargest(k, similarities, key=lambda x: x[1])
    return similar_playlists, track_ids_train, track_ids_test

def cf_recommendation(playlist_id, k=10):
    similar_playlists, track_ids_MAIN, track_ids_OTHER_HALF = find_similar_playlists(playlist_id, k)
    track_scores = defaultdict(float)

    for playlist_id, similarity in similar_playlists:
        if playlist_id == "OTHER_HALF":
            for track_id in track_ids_OTHER_HALF:
                track_scores[track_id] += similarity
        else:
            for track_id in playlist_dict[playlist_id]['track_ids']:
                track_scores[track_id] += similarity

    sorted_scores = heapq.nlargest(k, track_scores.items(), key=lambda x: x[1])
    recommended_track_ids = [[track_id, score] for track_id, score in sorted_scores if track_id not in playlist_dict[playlist_id]['track_ids']]
    return recommended_track_ids, track_ids_MAIN, track_ids_OTHER_HALF

def cbf_recommendation(track_ids, track_similarity_matrix, k=10):
    #track_ids = playlist_dict[playlist_id]['track_ids']
    playlist_track_matrix = track_similarity_matrix[[track_dict[track_id]['index'] for track_id in track_ids]]
    mean_similarity = np.mean(playlist_track_matrix, axis=0)
    top_indices = heapq.nlargest(k+len(track_ids), range(len(mean_similarity)), key=lambda i: mean_similarity[i])
    recommended_indices = [i for i in top_indices if i not in [track_dict[track_id]['index'] for track_id in track_ids]][:k]
    recommended_tracks = [(track_result[i]['t.id'], mean_similarity[i]) for i in recommended_indices]
    return recommended_tracks

def hybrid_recommendation(playlist_id, track_similarity_matrix, k=10, cf_weight=0.5):
    cf_recommendations, track_ids_MAIN, track_ids_OTHER_HALF = cf_recommendation(playlist_id, k)
    cbf_recommendations = cbf_recommendation(track_ids_MAIN, track_similarity_matrix, k)

    track_scores = defaultdict(float)
    for track_id, score in cf_recommendations:
        track_scores[track_id] += cf_weight * score
    for track_id, score in cbf_recommendations:
        track_scores[track_id] += (1 - cf_weight) * score

    sorted_scores = heapq.nlargest(k, track_scores.items(), key=lambda x: x[1])
    recommended_track_ids = [track_id for track_id, score in sorted_scores]
    return recommended_track_ids, track_ids_OTHER_HALF

def test(sample):
    percisionKs = []
    recallKs = []
    for i in range(0, sample):
        target_playlist_id = random.choice(list(playlist_dict.items()))[0]
        recommended_tracks, relavant_tracks = hybrid_recommendation(target_playlist_id, track_similarity_matrix, k=50, cf_weight=0.3)
        count = 0
        for recommendation in recommended_tracks:
            if recommendation in relavant_tracks:
                count += 1
        percisionK = count / len(recommended_tracks)
        recallK = count / len(relavant_tracks)
        percisionKs.append(percisionK)
        recallKs.append(recallK)
    avg_percisionK = sum(percisionKs) / len(percisionKs)
    avg_recallK = sum(recallKs) / len(recallKs)
    return avg_percisionK, avg_recallK, percisionKs, recallKs

avg_percisionK, avg_recallK, percisionKs, recallKs = test(100) # <= The number of playlists to sample
print("Percision:", str(avg_percisionK))
print("Recall:", str(avg_recallK))

Percision: 0.18179999999999993
Recall: 0.1819999999999999


In [71]:
# precision@k
precision50 = 0.1911999999999999
precision50 = 0.22959999999999994
precision50 = 0.2483999999999999
precision100 = 0.14579999999999996
precision100 = 0.13529999999999995
precision100 = 0.1595999999999999
precision200 = 0.11895000000000001
precision200 = 0.12045000000000003
precision200 = 0.10065000000000002

# recall@k
recall50 = 0.18755555555555553
recall50 = 0.20679999999999996
recall50 = 0.26199999999999996
recall100 = 0.3552818713450292
recall100 = 0.3183999999999999
recall100 = 0.24699999999999991
recall200 = 0.4711999999999999
recall200 = 0.41139999999999977
recall200 = 0.4071263157894736