In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import euclidean
from collections import Counter
from pathlib import Path

In [2]:
def get_most_similar_tracks(df_track, df_users, user_id, top_n=10,
                            audio_weight=0.6, genre_weight=0.3, artist_weight=0.1):
    """Rank tracks by how closely they match one user's listening profile.

    The score of a track is a weighted sum of three distances (lower = more
    similar):

    * the Euclidean distance between the track's min-max scaled audio
      features and the user's mean scaled audio features,
    * 0 if the track's genre equals the user's most frequent genre, else 1,
    * 0 if the track's artist equals the user's most frequent artist, else 1.

    Parameters
    ----------
    df_track : pandas.DataFrame
        Candidate tracks. Must contain the audio feature columns listed in
        ``audio_features`` below plus ``track_genre`` and ``artists``.
        Any index is accepted; the frame is NOT modified.
    df_users : pandas.DataFrame
        User interactions. Must contain ``user_id``, the same audio feature
        columns, ``track_genre`` and ``artists``.
    user_id
        Value matched against ``df_users['user_id']``.
    top_n : int, default 10
        Number of best-scoring tracks to return.
    audio_weight, genre_weight, artist_weight : float
        Weights of the three distance components (defaults 0.6 / 0.3 / 0.1).

    Returns
    -------
    pandas.DataFrame
        A copy of the ``top_n`` rows of ``df_track`` with the lowest score,
        sorted ascending, with an added ``similarity_score`` column. The
        original index labels of ``df_track`` are preserved.

    Raises
    ------
    ValueError
        If ``df_users`` has no rows for ``user_id``.
    """
    audio_features = ['danceability', 'tempo', 'acousticness', 'instrumentalness',
                      'liveness', 'speechiness', 'loudness']

    # 1. Select every interaction of the requested user.
    user_data = df_users[df_users['user_id'] == user_id]
    if user_data.empty:
        # Fail early with a clear message instead of an opaque error from the scaler.
        raise ValueError(f"No interactions found for user_id={user_id!r}")

    # 2. Scale the user's audio features and average them into one profile vector.
    # NOTE(review): the scaler is fitted on this user's rows only, so scaled
    # track values can fall outside [0, 1]; confirm this is intended rather
    # than fitting on the track catalogue.
    scaler = MinMaxScaler()
    user_scaled = scaler.fit_transform(user_data[audio_features])
    avg_user_features = np.mean(user_scaled, axis=0)

    # 3. Most frequently listened genre and artist (Counter keeps the
    #    first-seen value on ties).
    most_common_genre = Counter(user_data['track_genre']).most_common(1)[0][0]
    most_common_artist = Counter(user_data['artists']).most_common(1)[0][0]

    # 4. Scale the track features with the scaler fitted on the user data.
    track_scaled = np.asarray(scaler.transform(df_track[audio_features]))

    # 5. Distance of every track to the user profile, computed row-wise by
    #    position so the result does not depend on df_track's index labels.
    audio_distance = np.linalg.norm(track_scaled - avg_user_features, axis=1)
    genre_distance = (df_track['track_genre'] != most_common_genre).to_numpy(dtype=float)
    artist_distance = (df_track['artists'] != most_common_artist).to_numpy(dtype=float)
    total_distance = (audio_distance * audio_weight
                      + genre_distance * genre_weight
                      + artist_distance * artist_weight)

    # 6. Attach the score to a copy (the caller's frame stays untouched) and
    #    return the closest tracks first.
    scored = df_track.assign(similarity_score=total_distance)
    similar_tracks = scored.sort_values(by='similarity_score', ascending=True).head(top_n)

    return similar_tracks

In [3]:
# Load the cleaned Spotify track catalogue.
# NOTE(review): hard-coded absolute local path; consider a configurable data directory.
df_tracks = pd.read_csv(r'C:\Documenten\ADS\PPM\Assignment2\Git\PPM\streamlit_app\data\spotify_tracks_clean.csv')
# Drop rows missing any column used by the recommender BEFORE sampling, so the
# sample is not shrunk afterwards by incomplete rows.
df_tracks_sample = df_tracks.dropna(subset=['danceability', 'tempo', 'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'loudness', 'track_genre', 'artists'])
# Random selection of (at most) 1000 tracks; fixed seed so a re-run of the
# notebook yields the same sample, and min() avoids an error on small files.
df_tracks_sample = df_tracks_sample.sample(n=min(1000, len(df_tracks_sample)), random_state=42)
# Renumber rows 0..n-1 after sampling.
df_tracks_sample = df_tracks_sample.reset_index(drop=True)

In [4]:
# Load the synthetic user interaction data, keep only rows that have every
# column the recommender reads, and renumber the remaining rows from 0.
df_users = (
    pd.read_csv(r'C:\Documenten\ADS\PPM\Assignment2\Git\PPM\streamlit_app\data\synthetic_user_data.csv')
    .dropna(subset=['danceability', 'tempo', 'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'loudness', 'track_genre', 'artists'])
    .reset_index(drop=True)
)

In [5]:
# Show the 10 best-matching tracks for user 77.
# NOTE(review): this passes the full `df_tracks` frame, not the NaN-filtered
# `df_tracks_sample` prepared earlier — confirm which one is intended.
get_most_similar_tracks(df_tracks, df_users, user_id=77, top_n=10)

Unnamed: 0,track_id,track_name,album_name,artists,duration_ms,popularity,track_genre,explicit,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,album_cover,popularity_genre,tracks_played,similarity_score
61736,0HXnNR8t6ckKrZPQxtUdeV,Sailing,Session Acoustique,Rod Stewart,277960,2,rock,False,0.43,0.375,...,0.374,0.376,0.316,0.189,128.791,4,https://i.scdn.co/image/ab67616d00001e02f55dd1...,2,1,0.511024
26206,0JS0AEmlz20lugs6G10mYy,From the Hard - Original Mix,D-Block & S-te-Fan - From The Hard EP,D-Block & S-te-Fan,257600,22,hardstyle,False,0.441,0.899,...,0.287,0.302,0.29,0.275,149.965,4,https://i.scdn.co/image/ab67616d00001e027fbe0c...,31,271,0.550022
2830,5EotiyD8MUFtMSwBHxkI8W,Ain't No Bread in the Bread Box,The Leap Year Sessions: Volume One,Greensky Bluegrass,557400,22,bluegrass,False,0.418,0.673,...,0.247,0.261,0.169,0.339,138.591,4,https://i.scdn.co/image/ab67616d00001e02482764...,32,340,0.554168
59819,5D4yWRy2TXUujlb0KuxujZ,Reuben,Foggy Mountain Banjo,Flatt & Scruggs,120320,27,bluegrass,False,0.404,0.685,...,0.157,0.35,0.264,0.909,134.839,4,https://i.scdn.co/image/ab67616d00001e022c674b...,39,432,0.555468
15205,7nnjBAQsUg0f3Gcn2lvEt0,Covered Bridges,Muir Maid,Kitchen Dwellers,484573,23,bluegrass,False,0.376,0.701,...,0.274,0.481,0.281,0.72,154.763,4,https://i.scdn.co/image/ab67616d00001e026be42f...,33,342,0.562202
47858,0TZejo18HlJ86OrWNsXKnw,Mrs Magic,Mrs Magic,Strawberry Guy,208500,77,indie-pop,False,0.471,0.634,...,0.159,0.365,0.374,0.366,132.542,4,https://i.scdn.co/image/ab67616d00001e02469512...,88,2202,0.564556
62028,1y3JhKhE6ZuUN8PpTXP0k6,Sanson Ki Mala - Radio Edit,Sanson Ki Mala (Radio Edit),Farasat Anees;Toshi,270000,38,electronic,False,0.393,0.365,...,0.251,0.334,0.433,0.206,139.786,3,https://i.scdn.co/image/ab67616d00001e02555fb5...,54,1039,0.565839
11816,0Gtez9XpmJFqUgOaWbmPgv,Car Wreck,Do Wrong Right,The Devil Makes Three,425506,24,bluegrass,False,0.353,0.403,...,0.401,0.308,0.236,0.326,140.744,4,https://i.scdn.co/image/ab67616d00001e02872465...,35,401,0.567368
87118,5CoYuqXOp6ls2SzSsHCBM4,八點零五分,跟你開玩笑,my little airport,152812,44,cantopop,False,0.449,0.439,...,0.338,0.441,0.238,0.144,150.257,4,https://i.scdn.co/image/ab67616d00001e02857aa8...,61,990,0.581765
50802,5IjWQx3jZvQyRw0PNKDUCX,Northern Island,Rare Bird Alert,Steve Martin;Steep Canyon Rangers,159840,23,bluegrass,False,0.444,0.77,...,0.368,0.536,0.158,0.973,150.138,4,https://i.scdn.co/image/ab67616d00001e024075f2...,33,387,0.585721
