In [25]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import h5py
import tables
import hdf5_getters
import glob
import time
from collections import defaultdict
import math

In [5]:
base_path = "C:/Users/brade/Downloads/millionsongsubset/MillionSongSubset"

In [65]:
def get_song_data_from_file(file_path):
    """ Extracts the required song data from a single HDF5 file. """
    try:
        # Open the HDF5 file
        h5_file = hdf5_getters.open_h5_file_read(file_path)
        
        # Extract the number of songs in the file
        song_num = hdf5_getters.get_num_songs(h5_file)
        
        song_data_list = []  # List to store the data for all songs
        
        for i in range(song_num):
            # Extract the required features for each song using the index
            song_id = hdf5_getters.get_song_id(h5_file, i)
            similar_artists = hdf5_getters.get_similar_artists(h5_file, i)
            title = hdf5_getters.get_title(h5_file, i)
            hotness = hdf5_getters.get_song_hotttnesss(h5_file, i)
            duration = hdf5_getters.get_duration(h5_file, i)
            tempo = hdf5_getters.get_tempo(h5_file, i)
            track_id = hdf5_getters.get_track_id(h5_file, i)
            artist = hdf5_getters.get_artist_id(h5_file, i)
            
            # Append the extracted data for the current song to the list
            song_data = {
                'song_id': song_id,
                'similar_artists': similar_artists,
                'title': title,
                'hotness': hotness,
                'duration': duration,
                'tempo': tempo,
                'track_id': track_id,
                'artist': artist
            }
            song_data_list.append(song_data)
        
        # Close the file after extraction
        h5_file.close()
        
        # Return the list of song data dictionaries
        return song_data_list

    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    
def get_all_files(basedir,ext='.h5') :
    """
    From a root directory, go through all subdirectories
    and find all files with the given extension.
    Return all absolute paths in a list.
    """
    allfiles = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files :
            allfiles.append( os.path.abspath(f) )
    return allfiles

def construct_dataframe_from_directory(directory):
    """ Construct a DataFrame by iterating through all the HDF5 files in the given directory. """
    song_data = []
    
    songs = get_all_files(directory)  # Get the list of songs in the directory
    
    for song in songs:
        song_data_from_file = get_song_data_from_file(song)
        
        if song_data_from_file:
            # Flatten the list of song data into the main list
            song_data.extend(song_data_from_file)
    
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(song_data)
    return df

In [66]:
d = construct_dataframe_from_directory(base_path)
print(d.head())


                 song_id                                    similar_artists  \
0  b'SOMZWCG12A8C13C480'  [b'ARV4KO21187FB38008', b'ARWHM281187FB3D381',...   
1  b'SOCIWDW12A8C13D406'  [b'ARSZWK21187B9B26D7', b'ARLDW2Y1187B9B544F',...   
2  b'SOXVLOJ12AB0189215'  [b'ARFSJUG11C8A421AAD', b'AR8SD041187FB36015',...   
3  b'SONHOTT12A8C13493C'  [b'AR4R0741187FB39AF2', b'AR0D7K21187B9AD14E',...   
4  b'SOFSOCN12A8C143F5D'  [b'ARUA62A1187B99D9B0', b'ARHJFFY1187B98BA76',...   

                 title   hotness   duration    tempo               track_id  \
0  b"I Didn't Mean To"  0.602120  218.93179   92.198  b'TRAAAAW128F429D538'   
1         b'Soul Deep'       NaN  148.03546  121.274  b'TRAAABD128F429CF47'   
2   b'Amor De Cabaret'       NaN  177.47546  100.070  b'TRAAADZ128F9348C2E'   
3   b'Something Girls'       NaN  233.40363  119.293  b'TRAAAEF128F4273421'   
4    b'Face the Ashes'  0.604501  209.60608  129.738  b'TRAAAFD128F92F423A'   

                  artist  
0  b'ARD7TVE1187B99BFB1

In [74]:
def construct_graph_from_dataframe(df, min_songs_per_artist=2):
    """ Construct a graph from the song data stored in a DataFrame, with a threshold for connections. """
    graph = defaultdict(set)  # Each song will have a set of connected songs
    
    # Iterate through each song in the dataframe
    for _, row in df.iterrows():
        song_id = row['song_id']
        artist_id = row['artist']
        
        # Find other songs by the same artist
        same_artist_songs = df[df['artist'] == artist_id]
        
        # Only create edges if there are at least 'min_songs_per_artist' songs by the same artist
        if len(same_artist_songs) >= min_songs_per_artist:
            for _, other_row in same_artist_songs.iterrows():
                other_song_id = other_row['song_id']
                
                # Avoid connecting a song to itself
                if song_id != other_song_id:
                    graph[song_id].add(other_song_id)
                    graph[other_song_id].add(song_id)
    
    return graph

In [78]:
graph = construct_graph_from_dataframe(d, min_songs_per_artist=4)

In [84]:
def adamic_adar_similarity(song_id_1, song_id_2, graph):
    common_neighbors = graph.get(song_id_1, set()).intersection(graph.get(song_id_2, set()))
    
    if not common_neighbors:
        return 0
    
    score = 0
    for neighbor in common_neighbors:
        if neighbor in graph:
            degree = len(graph[neighbor])
            if degree > 1:
                score += 1 / math.log(degree)
    return score

# Step 3: Generate recommendations for a user based on liked songs
MIN_SIMILARITY_SCORE = 0.1

def recommend_songs(user_liked_songs, graph, top_n=5, min_similarity=MIN_SIMILARITY_SCORE):
    similarity_scores = defaultdict(float)
    
    for liked_song in user_liked_songs:
        for other_song in list(graph.keys()):
            if other_song not in user_liked_songs:  # Exclude already liked songs
                similarity_score = adamic_adar_similarity(liked_song, other_song, graph)
                
                # Only add scores that are above the minimum threshold
                if similarity_score > min_similarity:
                    similarity_scores[other_song] += similarity_score
    
    # Sort and return the top N recommended songs
    recommended_songs = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    return recommended_songs[:top_n]

In [85]:
user_liked_songs = [b'SOMZWCG12A8C13C480', b'SOXVLOJ12AB0189215']
recommended_songs = recommend_songs(user_liked_songs, graph)
print("Recommended Songs:", recommended_songs)

Recommended Songs: [(b'SOQLGFP12A58A7800E', 3.083390054218504), (b'SODKPKO12A58A79DF7', 3.083390054218504), (b'SOZWTCR12A58A7BA31', 3.083390054218504), (b'SOXPJJT12AB01843F5', 3.083390054218504), (b'SOCTKZS12A58A7C5E6', 3.083390054218504)]


In [99]:
print(user_liked_songs)


[b'SOMZWCG12A8C13C480', b'SOXVLOJ12AB0189215']


In [86]:
similarity_scores = {}

# Create a list of all songs to avoid modifying the graph during iteration
for liked_song, _ in recommended_songs:
    for other_song in list(graph.keys()):  # Use list(graph.keys()) to avoid modifying the graph during iteration
        if other_song not in user_liked_songs:  # Exclude already liked songs
            similarity_score = adamic_adar_similarity(liked_song, other_song, graph)
            similarity_scores[other_song] = similarity_score
            
            
for song_id, score in similarity_scores.items():
    if (score != 0):
        print(f"Song ID: {song_id.decode('utf-8')} | Similarity Score: {score}")        


Song ID: SOQLGFP12A58A7800E | Similarity Score: 3.083390054218504
Song ID: SODKPKO12A58A79DF7 | Similarity Score: 3.083390054218504
Song ID: SOZWTCR12A58A7BA31 | Similarity Score: 3.083390054218504
Song ID: SOXPJJT12AB01843F5 | Similarity Score: 3.083390054218504
Song ID: SOCTKZS12A58A7C5E6 | Similarity Score: 3.5972883965882545
Song ID: SOAPMKZ12A58A764B7 | Similarity Score: 3.083390054218504
Song ID: SOIYNOF12AB0182141 | Similarity Score: 3.083390054218504


In [101]:
def print_top_recommended_songs(similarity_scores, top_n=3):
    # Sort the similarity scores in descending order
    sorted_scores = sorted(similarity_scores.items(), key=lambda item: item[1], reverse=True)
    
    # Get the top N songs (top 3 by default)
    top_songs = sorted_scores[:top_n]
    
    # Print the songs with their titles
    for song_id, score in top_songs:
        print_songs_from_bytehash(song_id)
        print(f"Similarity Score: {score}")

In [102]:
print_top_recommended_songs(similarity_scores, top_n=3)


Song Title: b'Let Me'
6
Similarity Score: 3.5972883965882545
Song Title: b'OAKtown'
7
Similarity Score: 3.083390054218504
Song Title: b'I believe'
9
Similarity Score: 3.083390054218504
