In [29]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import h5py
import tables
import matplotlib.pyplot as plt
import hdf5_getters
import glob
import time
from collections import defaultdict
import math

In [31]:
# Root folder of the Million Song Subset. Set the MILLION_SONG_SUBSET_PATH
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used unchanged.
base_path = os.environ.get(
    "MILLION_SONG_SUBSET_PATH",
    "C:/Users/mason/Downloads/millionsongsubset/MillionSongSubset",
)

In [27]:
def get_song_data_from_file(file_path):
    """Extract the required song data from a single HDF5 file.

    Args:
        file_path: Path of the HDF5 file to read.

    Returns:
        A list with one dict per song stored in the file. Each dict has the
        keys 'song_id', 'similar_artists', 'title', 'hotness', 'duration',
        'tempo', 'track_id' and 'artist'. Returns None when opening, reading
        or closing the file raises; the error is printed instead of raised so
        that one bad file does not abort a directory scan.
    """
    try:
        h5_file = hdf5_getters.open_h5_file_read(file_path)
        try:
            # Number of songs stored in this file
            song_num = hdf5_getters.get_num_songs(h5_file)

            song_data_list = []
            for i in range(song_num):
                # Read each feature of song `i` through its hdf5_getters accessor
                song_data_list.append({
                    'song_id': hdf5_getters.get_song_id(h5_file, i),
                    'similar_artists': hdf5_getters.get_similar_artists(h5_file, i),
                    'title': hdf5_getters.get_title(h5_file, i),
                    'hotness': hdf5_getters.get_song_hotttnesss(h5_file, i),
                    'duration': hdf5_getters.get_duration(h5_file, i),
                    'tempo': hdf5_getters.get_tempo(h5_file, i),
                    'track_id': hdf5_getters.get_track_id(h5_file, i),
                    'artist': hdf5_getters.get_artist_id(h5_file, i),
                })
        finally:
            # Always release the handle, including when a getter raises;
            # otherwise every failing file would leave an open HDF5 handle.
            h5_file.close()

        return song_data_list

    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    
def get_all_files(basedir, ext='.h5'):
    """
    From a root directory, go through all subdirectories
    and find all files with the given extension.
    Return all absolute paths in a list.

    Args:
        basedir: Directory to walk recursively.
        ext: Suffix appended to '*' to form the glob pattern for file names.

    Returns:
        List of absolute paths, in directory-walk order.
    """
    allfiles = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so folder names containing glob
        # metacharacters ('[', ']', '*', '?') are matched literally instead of
        # being read as a pattern, which would silently skip their files.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        allfiles.extend(os.path.abspath(match) for match in glob.glob(pattern))
    return allfiles

def construct_dataframe_from_directory(directory):
    """Build one DataFrame holding the song records of every HDF5 file under `directory`.

    Files are discovered with get_all_files and read with
    get_song_data_from_file; files that yield nothing (None or an empty list)
    contribute no rows.
    """
    # One flat list of per-song dicts across all files, in discovery order
    records = [
        record
        for h5_path in get_all_files(directory)
        for record in (get_song_data_from_file(h5_path) or [])
    ]
    return pd.DataFrame(records)

In [None]:
# Read every '.h5' file found under base_path into one DataFrame (one row per
# song) and print its first rows as a quick sanity check.
d = construct_dataframe_from_directory(base_path)
print(d.head())


In [None]:
def construct_graph_from_dataframe(df, min_songs_per_artist=2):
    """Construct a song graph from the song data stored in a DataFrame.

    Two songs are connected when they share the same 'artist' value, provided
    that artist has at least `min_songs_per_artist` rows in `df`.

    Args:
        df: DataFrame with at least the columns 'song_id' and 'artist'.
        min_songs_per_artist: Minimum number of rows an artist needs before
            its songs are linked to each other.

    Returns:
        defaultdict(set) mapping each connected song_id to the set of song_ids
        it is linked to. Songs without any link do not appear as keys.
    """
    graph = defaultdict(set)  # Each song will have a set of connected songs

    # A frame built from zero records has no columns at all; nothing to link.
    if df.empty:
        return graph

    # Visit each artist once instead of re-filtering the whole frame for every
    # row. sort=False keeps artists in order of first appearance so the graph's
    # keys are inserted in the same order as a row-by-row scan would give.
    for _, artist_songs in df.groupby('artist', sort=False):
        if len(artist_songs) < min_songs_per_artist:
            continue

        # Distinct song ids in row order; a song is never linked to itself
        song_ids = list(dict.fromkeys(artist_songs['song_id']))
        if len(song_ids) < 2:
            continue

        for song_id in song_ids:
            graph[song_id].update(other for other in song_ids if other != song_id)

    return graph

In [None]:
# Link songs that share an artist; only artists with at least 4 rows in `d`
# contribute edges, so songs of less-represented artists are left out of `graph`.
graph = construct_graph_from_dataframe(d, min_songs_per_artist=4)

In [None]:
def adamic_adar_similarity(song_id_1, song_id_2, graph):
    """Adamic-Adar score of two songs in an adjacency-set graph.

    Sums 1 / ln(degree) over every neighbor the two songs have in common.
    Common neighbors that are not keys of `graph`, or whose own neighbor set
    has at most one element, add nothing. Returns 0 when nothing is shared.
    """
    neighbors_1 = graph.get(song_id_1, set())
    neighbors_2 = graph.get(song_id_2, set())
    shared = neighbors_1.intersection(neighbors_2)

    total = 0
    for shared_song in shared:
        if shared_song not in graph:
            continue
        neighbor_count = len(graph[shared_song])
        # ln(1) == 0 would divide by zero, so degree-1 neighbors are skipped
        if neighbor_count <= 1:
            continue
        total += 1 / math.log(neighbor_count)
    return total

# Step 3: Generate recommendations for a user based on liked songs
# Pairwise scores at or below this value are ignored when ranking candidates.
MIN_SIMILARITY_SCORE = 0.1

def recommend_songs(user_liked_songs, graph, top_n=5, min_similarity=MIN_SIMILARITY_SCORE):
    """Recommend songs from `graph` that are similar to the user's liked songs.

    Every song in `graph` that the user has not already liked is scored
    against each liked song with adamic_adar_similarity. Pairwise scores
    strictly above `min_similarity` are summed per candidate.

    Args:
        user_liked_songs: Iterable of song ids the user likes.
        graph: Mapping of song id -> set of neighboring song ids.
        top_n: Maximum number of recommendations to return.
        min_similarity: Pairwise scores must exceed this to be counted.

    Returns:
        List of (song_id, accumulated_score) tuples, highest score first,
        with at most `top_n` entries.
    """
    liked_songs = list(user_liked_songs)
    # Set lookup instead of scanning the liked list for every candidate
    liked_lookup = set(liked_songs)

    # Candidate list is the same for every liked song, so build it once
    candidates = [song for song in graph if song not in liked_lookup]

    similarity_scores = defaultdict(float)
    for liked_song in liked_songs:
        # A liked song with no entry in the graph shares no neighbors with
        # anything, so every score would be 0; skip the full scan.
        if liked_song not in graph:
            continue

        for other_song in candidates:
            similarity_score = adamic_adar_similarity(liked_song, other_song, graph)

            # Only add scores that are above the minimum threshold
            if similarity_score > min_similarity:
                similarity_scores[other_song] += similarity_score

    # Sort and return the top N recommended songs
    recommended_songs = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    return recommended_songs[:top_n]

In [None]:
# Seed songs for the demo. The ids are bytes literals — presumably matching how
# song ids are stored as keys of `graph`; verify against the loaded data.
user_liked_songs = [b'SOMZWCG12A8C13C480', b'SOXVLOJ12AB0189215']
# Top recommendations as (song_id, accumulated score) pairs, best first.
recommended_songs = recommend_songs(user_liked_songs, graph)
print("Recommended Songs:", recommended_songs)

In [None]:
# Echo the seed song ids that the recommendations are based on.
print(user_liked_songs)


In [None]:
# Second pass: treat each recommended song as a seed and score every other
# song in the graph against it. Scores are accumulated per candidate across
# all seeds (as recommend_songs does); assigning instead of adding would keep
# only the scores relative to the last seed.
similarity_scores = {}
liked_lookup = set(user_liked_songs)

for seed_song, _ in recommended_songs:
    for other_song in graph:
        if other_song in liked_lookup:  # Exclude already liked songs
            continue
        similarity_score = adamic_adar_similarity(seed_song, other_song, graph)
        similarity_scores[other_song] = similarity_scores.get(other_song, 0) + similarity_score

# Report only candidates that share at least one neighbor with some seed
for song_id, score in similarity_scores.items():
    if score != 0:
        # Ids are expected to be bytes; fall back to the raw value otherwise
        song_label = song_id.decode('utf-8') if isinstance(song_id, bytes) else song_id
        print(f"Song ID: {song_label} | Similarity Score: {score}")


In [None]:
def print_top_recommended_songs(similarity_scores, top_n=3):
    """Print the `top_n` highest-scoring songs of `similarity_scores`.

    `similarity_scores` maps song id -> score. For each selected song the id
    is passed to print_songs_from_bytehash, followed by a line with its score.
    NOTE(review): print_songs_from_bytehash is not defined in the visible part
    of this notebook — confirm it exists before this function is called.
    """
    # Highest score first; ties keep the mapping's iteration order
    ranked = list(similarity_scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for ranked_song_id, ranked_score in ranked[:top_n]:
        print_songs_from_bytehash(ranked_song_id)
        print(f"Similarity Score: {ranked_score}")

In [None]:
import pandas as pd

# Load songs_metadata.csv to examine its structure.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists at exactly this location.
file_path = 'C:/Users/mason/OneDrive/Desktop/graph_analysis/FINAL_PROJECT/PULL/Music-Recommendation/songs_metadata.csv'
songs_metadata = pd.read_csv(file_path)

# Display the first few rows and basic info of the dataset.
# The cell's value is a 2-tuple holding the results of both calls.
songs_metadata.info(), songs_metadata.head()


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

import pandas as pd

# Reload songs_metadata.csv so this cell starts from the raw
# 'similar_artists' strings before preprocessing them below.
# NOTE(review): absolute, machine-specific path — the cell only runs where the
# file exists at exactly this location.
file_path = 'C:/Users/mason/OneDrive/Desktop/graph_analysis/FINAL_PROJECT/PULL/Music-Recommendation/songs_metadata.csv'
songs_metadata = pd.read_csv(file_path)

# Preprocess similar_artists column: strip the surrounding brackets and remove
# every "b'" and "'" so that only comma-separated ids remain, then split on
# ", " to get a list per row.
# (Presumably the column holds the text form of a list of bytes ids — confirm
# against the CSV.)
songs_metadata['similar_artists'] = songs_metadata['similar_artists'].str.strip("[]").str.replace("b'", "").str.replace("'", "")
songs_metadata['similar_artists'] = songs_metadata['similar_artists'].str.split(", ")

# Initialize the graph
G = nx.Graph()

# Add nodes and edges: one node per song, plus an edge from the song to each
# entry of its similar_artists list (those entries become nodes as well).
for _, row in songs_metadata.iterrows():
    song_id = row['song_id']
    song_title = row['song_title']
    similar_artists = row['similar_artists']

    # Add the song node, keeping its title as the 'label' attribute
    G.add_node(song_id, label=song_title)

    # Add edges to similar artists (if any); rows whose value is not a list
    # (e.g. missing data) are skipped
    if isinstance(similar_artists, list):
        for similar in similar_artists:
            similar = similar.strip()
            if similar:  # Avoid empty entries
                G.add_edge(song_id, similar)

# Visualize the graph, restricted to the first 2000 nodes for simplicity
plt.figure(figsize=(12, 8))
subgraph = G.subgraph(list(G.nodes)[:2000])  # First 2000 nodes in insertion order (not a random sample)
pos = nx.spring_layout(subgraph, seed=42)  # Fixed seed keeps the layout reproducible
nx.draw(subgraph, pos, with_labels=True, node_size=300, node_color='cyan', font_size=1, font_color="black")
plt.title("Song Recommendation Graph (Sampled)")
plt.show()


In [None]:
# Show the three best-scoring songs from the `similarity_scores` dict built in
# the scoring cell above the function definition.
print_top_recommended_songs(similarity_scores, top_n=3)
