In [2]:
import csv
import numpy as np  # if not installed, run "pip install numpy"

## load_dataset_module.py

In [5]:
# Names of the dataset columns to retrieve; load_dataset() drops every other column.
features_to_retrieve = [
    'acousticness',
    'artists',
    'danceability',
    'energy',
    'id',
    'liveness',
    'loudness',
    'name',
    'popularity',
    'speechiness',
    'tempo',
    'valence',
]


In [10]:
def load_dataset(path="data.csv", features=None):
    """
    Load a CSV file into a column-oriented dictionary.

    The first row of the file is treated as the header. Every kept header name
    becomes a key whose value is the list of that column's raw (string) values,
    in file order.

    args:
        path: location of the CSV file; defaults to "data.csv" in the current
            working directory.
        features: iterable of column names to keep. When None, the module-level
            features_to_retrieve list is used.
    return:
        a dictionary mapping column name -> list of string values. An empty
        dictionary is returned when the file has no header row or no data rows.
    """
    if features is None:
        features = features_to_retrieve
    wanted = set(features)

    # `with` guarantees the file handle is closed even if parsing fails.
    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are passed through untouched.
    with open(path, encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader, None)
        if header is None:
            # Completely empty file: there are no columns to return.
            return {}
        rows = list(reader)

    # Transpose rows into columns. zip() stops at the shortest row, so ragged
    # files are truncated to the columns every row provides.
    columns = zip(*rows)
    return {name: list(column) for name, column in zip(header, columns) if name in wanted}


In [7]:
# Load the dataset once at module level; artist_music_func() and
# music_features_func() read this dictionary as a global.
data_dict = load_dataset()

In [12]:
def artist_music_func():
    """
    Build the artist/music view of the module-level data_dict.

    return:
        a new dictionary holding only the artists, name, id and feature
        columns of data_dict, in the order they appear in data_dict
    """
    wanted_columns = ('artists', 'name', 'id', 'acousticness', 'danceability', 'energy',
                      'liveness', 'loudness', 'popularity', 'speechiness', 'tempo', 'valence')
    return {column: values for column, values in data_dict.items() if column in wanted_columns}

In [13]:
def music_features_func():
    """
    Build the music-features view of the module-level data_dict.

    return:
        a new dictionary holding only the id column and the feature columns
        of data_dict, in the order they appear in data_dict
    """
    wanted_columns = ('id', 'acousticness', 'danceability', 'energy', 'liveness',
                      'loudness', 'popularity', 'speechiness', 'tempo', 'valence')
    return {column: values for column, values in data_dict.items() if column in wanted_columns}

## similarity_module.py

import numpy as np

from load_dataset_module import artist_music_func, music_features_func

In [1]:
def helper_func(data_dict):
    """
    Re-key a column-oriented dataset by id.

    The numeric feature columns present in data_dict are gathered (in the
    order data_dict yields them), converted to floats and transposed so that
    each entry of the dataset gets its own array of feature values.

    return:
            a dictionary where each id is a key and the value is the NumPy
            float array of that entry's numeric features.
    """
    numeric_columns = ('acousticness', 'danceability', 'energy', 'liveness',
                       'loudness', 'popularity', 'speechiness', 'tempo', 'valence')
    selected_columns = [data_dict[name] for name in data_dict if name in numeric_columns]
    # Columns are stacked as rows first; transposing yields one row per entry.
    feature_rows = np.array(selected_columns, dtype=float).T
    return dict(zip(data_dict['id'], feature_rows))


In [2]:

def euclidean_similarity(data_dict, id_1, id_2):
    """
    Compute the Euclidean norm of the difference between the numerical
    values stored under id_1 and id_2 in data_dict, and return it.
    """
    difference = data_dict[id_1] - data_dict[id_2]
    return np.linalg.norm(difference)


def cosine_similarity(data_dict, id_1, id_2):
    """
    Compute the cosine similarity between the numerical values stored under
    id_1 and id_2 in data_dict (dot product divided by the product of the
    two norms) and return it.
    """
    vector_a = data_dict[id_1]
    vector_b = data_dict[id_2]
    dot_product = np.dot(vector_a, vector_b)
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return dot_product / norm_product

def pearson_similarity(data_dict, id_1, id_2):
    """
    Compute the Pearson correlation coefficient between the numerical values
    stored under id_1 and id_2 in data_dict and return it.
    """
    first_values = data_dict[id_1]
    second_values = data_dict[id_2]
    correlation_matrix = np.corrcoef(first_values, second_values)
    # The off-diagonal entry is the correlation between the two inputs.
    return correlation_matrix[0, 1]

def jaccard_similarity(data_dict, id_1, id_2):
    """
    Compute the Jaccard similarity between two ids using their numerical values.

    Each id's values are treated as a set of distinct values; the result is
    the size of the intersection divided by the size of the union.

    return:
        a float between 0.0 and 1.0; 0.0 when both ids have no values at all.
    """
    values_1 = set(data_dict[id_1])
    values_2 = set(data_dict[id_2])
    # Measure the union on the sets themselves: counting raw vector lengths
    # would inflate it whenever a vector contains repeated values.
    union_size = len(values_1 | values_2)
    if union_size == 0:
        # Both inputs are empty; avoid dividing by zero.
        return 0.0
    return len(values_1 & values_2) / union_size

def manhattan_similarity(data_dict, id_1, id_2):
    """
    Compute the sum of absolute differences between the numerical values
    stored under id_1 and id_2 in data_dict and return it.
    """
    absolute_gaps = np.abs(data_dict[id_1] - data_dict[id_2])
    return absolute_gaps.sum()


In [None]:
# Build both module-level views once. compute_similarity() reads
# music_features_dict; artist_music_dict is not referenced by any code
# visible in this file.
artist_music_dict = artist_music_func()
music_features_dict = music_features_func()

In [17]:
def compute_similarity(id_1, id_2, similarity_func):
    """
    Apply a similarity metric to two ids.

    The module-level music_features_dict is re-keyed by id through
    helper_func, then similarity_func is called with that dictionary and
    the two ids.

    return:
        the value produced by similarity_func, or None (after printing a
        message) when either id is missing from the dataset.
    """
    features_by_id = helper_func(music_features_dict)
    if id_1 not in features_by_id or id_2 not in features_by_id:
        print("Ids not found")
        return None
    return similarity_func(features_by_id, id_1, id_2)

###### The main function, where the implementation takes place

In [None]:
def main():
    """
    Interactively compute a similarity score between two ids.

    Prompts for two ids and a metric number (1-5), delegates the computation
    to compute_similarity and prints the result. An unknown metric number is
    reported to the user. Nothing is returned.
    """
    id1 = input("Enter the first music_id or artist_id: ")
    id2 = input("Enter the second music_id or artist_id: ")
    metric_choice = input("What similarity metric to use?\n\
                          Enter 1 for euclidean_similarity\n\
                          Enter 2 for cosine_similarity\n\
                          Enter 3 for pearson_similarity\n\
                          Enter 4 for jaccard_similarity\n\
                          Enter 5 for manhattan_similarity\n").strip()

    metric_dict = {"1": euclidean_similarity, "2": cosine_similarity,
                   "3": pearson_similarity, "4": jaccard_similarity, "5": manhattan_similarity}
    metric = metric_dict.get(metric_choice)
    if metric is None:
        print("Invalid metric choice: {!r}. Enter a number from 1 to 5.".format(metric_choice))
        return

    final_value = compute_similarity(id1, id2, metric)
    # compute_similarity returns None when an id is unknown. Compare against
    # None explicitly so that a legitimate score of 0 (which is falsy) is
    # still reported.
    if final_value is not None:
        # The metric's own function name doubles as its display label.
        print("The {} between music_id: {} and music_id: {} is {}".format(metric.__name__, id1, id2, final_value))
            
# Start the interactive prompt only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()