In [1]:
import sys
sys.path.append("../")
from utils import *

In [2]:
# Load the playlist/track membership table ('pl') and the track table ('t')
d = {'pl': pd.read_csv(data_path + 'pl_tracks.csv'),
     't': pd.read_csv(data_path + 'tracks.csv')}

# Calculate popularity: number of occurrences of each track_uri in the
# playlist table. size() counts rows per group and always produces exactly one
# value column, so this works no matter how many columns pl_tracks.csv has
# (count() yields one column per remaining field and ignores null cells).
pop = d['pl'].groupby('track_uri').size().reset_index(name = 'popularity')
df = pd.merge(d['t'], pop, on = 'track_uri', how = 'left')

# Define the list of track_uri ordered by popularity (most popular first).
# Tracks that never occur in the playlist table get NaN from the left merge
# and are placed last by sort_values.
pop_sorted_tracks = df.sort_values('popularity', ascending = False)['track_uri'].values.tolist()
print(len(pop_sorted_tracks))
# Keep only the most popular 1% of tracks
pop_sorted_tracks = pop_sorted_tracks[:(len(pop_sorted_tracks)//100)]
print(len(pop_sorted_tracks))

2262292
22622


In [3]:
# Drop the raw tables and the intermediate popularity frames, then ask the
# garbage collector to reclaim them; pop_sorted_tracks is kept.
del d
del df
del pop
gc.collect()

63

In [4]:
# Read the four pre-built splits back from disk.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(data_path + "train_test_data.pkl", "rb") as split_file:
    X_train, y_train, X_test, y_test = pickle.load(split_file)

# Define Similarity Metrics

In [5]:
def overlap_similarity(pl1, pl2):
    ''' We define the similarity between playlists to be:
        # overlap songs / min(len(pl1), len(pl2))

        The overlap is the number of entries of pl1 that also occur in pl2
        (a song repeated in pl1 is counted once per repetition).

        pl1, pl2: sequences of songs.
        Returns 0.0 when either playlist is empty instead of dividing by zero. '''
    shorter = min(len(pl1), len(pl2))
    if shorter == 0:
        # Nothing can overlap with an empty playlist
        return 0.0
    try:
        # Set lookup makes each membership test O(1) instead of scanning pl2
        pl2_lookup = set(pl2)
    except TypeError:
        # Unhashable entries: fall back to scanning pl2 directly
        pl2_lookup = pl2
    overlap = sum(a in pl2_lookup for a in pl1)
    return overlap / shorter

In [6]:
# Combine the train and test splits into X_all / y_all.
# Entries from the test split overwrite train entries that share a key.
X_all = X_train.copy()
X_all.update(X_test)
y_all = y_train.copy()
y_all.update(y_test)
print(len(X_all))
print(len(y_all))

9998
9998


In [7]:
# The separate splits are no longer needed once X_all / y_all exist
del X_train
del y_train
del X_test
del y_test
gc.collect()

0

# Train KNN Ranking Model

In [9]:
def get_neighbor_dict(X, n_neighbors = 100, similarity = None):
    ''' Use the input X to find the n_neighbors most similar playlists for each
        playlist.

        X: dict mapping playlist key -> that playlist's songs.
        n_neighbors: maximum number of neighbors kept per playlist.
        similarity: optional callable (songs_a, songs_b) -> score; defaults to
            the overlap_similarity function.

        Returns (neighbor_dict, neighbor_sim_dict): for every key of X, the
        neighbor keys ordered from most to least similar, and the matching
        similarity values in the same order. A playlist is never listed as its
        own neighbor. Prints a progress counter roughly every 10% of keys. '''
    sim_fn = overlap_similarity if similarity is None else similarity
    # Materialize the keys once instead of rebuilding the list on every iteration
    all_keys = list(X.keys())
    n_loop = len(all_keys)
    # max(1, ...) keeps the modulo below valid when there are fewer than 10 playlists
    progress_step = max(1, n_loop // 10)
    neighbor_dict = {}
    neighbor_sim_dict = {}
    for i, this_pl in enumerate(all_keys):
        if i % progress_step == 0: print(i)
        # similarity between the current pl and every other pl
        scored = [(other_pl, sim_fn(X[this_pl], X[other_pl]))
                  for other_pl in all_keys if other_pl != this_pl]
        # Stable sort: equally similar playlists keep their order from X
        scored.sort(key = lambda x: x[1], reverse = True)
        top = scored[:n_neighbors]
        neighbor_dict[this_pl] = [a[0] for a in top]
        neighbor_sim_dict[this_pl] = [a[1] for a in top]
    return neighbor_dict, neighbor_sim_dict

def load_neighbor_map(prefix = 'X_all'):
    ''' Read a previously pickled (neighbor map, similarity map) pair from
        data_path + prefix + "_neighbor_sim_map.pkl" and return both maps.
        NOTE: pickle.load can execute arbitrary code -- only load trusted files. '''
    map_path = data_path + prefix + "_neighbor_sim_map.pkl"
    with open(map_path, "rb") as map_file:
        neighbors, similarities = pickle.load(map_file)
    return neighbors, similarities

In [10]:
# Compute neighbor_dict and neighbor_sim_dict from scratch
# (get_neighbor_dict compares every playlist with every other one)
neighbor_dict, neighbor_sim_dict = get_neighbor_dict(X_all)
first_pl = list(neighbor_dict)[0]
print(neighbor_dict[first_pl])
print(neighbor_sim_dict[first_pl])

# Persist both maps so they can be reloaded without recomputing
with open(data_path + "X_all_neighbor_sim_map.pkl", "wb") as map_file:
    pickle.dump((neighbor_dict, neighbor_sim_dict), map_file)

# Reload the saved neighbor_dict, neighbor_sim_dict
neighbor_dict, neighbor_sim_dict = load_neighbor_map(prefix = 'X_all')

0
999
1998
2997
3996
4995
5994
6993
7992
8991
9990
['835_542', '350_59', '465_645', '546_161', '730_669', '793_316', '827_584', '832_718', '843_654', '890_122', '115_982', '134_158', '62_367', '509_938', '33_907', '35_9', '354_189', '375_843', '38_958', '389_392', '401_227', '406_65', '410_772', '42_850', '441_485', '449_295', '45_544', '5_476', '503_429', '505_685', '52_639', '537_312', '540_87', '577_192', '588_255', '591_112', '592_359', '594_976', '598_479', '603_163', '613_638', '624_551', '631_834', '638_907', '640_189', '651_785', '652_334', '654_949', '692_959', '729_320', '779_206', '779_279', '785_448', '795_836', '797_820', '801_341', '805_986', '808_668', '813_75', '815_36', '830_801', '843_948', '845_905', '859_386', '863_360', '866_727', '891_406', '895_755', '897_629', '90_445', '903_928', '928_957', '937_444', '939_638', '948_692', '959_485', '969_284', '106_870', '112_333', '125_42', '125_380', '163_122', '173_633', '177_523', '184_198', '193_815', '2_603', '212_165', 

In [13]:
def knn_ranking_algo(X, n_neighbors = 100, n_preds = 10, candidate_tracks = None, neighbor_maps = None):
    ''' The main algorithm that predicts n_preds songs for each of the input playlists.

        For every playlist, each candidate song it does not already contain is
        scored by summing the similarity of every neighbor playlist that
        contains the song; the n_preds highest-scoring songs are predicted.

        X: dict mapping playlist key -> that playlist's songs. Every neighbor
            listed for a playlist must itself be a key of X.
        n_neighbors: use at most this many (leading) neighbors per playlist.
        n_preds: number of songs predicted per playlist.
        candidate_tracks: songs that may be predicted; defaults to the
            notebook-level pop_sorted_tracks list.
        neighbor_maps: optional (neighbor_dict, neighbor_sim_dict) pair; when
            omitted the maps saved under the 'X_all' prefix are loaded from disk.

        Returns a list with one prediction list per playlist, in the iteration
        order of X. Prints a progress counter roughly every 2% of playlists. '''
    if neighbor_maps is None:
        neighbor_dict, neighbor_sim_dict = load_neighbor_map(prefix = 'X_all')
    else:
        neighbor_dict, neighbor_sim_dict = neighbor_maps
    if candidate_tracks is None:
        candidate_tracks = pop_sorted_tracks
    candidate_set = set(candidate_tracks)
    # One set per playlist so membership tests are O(1) instead of list scans
    song_sets = {pl: set(songs) for pl, songs in X.items()}

    predictions = []
    # max(1, ...) keeps the modulo below valid when there are fewer than 50 playlists
    progress_step = max(1, len(X) // 50)
    for pl_id in X:
        if len(predictions) % progress_step == 0: print(len(predictions))
        contained = song_sets[pl_id]
        non_contain = [h for h in candidate_tracks if h not in contained]
        # Honor n_neighbors: only the leading neighbors contribute to the score
        neighbors = neighbor_dict[pl_id][:n_neighbors]
        sims = neighbor_sim_dict[pl_id]
        # Walk each neighbor's songs once and add that neighbor's similarity to
        # every eligible song it contains (once per neighbor, in neighbor order)
        scores = {}
        for neighbor_ind, neighbor in enumerate(neighbors):
            sim = sims[neighbor_ind]
            for song in song_sets[neighbor]:
                if song in candidate_set and song not in contained:
                    scores[song] = scores.get(song, 0) + sim
        # Scores aligned with non_contain; songs no neighbor contains score 0
        song_scores = [scores.get(h, 0) for h in non_contain]
        # Predict the songs with the highest score, best first
        best_songs = np.array(song_scores).argsort()[-n_preds:][::-1]
        predictions.append([non_contain[i] for i in best_songs])
    return predictions


In [14]:
def overlap_score(tracks, pred_tracks, test_size = 10):
    ''' Count how many entries of pred_tracks also appear in tracks.
        Both inputs must have exactly test_size entries (checked with assert).
        returns #overlap'''
    assert len(tracks) == test_size and len(pred_tracks) == test_size
    hits = 0
    for candidate in pred_tracks:
        if candidate in tracks:
            hits += 1
    return hits

def avg_overlap(true_dict, pred):
    ''' Returns the mean overlap_score across playlists.
        The i-th value of true_dict is compared with the i-th entry of pred,
        so pred must follow true_dict's iteration order. '''
    assert len(true_dict) == len(pred)
    per_playlist = []
    for true_tracks, pred_tracks in zip(true_dict.values(), pred):
        per_playlist.append(overlap_score(true_tracks, pred_tracks))
    return np.mean(per_playlist)

# Prediction and Evaluation

In [19]:
# Generate the predictions in this cell so it also works on a fresh kernel
# (with the call commented out, `predictions` only existed as leftover state).
predictions = knn_ranking_algo(X_all, n_neighbors = 100, n_preds = 10)
print("Average overlap score is:", avg_overlap(y_all, predictions))

Average overlap score is: 0.7987597519503901


In [16]:
# Persist the predictions so they can be reused without re-running the model
with open(data_path + "all_data_predictions_knn_ranking.pkl", "wb") as pred_file:
    pickle.dump(predictions, pred_file)