### Imports

In [None]:
# Imports

import pandas as pd
from collections import Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error

### Dataset Loading

In [None]:
# Dataset loading

artists = pd.read_csv("data/artists.dat", delimiter='\t')
tags = pd.read_csv("data/tags.dat", delimiter='\t')
users_artists = pd.read_csv("data/user_artists.dat", delimiter='\t')
users_taggedartists = pd.read_csv("data/user_taggedartists.dat", delimiter='\t')

### Data preprocessing

In [3]:
# We'll consider an artistID as valid if:
#   1. They've had minimal interaction with any userID
#   2. We have metadata for that artistID
valid_artists = set(users_artists['artistID']).intersection(set(artists['id']))

# Drop any rows with invalid artistIDs from the user-artist-tag matrix
#   i.e. We can't do CBF with an artist we don't have listening time for or that we can't visualize later on
users_taggedartists = users_taggedartists[users_taggedartists['artistID'].isin(valid_artists)]
users_artists = users_artists[users_artists['artistID'].isin(valid_artists)]


# Making sure we have metadata for tags (for visualization later)
valid_tags = set(users_taggedartists['tagID'])
users_taggedartists = users_taggedartists[users_taggedartists['tagID'].isin(valid_tags)]


### Load artist and user IDs

In [4]:
# Starting off, look into the user-artist-tag table to extract artist and tag IDs
cb_artist_ids = set(users_taggedartists['artistID'])
tag_ids = set(users_taggedartists['tagID'])


# Do the same for CF. Also the final userset comes from this matrix.
#   i.e. these are the users that we'll train and test the system on
cf_artist_ids = set(users_artists['artistID'])
user_ids = set(users_artists['userID'])

# Then cross-check cf_artists with the artist table (need metadata to visualize)
cf_artist_ids = cf_artist_ids.intersection(set(artists['id']))


### Train / Test split

In [5]:
train_list = []
test_list = []

for user_id, group in users_artists.groupby('userID'):

    # Minimum threshold to consider a user's data impactful
    if len(group) < 5:
        train_list.append(group)
    else:
        train, test = train_test_split(group, test_size=1, random_state=42)
        train_list.append(train)
        test_list.append(test)

# Concatenate final datasets
train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

print(f"Full dataset size: {len(users_artists)}")
print(f"Train set size: {len(train_df)} -> {len(train_df)/len(users_artists):.2f}")
print(f"Train set size: {len(test_df)} -> {len(test_df)/len(users_artists):.2f}")

Full dataset size: 92834
Train set size: 90957 -> 0.98
Train set size: 1877 -> 0.02


### Artists - Tags Dataframe

In [None]:
# Create an artists-tags dict
tag_N = len(tag_ids)
artists_tags_dict = {k: np.full(tag_N,np.nan) for k in cb_artist_ids}
# Mapping between the tag_id actual values and their indexes in a list
tagmap = {tag_id: tag_idx for tag_idx, tag_id in enumerate(tag_ids)}

grouped = (users_taggedartists.groupby(['artistID','tagID']).size().to_dict())
for (artist_id,tag_id) , count in grouped.items():
    artists_tags_dict[artist_id][tagmap[tag_id]] = count

for artist_id, raw_tag_counts in artists_tags_dict.items():
    artists_tags_dict[artist_id] = raw_tag_counts/np.nanmax(raw_tag_counts)

# Dict -> DF -> numpy Array for better calculations
intermediate_df = pd.DataFrame(data=artists_tags_dict)
array = np.array(intermediate_df)

N = len(cb_artist_ids)
for idx, tag_tfs in enumerate(array):
    idf = np.log(N/np.sum(~np.isnan(tag_tfs)))
    array[idx] = tag_tfs * idf


# Back to DF for interpretability
artists_tags_df = pd.DataFrame(data=array.transpose(), index=list(cb_artist_ids))
artists_tags_df.columns = list(tag_ids)

# artists_tags_df.to_csv('./data/artists_tags.csv')

### Users - Artists - Listening times for Train / Test

In [None]:


# Mapping the list index of the artistIDs to the actual values
cb_artist_reverse_map = {idx : artist_id for idx, artist_id in enumerate(cb_artist_ids)}

# Dict to keep each user's training set interactions PLUS the minmax scaler fit for their specific listening times
user_weights = {}
for idx, row in train_df.iterrows():
    artist_id = row['artistID']
    user_id = row['userID']
    weight = row['weight']

    log_weight = np.log1p(weight)   # Log scaling to keep the impact of very high weights 

    if(user_id in user_weights.keys()):
        user_weights[user_id]['weights'][artist_id] = log_weight
    else:
        user_weights[user_id] = {
            'weights': {
                artist_id: log_weight
            },
            'scaler' : MinMaxScaler()
        }

# Fit each user's minmax scaler and transform the training log weights
# for user_id, items in user_weights.items():
#     weights = items['weights']
#     scaler = items['scaler']
    
#     scaler.fit(np.array(list(weights.values())).reshape(-1,1))    # Scaling on the weights to help with rating prediction

#     for artist_id, weight in weights.items():
#         user_weights[user_id]['weights'][artist_id] = scaler.transform(np.array(weight).reshape(1,-1))


# Now create a dict to keep each user's test set interactions scaled by their specific scaler
user_weights_test = {}
for idx, row in test_df.iterrows():
    artist_id = row['artistID']
    user_id = row['userID']
    weight = row['weight']

    scaler = user_weights[user_id]['scaler']
    log_weight = np.log1p(weight)   # Log scaling to keep the impact of very high weights 
    # scaled_weight = scaler.transform(np.array(log_weight).reshape(1,-1))


    if(user_id in user_weights_test.keys()):
        user_weights_test[user_id]['weights'][artist_id] = log_weight
    else:
        user_weights_test[user_id] = {
            'weights': {                        # Keeping dict structure consistent between train and test
                artist_id: log_weight
            }
        }

## Build user profiles

In [None]:
user_profiles = {}

for user_id in user_weights.keys():
    user_profile = np.zeros(len(tag_ids))
    for artist_id, weight in user_weights[user_id]['weights'].items():

        # artist has to be tagged
        if(artist_id in cb_artist_ids):
            artist_profile = np.nan_to_num(artists_tags_df.loc[artist_id],0)
            user_profile += weight.item() * artist_profile

    user_profiles[user_id] = user_profile.reshape(1,-1)

# np.save('./data/user_profiles.npy', user_profiles, allow_pickle=True)

### CBF Recommender

In [35]:

# The recommendation method
def recommend_cbf(user_id,k=1,new_only=True):
    if(user_id in user_ids):
        recommendations = { 'artist_ids': [], 'similarities': []}
        user_profile = user_profiles[user_id]

        similarities = cosine_similarity(user_profile.reshape(1,-1),artists_tags_df.fillna(0))   # Returns cos similarities with every row
        top_sim = np.argsort(similarities[0])[::-1]                                              # Sorts the indexes in descending order

        count = 0
        i = 0

        while count < k and i<len(top_sim):
            idx = top_sim[i]
            artist_id = cb_artist_reverse_map[idx]

            # Choose whether the recommendation is something that the user has never interacted with
            if (new_only):
                if (artist_id not in user_weights[user_id]['weights'].keys()):
                    recommendations['artist_ids'].append(artist_id)
                    recommendations['similarities'].append(similarities[:,idx])
                    count+=1

            # Otherwise recommendations may contain artists the user has already interacted with
            else:
                recommendations['artist_ids'].append(artist_id)
                recommendations['similarities'].append(similarities[:,idx])
                count+=1

            i+=1

        return recommendations
    else:
        return None

---

# Collaborative Filtering

### Data loading

In [10]:
# Initialize the artist-user interactions with NaNs
cf_users_weights = {user_id: np.full(len(cf_artist_ids),np.nan) for user_id in user_ids}

artistmap = {artist_id:idx for idx,artist_id in enumerate(cf_artist_ids)}

# Fill in the corresponding cells with the user-artist log transformed weights (from the train dataset)
for idx, row in train_df.iterrows():
    artist_id = row['artistID']
    user_id = row['userID']
    weight = row['weight']

    cf_users_weights[user_id][artistmap[artist_id]] = np.log1p(weight)

# Convert dict to DF for easy kNN calculation
cf_df = pd.DataFrame(cf_users_weights,index=list(cf_artist_ids))

# Subtract the mean for each artist -- REMOVED: not as good for implicit feedback
# means = cf_df.mean(axis='columns', skipna=True)
# cf_df = cf_df.sub(means.values, axis='rows')

### k-NN calculation for each artist

In [None]:
# Calculate each artist's k-Nearest Neighbors

k = 50
nbrs = NearestNeighbors(n_neighbors=k+1, metric='cosine', algorithm='brute')
nbrs.fit(cf_df.fillna(0))
distances, indices = nbrs.kneighbors(cf_df.fillna(0))

similarities = 1 - distances[:, 1:]
neighbor_indices = indices[:, 1:]

In [46]:
np.save('./data/similarities.npy', similarities, allow_pickle=True)
np.save('./data/neighbors.npy', neighbor_indices, allow_pickle=True)

### Neighbor and Similarity Matrices

In [12]:
cf_artist_map = {idx : artist_id for idx, artist_id in enumerate(cf_artist_ids)}

mapped_neighbor_indices = np.vectorize(cf_artist_map.get)(neighbor_indices)

neighbor_df = pd.DataFrame(
    mapped_neighbor_indices,
    columns=[f'neighbor_{i+1}' for i in range(k)],
    index=cf_df.index
)

similarity_df = pd.DataFrame(
    similarities,
    columns=[f'similarity_{i+1}' for i in range(k)],
    index=cf_df.index
)

### CF recommendation system

In [13]:
def recommend_cf(user_id,k=1,new_only=True):
    if(user_id in user_ids):
        user_dict = user_weights[user_id]['weights']
        predictions = []
        neighbours_used = []
        recommendations = {'artist_ids': [], 'predictions':[], 'neighbours_used':[]}
        for artist_id in cf_artist_ids:
            if(new_only and artist_id in user_dict.keys()):
                predictions.append(-100)
                neighbours_used.append(-1)
            else:
                neighbors = neighbor_df.loc[artist_id]
                similarities = similarity_df.loc[artist_id]

                nbr_contributions = []

                for idx, nbr_artist in enumerate(neighbors):
                    if(nbr_artist) in user_dict.keys():
                        nbr_contributions.append(user_dict[nbr_artist] * similarities.iloc[idx])

                pred_value = 0
                if(len(nbr_contributions) > 2):
                    pred_value = np.sum(nbr_contributions)/len(nbr_contributions)

                predictions.append(pred_value)
                neighbours_used.append(len(nbr_contributions))

        top_pred_indices = np.argsort(predictions)[::-1]

        for idx in top_pred_indices[:k]:
            recommendations['artist_ids'].append(cf_artist_map[idx])
            recommendations['predictions'].append(predictions[idx])
            recommendations['neighbours_used'].append(neighbours_used[idx])
            
        return recommendations
    else:
        return None

# Final Hybrid System

In [None]:
def recommend_hybrid(user_id, k=1, new_only=True):
    
    cf_rec = recommend_cf(user_id,k,new_only)
    
    cbf_rec = recommend_cbf(user_id,k,new_only)

    num_of_interactions = np.sum(~np.isnan(cf_df[user_id]))
    # If no collaborative data available, use pure content-based


    if not cf_rec or num_of_interactions<5 or cf_rec['predictions'][0] < 3.5:
        if cbf_rec:
            return cbf_rec
    else:
        return cf_rec
    

# Evaluation

## CF RMSE

### Evaluation function for CF

In [None]:
def evaluate_cf():
    y_true = []
    y_pred = []
    impossible_predictions = 0
    count=0
    for idx,row in (test_df.iterrows()):
        artist_id = row['artistID']
        user_id = row['userID']
        weight = row['weight']

        y_true.append(np.log1p(weight))

        neighbors = neighbor_df.loc[artist_id]
        similarities = similarity_df.loc[artist_id]

        user_dict = user_weights[user_id]['weights']

        nbr_contributions = []

        for idx, nbr_artist in enumerate(neighbors):
            if(nbr_artist) in user_dict.keys():
                nbr_contributions.append(user_dict[nbr_artist] * similarities.iloc[idx])

        pred_value = 0

        # If we can't find any user interaction with any of the most similar artists, the prediction is impossible
        if(nbr_contributions):
            pred_value = np.sum(nbr_contributions)/len(nbr_contributions)
            y_pred.append(pred_value)
        else:
            impossible_predictions+=1
            y_true.pop()


    return y_true,y_pred, impossible_predictions

### CF Evaluation results (RMSE)

In [None]:


y_true, y_pred_cf, impossible = evaluate_cf()
print(f"Users impossible to predict: {impossible} -> {impossible/(len(y_true)+impossible):.2f}")
print(f"CF RMSE: {root_mean_squared_error(y_true,y_pred_cf)}")

Users impossible to predict: 279 -> 0.15
CF RMSE: 3.9801906112721963


## Precision @ top10 Calculations

In [None]:
from tqdm import tqdm

precisions_cbf = []
precisions_cf = []
precisions_hybrid = []

for user_id, items in tqdm(user_weights_test.items()):
    relevant_artists = items['weights'].keys()
    cbf_pred = recommend_cbf(user_id,10)['artist_ids']
    cf_pred = recommend_cf(user_id,10)['artist_ids']
    hybrid_pred = recommend_hybrid(user_id,10)['artist_ids']

    hits_cbf = len(relevant_artists & cbf_pred)
    hits_cf = len(relevant_artists & cf_pred)
    hits_hybrid = len(relevant_artists & hybrid_pred)
    
    precisions_cbf.append(hits_cbf/10)
    precisions_cf.append(hits_cf/10)
    precisions_hybrid.append(hits_hybrid/10)

precision_at_10_cf = sum(precisions_cf)/len(precisions_cf)
precision_at_10_cbf = sum(precisions_cbf)/len(precisions_cbf)
precision_at_10_hybrid = sum(precisions_hybrid)/len(precisions_hybrid)


  0%|          | 0/1877 [00:00<?, ?it/s]

100%|██████████| 1877/1877 [2:04:52<00:00,  3.99s/it]  

0.017421417155034633
0.018327117741076183
0.018966435801811402





In [None]:
print(f"MAX POSSIBLE P@10: {1/10}")
print(f"CBF P@10 : {precision_at_10_cbf} -> {precision_at_10_cbf/(1/10):.4f}")
print(f"CF P@10: {precision_at_10_cf} -> {precision_at_10_cf/(1/10):.4f}")
print(f"Hybrid P@10: {precision_at_10_hybrid} -> {precision_at_10_hybrid/(1/10):.4f}")

MAX POSSIBLE P@10: 0.1
CBF P@10 : 0.017421417155034633 -> 0.1742
CF P@10: 0.018327117741076183 -> 0.1833
Hybrid P@10: 0.018966435801811402 -> 0.1897


### P@10 Results for each mechanism

>**Note:** We're using 1 sample per user in the test set

**MAX POSSIBLE P@10: 0.1**  (num_of_users * 1_possible_hit)/(num_of_users*10)

**CBF P@10:** 0.017421417155034633 -> 0.1742

**CF P@10:** 0.018327117741076183 -> 0.1833

**Hybrid P@10:** 0.018966435801811402 -> 0.1897

<br>

**The hybrid model predicts the users' "left-out" artist ~19% of the time when recommending at least 10 artists.**