## Imports

In [1]:
# Imports

import pandas as pd
from collections import Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Dataset Loading

In [2]:
# Dataset loading

artists = pd.read_csv("data/artists.dat", delimiter='\t')
tags = pd.read_csv("data/tags.dat", delimiter='\t')
users_artists = pd.read_csv("data/user_artists.dat", delimiter='\t')
users_friends = pd.read_csv("data/user_friends.dat", delimiter='\t')
users_taggedartists = pd.read_csv("data/user_taggedartists.dat", delimiter='\t')
users_taggedartists_time = pd.read_csv("data/user_taggedartists-timestamps.dat", delimiter='\t')

## Data preprocessing

In [3]:
# We'll consider an artistID as valid if:
#   1. They've had minimal interaction with any userID
#   2. We have metadata for that artistID
valid_artists = set(users_artists['artistID']).intersection(set(artists['id']))

# Drop any rows with invalid artistIDs from the user-artist-tag matrix
#   i.e. We can't do CBF with an artist we don't have listening time for or that we can't visualize later on
users_taggedartists = users_taggedartists[users_taggedartists['artistID'].isin(valid_artists)]
users_artists = users_artists[users_artists['artistID'].isin(valid_artists)]


# Making sure we have metadata for tags (for visualization later)
valid_tags = set(users_taggedartists['tagID'])
users_taggedartists = users_taggedartists[users_taggedartists['tagID'].isin(valid_tags)]


## Load artist and user IDs

In [4]:
# Starting off, look into the user-artist-tag table to extract artist and tag IDs
cb_artist_ids = set(users_taggedartists['artistID'])
tag_ids = set(users_taggedartists['tagID'])


# Do the same for CF. Also the final userset comes from this matrix.
#   i.e. these are the users that we'll train and test the system on
cf_artist_ids = set(users_artists['artistID'])
user_ids = set(users_artists['userID'])

# Then cross-check cf_artists with the artist table (need metadata to visualize)
cf_artist_ids = cf_artist_ids.intersection(set(artists['id']))


## Train / Test split

In [5]:
from sklearn.model_selection import train_test_split

train_list = []
test_list = []

for user_id, group in users_artists.groupby('userID'):

    # Minimum threshold to consider a user's data impactful
    if len(group) < 5:
        train_list.append(group)
    else:
        train, test = train_test_split(group, test_size=0.2, random_state=42)
        train_list.append(train)
        test_list.append(test)

# Concatenate final datasets
train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

print(f"Full dataset size: {len(users_artists)}")
print(f"Train set size: {len(train_df)} -> {len(train_df)/len(users_artists):.2f}")
print(f"Train set size: {len(test_df)} -> {len(test_df)/len(users_artists):.2f}")

Full dataset size: 92834
Train set size: 74256 -> 0.80
Train set size: 18578 -> 0.20


## Artists - Tags Dataframe

In [6]:
# Create an artists-tags dict
tag_N = len(tag_ids)
artists_tags_dict = {k: np.full(tag_N,np.nan) for k in cb_artist_ids}
# Mapping between the tag_id actual values and their indexes in a list
tagmap = {tag_id: tag_idx for tag_idx, tag_id in enumerate(tag_ids)}

grouped = (users_taggedartists.groupby(['artistID','tagID']).size().to_dict())
for (artist_id,tag_id) , count in grouped.items():
    artists_tags_dict[artist_id][tagmap[tag_id]] = count

for artist_id, raw_tag_counts in artists_tags_dict.items():
    artists_tags_dict[artist_id] = raw_tag_counts/np.nanmax(raw_tag_counts)

# Dict -> DF -> numpy Array for better calculations
intermediate_df = pd.DataFrame(data=artists_tags_dict)
array = np.array(intermediate_df)

N = len(cb_artist_ids)
for idx, tag_tfs in enumerate(array):
    idf = np.log(N/np.sum(~np.isnan(tag_tfs)))
    array[idx] = tag_tfs * idf


# Back to DF for interpretability
artists_tags_df = pd.DataFrame(data=array.transpose(), index=list(cb_artist_ids))
artists_tags_df.columns = list(tag_ids)


## Users - Artists - Listening times for Train / Test

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Mapping the list index of the artistIDs to the actual values
cb_artist_reverse_map = {idx : artist_id for idx, artist_id in enumerate(cb_artist_ids)}

# Dict to keep each user's training set interactions PLUS the minmax scaler fit for their specific listening times
user_weights = {}
for idx, row in train_df.iterrows():
    artist_id = row['artistID']
    user_id = row['userID']
    weight = row['weight']

    log_weight = np.log1p(weight)   # Log scaling to keep the impact of very high weights 

    if(user_id in user_weights.keys()):
        user_weights[user_id]['weights'][artist_id] = weight
    else:
        user_weights[user_id] = {
            'weights': {
                artist_id: weight
            },
            'scaler' : MinMaxScaler()
        }

# Fit each user's minmax scaler and transform the training log weights
for user_id, items in user_weights.items():
    weights = items['weights']
    scaler = items['scaler']
    
    scaler.fit(np.array(list(weights.values())).reshape(-1,1))    # Scaling on the weights to help with rating prediction

    for artist_id, weight in weights.items():
        user_weights[user_id]['weights'][artist_id] = scaler.transform(np.array(weight).reshape(1,-1))


# Now create a dict to keep each user's test set interactions scaled by their specific scaler
user_weights_test = {}
for idx, row in test_df.iterrows():
    artist_id = row['artistID']
    user_id = row['userID']
    weight = row['weight']

    scaler = user_weights[user_id]['scaler']
    log_weight = np.log1p(weight)   # Log scaling to keep the impact of very high weights 
    scaled_weight = scaler.transform(np.array(weight).reshape(1,-1))


    if(user_id in user_weights_test.keys()):
        user_weights_test[user_id]['weights'][artist_id] = scaled_weight
    else:
        user_weights_test[user_id] = {
            'weights': {                        # Keeping dict structure consistent between train and test
                artist_id: scaled_weight
            }
        }

In [257]:
artist_profile = np.nan_to_num(artists_tags_df.loc[1],0).reshape(-1,1)
# print(artist_profile.shape)
# print(user_weights[2]['weights'][51].s)
# print(artists_tags_df.loc[1][139])
print(len(tags))

11946


## Build user profiles

In [49]:
user_profiles = {}

for user_id in user_weights.keys():
    user_profile = np.zeros(len(tag_ids))
    for artist_id, weight in user_weights[user_id]['weights'].items():

        # artist has to be tagged
        if(artist_id in cb_artist_ids):
            artist_profile = np.nan_to_num(artists_tags_df.loc[artist_id],0)
            user_profile += weight.item() * artist_profile

    user_profiles[user_id] = user_profile.reshape(1,-1)


In [None]:

# The recommendation method
def recommend_cbf(user_id,k=1,eval=False):
    if(user_id in user_ids):
        recommendations = { 'artist_ids': [], 'similarities': []}
        user_profile = user_profiles[user_id]

        similarities = cosine_similarity(user_profile.reshape(1,-1),artists_tags_df.fillna(0))   # Returns cos similarities with every row
        top_sim = np.argsort(similarities[0])[::-1]                                              # Sorts the indexes in descending order

        count = 0
        i = 0

        while count < k and i<len(top_sim):
            idx = top_sim[i]
            artist_id = cb_artist_reverse_map[idx]

            # If we're evaluating, we want similar artists that the user has listened to, in order to calculate a predicted rating
            if (not eval):
                if (artist_id not in user_weights[user_id]['weights'].keys()):
                    recommendations['artist_ids'].append(artist_id)
                    recommendations['similarities'].append(similarities[:,idx])
                    count+=1

            # If not, we're interested only in new artists that the user hasn't listened to before
            else:
                if(artist_id in user_weights[user_id]['weights'].keys()):
                    
                    recommendations['artist_ids'].append(artist_id)
                    recommendations['similarities'].append(similarities[:,idx])
                    count+=1

            i+=1

        return recommendations
    else:
        return None
    

def get_similarity(user_id,artist_id):
    sim_score = cosine_similarity(user_profiles[user_id],np.nan_to_num(artists_tags_df.loc[artist_id],0).reshape(1,-1))
    return sim_score

1877


## Precision @ top10

In [None]:
from tqdm import tqdm

precisions = []

for user_id, items in tqdm(user_weights_test.items()):
    relevant_artists = items['weights'].keys()
    predicted_artists = recommend_cbf(user_id,10)['artist_ids']

    hits = len(relevant_artists & predicted_artists)
    precisions.append(hits/10)

precision_at_10 = sum(precisions)/len(precisions)
print(precision_at_10)



  2%|▏         | 34/1877 [00:40<35:17,  1.15s/it]

---

# Collaborative Filtering

### DF init

In [None]:
cf_array = np.zeros((len(cf_artist_ids),len(user_ids)))
cf_df = pd.DataFrame(cf_array,index=list(cf_artist_ids))
cf_df.columns = list(user_ids)

for user_id, items in user_weights.items():
    for artist_id, weight in items['weights'].items():
        cf_df.loc[artist_id,user_id] = weight

17632


In [119]:
means = cf_dataframe.mean(axis=1, skipna=True)
new = cf_dataframe.sub(means.values,axis=0)

In [135]:
from sklearn.neighbors import NearestNeighbors

k = 10
nbrs = NearestNeighbors(n_neighbors=k+1, metric='cosine', algorithm='brute')
# nbrs.fit(new.fillna(0))
# distances, indices = nbrs.kneighbors(new.fillna(0))
nbrs.fit(cf_dataframe)
distances, indices = nbrs.kneighbors(cf_dataframe)

similarities = 1 - distances[:, 1:]
neighbor_indices = indices[:, 1:]

In [125]:
cf_artist_map = {artist_idx : artist_id for artist_idx, artist_id in enumerate(cf_artist_ids)}
mapped_neighbor_indices = np.vectorize(cf_artist_map.get)(neighbor_indices)

neighbor_df = pd.DataFrame(
    mapped_neighbor_indices,
    columns=[f'neighbor_{i+1}' for i in range(k)],
    index=cf_dataframe.index
)

similarity_df = pd.DataFrame(
    similarities,
    columns=[f'similarity_{i+1}' for i in range(k)],
    index=cf_dataframe.index
)

print(neighbor_df)

       neighbor_1  neighbor_2  neighbor_3  neighbor_4  neighbor_5
1            5077        5092        5078        5095        5086
2            3188        1303        1300        5713        5712
3           13677       13679        7925       10894       18730
4            9236        9238        9237        9241        9246
5           15289        8044        8049        8046        8045
...           ...         ...         ...         ...         ...
18741       18736       18737       18738       18739       18744
18742       18736       18737       18738       18739       18744
18743       18736       18737       18738       18739       18744
18744       18736       18737       18738       18739       18744
18745       18745       11394       18458       15996        3683

[17632 rows x 5 columns]


In [None]:
for artist_idx, neighbors in neighbor_df.iterrows():
    

2
3


In [None]:
print(cosine_df)

In [None]:
print(indices)

In [None]:
cf_dataframe = pd.DataFrame(data=cf_user_artists).transpose()
cf_dataframe.columns = list(cf_user_ids)
x_split = int(np.floor(len(cf_dataframe)*0.2))
y_split = int(np.floor(len(cf_dataframe.columns)*0.2))

# print(cf_dataframe)
eval_df = cf_dataframe.copy()
# print(cf_test_df)
eval_df.iloc[-x_split:, -y_split:] = eval_df.iloc[-x_split:, -y_split:].where(eval_df.iloc[-x_split:, -y_split:] == 0, np.nan)

print(eval_df.iloc[-x_split:, -y_split:])
print(cf_dataframe.iloc[-x_split:, -y_split:])




In [None]:
i=0
for idx, col in test.items():
    if(not np.count_nonzero(col)):
        print(col[col>0])

print(i)

In [None]:
min_count = 20000
min_idx = 0
for idx,col in cf_dataframe.items():
    count = np.count_nonzero(col)
    if count <= min_count:
        min_count= count
        min_idx = idx


print(min_count, min_idx)

In [None]:
print(test)

In [None]:
def predict_hybrid(user_id, target_artist, listening_df, similarities_df, artist_features_df, 
                   alpha=0.7, k=5):
    """
    Combine collaborative and content-based predictions
    alpha: weight for collaborative filtering (0.7 = 70% collaborative, 30% content)
    """
    
    # Try collaborative filtering first
    collab_pred = predict_weighted_average(user_id, target_artist, similarities_df, listening_df, k)
    
    # Content-based prediction
    content_pred = predict_content_based(user_id, target_artist, listening_df, artist_features_df)
    
    # If no collaborative data available, use pure content-based
    if collab_pred == 0:
        return content_pred
    
    # If no content data available, use pure collaborative
    if content_pred == 0:
        return collab_pred
    
    # Hybrid combination
    return alpha * collab_pred + (1 - alpha) * content_pred