In [332]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import joblib

In [2]:
# Load the raw dataset from data.csv (path is relative to the notebook's working directory).
data = pd.read_csv('data.csv')

In [3]:
# Preview the loaded frame; as the cell's last expression it is rendered by Jupyter.
data

Unnamed: 0,Usuario,Artista 1,Artista 2,Artista 3,Artista 4,Artista 5,Artista 6,Artista 7,Artista 8,Artista 9,...,Artista 21,Artista 22,Artista 23,Artista 24,Artista 25,Artista 26,Artista 27,Artista 28,Artista 29,Artista 30
0,agoraphobictoe,Billie Eilish,Melanie Martinez,Taylor Swift,Eminem,NF,Hollywood Undead,twenty one pilots,Halsey,Olivia Rodrigo,...,Nessa Barrett,nicole amoroso,The Neighbourhood,Ed Sheeran,Harry Styles,The Weeknd,James Arthur,One Direction,Panic! at the Disco,XXXTENTACION
1,INTRENTR,Billie Eilish,Nessa Barrett,XXXTENTACION,Chris Grey,Paramore,Maggie Lindemann,Lely45,Artemas,Olivia Rodrigo,...,Jerry Heil,Big Baby Tape,Morgenstern,Pharaoh,Our Last Night,Nothing But Thieves,Pussykiller,Discollusion,Yeat,INSTASAMKA
2,Eibrunosilvah,Billie Eilish,Beyoncé,Doja Cat,Kali Uchis,Ariana Grande,Megan Thee Stallion,Anitta,Nicki Minaj,SZA,...,Lady Gaga,Melanie Martinez,Harry Styles,The Neighbourhood,Dua Lipa,Pabllo Vittar,Lil Nas X,Sia,Ashe,GloRilla
3,Efepe69,Billie Eilish,Post Malone,Marshmello,Luísa Sonza,bbno$,Taylor Swift,Harry Styles,Trevor Daniel,d4vd,...,Diogo Defante,Lana Del Rey,The Kid LAROI,Iggy Azalea,SadBoyProlific,Oliver Tree,M83,League of Legends,7 Minutoz,Hiosaki
4,ashe_4991,Billie Eilish,Chase Atlantic,The Weeknd,Lana Del Rey,The Neighbourhood,Labrinth,Montell Fish,Isabel LaRosa,Nessa Barrett,...,Nick Alexandr,girl in red,Halsey,SYML,d4vd,Lil Peep,Nicki Minaj,Dove Cameron,Pierce the Veil,Odetari
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5499,Gamingmysteries,blink-182,Red Hot Chili Peppers,Ramones,Foo Fighters,+44,Green Day,Sum 41,The Offspring,Box Car Racer,...,Arctic Monkeys,Two Door Cinema Club,Blur,Mac DeMarco,Jimmy Eat World,The Rare Occasions,Simple Plan,Journey,The All-American Rejects,Good Charlotte
5500,mrosemberg,blink-182,Linkin Park,Foo Fighters,Paramore,Taylor Swift,+44,Fall Out Boy,Box Car Racer,Primus,...,My Chemical Romance,Rise Against,The Smashing Pumpkins,Angels & Airwaves,Green Day,Nirvana,Bob Dylan,Coldplay,Tool,Phoebe Bridgers
5501,summerhatesu,Thundercat,TWICE,st47ic,Playboi Carti,NewJeans,xaviersobased,Joji,Pierce the Veil,twikipedia,...,Mac DeMarco,glaive,Lil Uzi Vert,Lil Peep,Kanye West,Yandere,kkbutterfly27xx,Cochise,i9bonsai,PinkPantheress
5502,murdocishot,Gorillaz,Thundercat,Blur,beabadoobee,Tame Impala,The Kunts,The Beatles,Arctic Monkeys,Madness,...,Damon Albarn,Penny Bank,Burger King,DANGERDOOM,TV Girl,Childish Gambino,Freddie Gibbs,King Geedorah,Ryan Gosling,Beck


In [4]:
# Drop the first column by position and keep the remaining columns as features.
# NOTE(review): the original note says this removes the usernames, which assumes
# the username column is the first column of data.csv -- confirm.
artist_data = data.iloc[:, 1:]


In [221]:
# One-hot encode the categorical artist columns. With handle_unknown='ignore',
# categories that were not seen while fitting do not raise at transform time.
encoder = OneHotEncoder(handle_unknown='ignore').fit(artist_data)
encoded_artists = encoder.transform(artist_data)

In [222]:
# Reduce the one-hot matrix to 50 components with truncated SVD
# (random_state is pinned so the decomposition is repeatable).
svd = TruncatedSVD(n_components=50, random_state=42)
artist_data_svd = svd.fit_transform(encoded_artists)

In [223]:
# Cluster the SVD-reduced rows with k-means. According to the author's own
# experiments, n_clusters=5 gave the best average silhouette score.
kmeans_svd = KMeans(n_clusters=5, random_state=42)
kmeans_svd.fit(artist_data_svd)

In [224]:
# Cluster label of every training row, in the same row order as artist_data_svd.
svd_clusters = kmeans_svd.predict(artist_data_svd)

In [225]:
# Average silhouette score of the k-means labelling in the SVD space;
# the bare name on the last line makes Jupyter display the value.
svd_silhouette_avg = silhouette_score(X=artist_data_svd, labels=svd_clusters)
svd_silhouette_avg

0.7327779913166572

In [333]:
# Persist the fitted encoder, SVD and k-means models, plus the training cluster
# labels, as .joblib files in the current working directory.
# NOTE: recommend_artists also reads artist_data, which is not saved here.
joblib.dump(encoder, 'encoder.joblib')
joblib.dump(svd, 'svd.joblib')
joblib.dump(kmeans_svd, 'kmeans_svd.joblib')
joblib.dump(svd_clusters, 'svd_clusters.joblib')


['kmeans_svd.joblib']

Define the function that delivers the notebook's goal: recommending artists to a new user.

In [330]:
def recommend_artists(new_user_artists, n_recommendations=5):
    """Recommend artists to a new user from the most similar users in their cluster.

    The user's artists are laid out in the columns of ``artist_data`` and passed
    through the fitted ``encoder`` -> ``svd`` -> ``kmeans_svd`` objects to pick a
    cluster. Every training user in that cluster is scored by a position-weighted
    overlap with ``new_user_artists``; recommendations are then read off the rows
    of the best-scoring users, skipping artists the new user already listed.

    Relies on the notebook-level objects ``artist_data``, ``encoder``, ``svd``,
    ``kmeans_svd`` and ``svd_clusters``.

    Parameters
    ----------
    new_user_artists : sequence
        The user's artists in order of preference. Overlap weights fall
        linearly from 1.0 (first artist) to 0.5 (last artist). If more artists
        are given than ``artist_data`` has columns, only the leading ones are
        used for the cluster prediction; all of them still count for the
        similarity score and are excluded from the result.
    n_recommendations : int, default 5
        Maximum number of artists to return.

    Returns
    -------
    list
        Up to ``n_recommendations`` distinct artists, most similar users first.
    """
    new_user_artists = list(new_user_artists)
    feature_columns = artist_data.columns
    n_columns = len(feature_columns)

    # The encoder expects exactly the training columns, so extra artists cannot
    # be encoded (previously this raised a column-count ValueError).
    encodable = new_user_artists[:n_columns]
    row = dict(zip(feature_columns, encodable))

    # Pad the unfilled positions with each column's most frequent artist so the
    # encoder receives a complete row. Only computed when padding is needed.
    if len(encodable) < n_columns:
        column_modes = artist_data.mode().iloc[0]
        for col in feature_columns[len(encodable):]:
            row[col] = column_modes[col]

    new_user_data = pd.DataFrame([row], columns=feature_columns)

    # Same pipeline as training: one-hot encode, reduce with SVD, pick a cluster.
    encoded_new_user = encoder.transform(new_user_data)
    new_user_svd = svd.transform(encoded_new_user)
    new_user_cluster = kmeans_svd.predict(new_user_svd)[0]

    # Row positions of the training users assigned to that cluster.
    same_cluster_users = np.where(np.asarray(svd_clusters) == new_user_cluster)[0]

    # Weight of each distinct artist = weight of its first position in the list.
    # A dict keeps the summation order fixed, so scores are reproducible.
    weights = np.linspace(1, 0.5, num=len(new_user_artists))
    artist_weight = {}
    for position, artist in enumerate(new_user_artists):
        artist_weight.setdefault(artist, weights[position])

    # Materialise the rows once instead of calling .iloc inside the loops.
    listening_rows = artist_data.to_numpy()

    similarity_scores = []
    for user_idx in same_cluster_users:
        listened = set(listening_rows[user_idx])
        score = sum(w for artist, w in artist_weight.items() if artist in listened)
        similarity_scores.append((user_idx, score))

    # Stable sort: users with equal scores keep their original row order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    recommended_artists = []
    if n_recommendations <= 0:
        return recommended_artists

    # Tracks both the user's own artists and those already recommended.
    seen = set(new_user_artists)
    for user_idx, _ in similarity_scores:
        for artist in listening_rows[user_idx]:
            # Missing values in a user's row are not artists; never recommend them.
            if pd.isna(artist) or artist in seen:
                continue
            seen.add(artist)
            recommended_artists.append(artist)
            if len(recommended_artists) >= n_recommendations:
                return recommended_artists

    return recommended_artists