# Imports

In [109]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

# Importing Data from CSV

In [110]:
# Load the transformed track data from CSV and preview its first rows
tracks_path = 'tracks_transformed.csv'
tracks = pd.read_csv(tracks_path)
tracks.head(5)

Unnamed: 0,id,name,artists,id_artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,35iwgR4jXetI318WEWsa1Q,Carve,Uli,['45tIt06XoI0Iio4LBEVpls'],,1922,126.903,6,0.645,0.445,0,46.662,1,0.451,0.674,0.744,0.151,0.127,104.851
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,Fernando Pessoa,['14jtPCOoNZwquk5wd9DxrY'],,1922,98.2,0,0.695,0.263,0,37.864,1,0.957,0.797,0.0,0.148,0.655,102.009
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini,['5LiOoJbxVSAMkBS2fUm3X2'],tango vintage tango,1922,181.64,0,0.434,0.177,1,38.82,1,0.0512,0.994,0.0218,0.212,0.457,130.418
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini,['5LiOoJbxVSAMkBS2fUm3X2'],tango vintage tango,1922,176.907,0,0.321,0.0946,7,32.039,1,0.0504,0.995,0.918,0.104,0.397,169.98
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,Dick Haymes,['3BiJGZsyX9sJchTqcSA7Su'],adult standards big band easy listening lounge...,1922,163.08,0,0.402,0.158,3,43.1,0,0.039,0.989,0.13,0.311,0.196,103.22


In [111]:
# Load the transformed artist data from CSV and preview its first rows
artists_path = './artists_transformed.csv'
artists = pd.read_csv(artists_path)
artists.head(5)

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,,Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,,ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,,Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,,Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,,Ioannis Panoutsopoulos,0


In [112]:
# Load the per-genre aggregate data and preview its first rows.
# na_filter=False turns off pandas' missing-value detection for this file, so empty
# fields are kept as-is instead of being converted to NaN.
genres_path = './data_by_genres_o.csv'
genres = pd.read_csv(genres_path, na_filter=False)
genres.head(5)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


# Data Prepping

In [113]:
# Count, for each dataset, the rows that contain at least one missing value.
# NOTE: `genres` was read with na_filter=False, so it holds no NaN values and its
# count is 0 regardless of whether the file contains empty fields.
null_rows_artist = artists.isna().any(axis='columns').sum()
null_rows_tracks = tracks.isna().any(axis='columns').sum()
null_rows_genre = genres.isna().any(axis='columns').sum()

print(
    f"Number of rows with null values for artist: {null_rows_artist}",
    f"Number of rows with null values for tracks: {null_rows_tracks}",
    f"Number of rows with null values for genre: {null_rows_genre}",
    sep="\n",
)

Number of rows with null values for artist: 805739
Number of rows with null values for tracks: 49825
Number of rows with null values for genre: 0


In [114]:
# Keep only fully populated rows: a row is discarded as soon as any of its columns is missing
artists = artists.dropna(axis='index', how='any')
tracks = tracks.dropna(axis='index', how='any')

In [115]:
# Make sure the follower counts are numeric so they can be sorted and used in
# similarity calculations later on (raises if a value cannot be parsed)
followers_numeric = pd.to_numeric(artists['followers'])
artists = artists.assign(followers=followers_numeric)

# Song Recommendation Functionality

In [129]:
# The dataset is quite large, so for now only the N_POPULAR_SONGS most popular songs are used.
# This number can be adjusted for the actual implementation in MySpace.
N_POPULAR_SONGS = 10000

# .copy() makes popular_songs an independent frame rather than a slice of the sorted
# frame, so columns can be added to it later without chained-assignment warnings.
popular_songs = (
    tracks
    .sort_values(by=['popularity'], ascending=False)
    .head(N_POPULAR_SONGS)
    .copy()
)
popular_songs.head()

Unnamed: 0,id,name,artists,id_artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
93802,4iJyoBOLtHqaGxP12qzhQI,Peaches (feat. Daniel Caesar & Giveon),"Justin Bieber, Daniel Caesar, Giveon","['1uNFoZAHBGtllmzznpCI3s', '20wkVLutqVOYrc0kxF...",pop rb canadian contemporary rb pop rb canadia...,2021,198.082,100,0.677,0.696,0,53.819,1,0.119,0.321,0.0,0.42,0.464,90.03
93803,7lPN2DXiMsVn7XUKtOW1CS,drivers license,Olivia Rodrigo,['1McMsnEElThX1knmY4oliG'],pop postteen pop,2021,242.014,99,0.585,0.436,10,51.239,1,0.0601,0.721,1.3e-05,0.105,0.132,143.874
93804,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Masked Wolf,['1uU7g3DNSbsu0QjSEqZtEd'],australian hip hop,2021,132.78,98,0.778,0.695,4,53.135,0,0.0913,0.175,0.0,0.15,0.472,149.996
92811,6tDDoYIxWvMLTdKpjFkc1B,telepatía,Kali Uchis,['1U1el3k54VvEUzo3ybLPlM'],colombian pop pop,2020,160.191,97,0.653,0.524,11,50.984,0,0.0502,0.112,0.0,0.203,0.553,83.97
92810,5QO79kh1waicV47BqGRL3g,Save Your Tears,The Weeknd,['1Xyo4u8uXC1ZmMpatF05PJ'],canadian contemporary rb canadian pop pop,2020,215.627,97,0.68,0.826,0,54.513,1,0.0309,0.0212,1.2e-05,0.543,0.644,118.051


In [130]:
# Learn a word vocabulary from the songs' genre strings so each genre string can be
# turned into a numerical word-count vector for the similarity calculations.
song_genre_text = popular_songs['genres']
vectorizer = CountVectorizer()
vectorizer.fit(song_genre_text)

In [131]:
# This function suggests songs (5 by default) based on a song name which is given as a parameter.
# A similarity score is generated for each song; the songs with the highest similarity and popularity are returned.

def song_recommender(song_name, top_n=5):
    """Recommend the songs in ``popular_songs`` that are most similar to ``song_name``.

    The score of each candidate is the mean of two cosine similarities against the
    query song: one over the genre word-count vectors produced by ``vectorizer`` and
    one over the numeric audio/metadata columns.

    Parameters
    ----------
    song_name : str
        Exact value of the ``name`` column to look up in ``popular_songs``. If several
        rows share this name, the first matching row is used as the query.
    top_n : int, default 5
        Maximum number of recommendations to return (expected to be non-negative).

    Returns
    -------
    pandas.DataFrame or None
        Up to ``top_n`` rows with the columns ``name``, ``artists`` and
        ``release_year``, ordered by similarity and then popularity (both descending).
        ``None`` is returned when the song is not found or when an error occurs; in
        both cases a message is printed instead of an exception being raised.

    Notes
    -----
    ``popular_songs`` is not modified; the scores live on a temporary copy.
    The query row is removed by index label, which assumes the index of
    ``popular_songs`` is unique.
    NOTE(review): the numeric columns are not scaled, so large-valued columns such as
    ``release_year``, ``duration_s`` and ``tempo`` dominate the feature similarity —
    consider standardising the features if that is not intended.
    """
    try:
        # Columns for numerical features
        numeric_cols = ['release_year', 'duration_s', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                        'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

        # Check to see if the song exists
        matches = popular_songs[popular_songs['name'] == song_name]

        if matches.empty:
            print(f'{song_name} not found.')
            return

        # Several tracks can share a name; the first match is the query song.
        # The double brackets keep it as a one-row DataFrame.
        query = matches.iloc[[0]]

        # Genre similarity: vectorise every genre string once and compare the query
        # against all songs in a single call instead of once per row.
        query_genre_vect = vectorizer.transform(query['genres'])
        all_genre_vects = vectorizer.transform(popular_songs['genres'])
        genre_similarity = cosine_similarity(query_genre_vect, all_genre_vects)[0]

        # Feature similarity over the numeric columns, computed the same way
        query_feature_vect = query[numeric_cols].to_numpy()
        all_feature_vects = popular_songs[numeric_cols].to_numpy()
        feature_similarity = cosine_similarity(query_feature_vect, all_feature_vects)[0]

        # Attach the scores to a copy so the shared popular_songs frame is left untouched
        scored = popular_songs.assign(similarity=(genre_similarity + feature_similarity) / 2)

        # Remove the query song explicitly rather than assuming it sorts to the top
        candidates = scored.drop(index=query.index)

        # Sort by similarity and popularity
        recommended_songs = candidates.sort_values(by=['similarity', 'popularity'],
                                                   ascending=[False, False])

        # Select the top_n most similar songs
        recommended_songs = recommended_songs[['name', 'artists', 'release_year']].iloc[:top_n]

        return recommended_songs

    except Exception as e:
        # Errors are reported rather than raised, so the caller receives None on failure
        print(f"An error occurred: {e}")


In [None]:
# Example: recommend songs similar to 'Save Your Tears'.
# If the name is not present in popular_songs, a "not found" message is printed and None is returned.
song_recommender('Save Your Tears')


# Artist Recommendation Functionality

In [None]:
# The dataset is quite large, so for now only the N_POPULAR_ARTISTS most popular artists are used
# (ties in popularity are broken by follower count).
# This number can be adjusted for the actual implementation in MySpace.
N_POPULAR_ARTISTS = 5000

# .copy() makes popular_artists an independent frame rather than a slice of the sorted
# frame, so columns can be added to it later without chained-assignment warnings.
popular_artists = (
    artists
    .sort_values(by=['popularity', 'followers'], ascending=[False, False])
    .head(N_POPULAR_ARTISTS)
    .copy()
)
popular_artists.head()


In [None]:
# Learn a word vocabulary from the artists' genre strings so each genre string can be
# turned into a numerical word-count vector for the similarity calculations.
artist_genre_text = popular_artists['genres']
artist_vectorizer = CountVectorizer()
artist_vectorizer.fit(artist_genre_text)

In [None]:
# This function suggests artists (5 by default) based on an artist name which is given as a parameter.
# A similarity score is generated for each artist; the artists with the highest similarity and popularity are returned.

def artist_recommender(artist_name, top_n=5):
    """Recommend the artists in ``popular_artists`` that are most similar to ``artist_name``.

    The score of each candidate is the mean of two cosine similarities against the
    query artist: one over the genre word-count vectors produced by
    ``artist_vectorizer`` and one over the ``followers``/``popularity`` columns.

    Parameters
    ----------
    artist_name : str
        Exact value of the ``name`` column to look up in ``popular_artists``. If
        several rows share this name, the first matching row is used as the query.
    top_n : int, default 5
        Maximum number of recommendations to return (expected to be non-negative).

    Returns
    -------
    pandas.DataFrame or None
        Up to ``top_n`` rows with the columns ``name``, ``genres``, ``followers`` and
        ``popularity``, ordered by similarity, popularity and followers (all
        descending). Every row named ``artist_name`` is excluded. ``None`` is returned
        when the artist is not found or when an error occurs; in both cases a message
        is printed instead of an exception being raised.

    Notes
    -----
    ``popular_artists`` is not modified; the scores live on a temporary copy.
    NOTE(review): ``followers`` and ``popularity`` are not scaled, so whichever column
    has the larger magnitude (presumably ``followers``) dominates the feature
    similarity — consider standardising the features if that is not intended.
    """
    try:
        # Columns for numerical features
        numeric_cols = ['followers', 'popularity']

        # Check to see if the artist exists
        matches = popular_artists[popular_artists['name'] == artist_name]

        if matches.empty:
            print(f'{artist_name} not found.')
            return

        # Several rows can share a name; the first match is the query artist.
        # The double brackets keep it as a one-row DataFrame.
        query = matches.iloc[[0]]

        # Genre similarity: vectorise every genre string once and compare the query
        # against all artists in a single call instead of once per row.
        query_genre_vect = artist_vectorizer.transform(query['genres'])
        all_genre_vects = artist_vectorizer.transform(popular_artists['genres'])
        genre_similarity = cosine_similarity(query_genre_vect, all_genre_vects)[0]

        # Feature similarity over the numeric columns, computed the same way
        query_feature_vect = query[numeric_cols].to_numpy()
        all_feature_vects = popular_artists[numeric_cols].to_numpy()
        feature_similarity = cosine_similarity(query_feature_vect, all_feature_vects)[0]

        # Attach the scores to a copy so the shared popular_artists frame is left untouched
        scored = popular_artists.assign(similarity=(genre_similarity + feature_similarity) / 2)

        # Sort by similarity, then popularity, then followers
        recommended_artists = scored.sort_values(by=['similarity', 'popularity', 'followers'],
                                                 ascending=[False, False, False])

        # Never recommend the artist that was asked about
        recommended_artists = recommended_artists[recommended_artists['name'] != artist_name]
        recommended_artists = recommended_artists[['name', 'genres', 'followers', 'popularity']].iloc[:top_n]

        return recommended_artists

    except Exception as e:
        # Errors are reported rather than raised, so the caller receives None on failure
        print(f"An error occurred: {e}")


In [None]:
# Example: recommend artists similar to 'Taylor Swift'.
# If the name is not present in popular_artists, a "not found" message is printed and None is returned.
artist_recommender('Taylor Swift')