In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Load the dataset.
# Only the first 5000 rows are used, so `nrows` tells the parser to stop there
# instead of reading the whole CSV and discarding the rest afterwards.
data = pd.read_csv(
    "/content/drive/MyDrive/BDA_dataset/SpotifyFeatures.csv",
    nrows=5000,
)

In [18]:
# Show the DataFrame; pandas abbreviates long frames in the rendered output.
data

Unnamed: 0,artists,track_name,album_name,duration_ms,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Gen Hoshino,comedy,Comedy,230666,73,0.0322,0.6760,0.4610,0.000001,0.3580,-6.746,0,0.1430,87.917,4,0.7150
1,Ben Woodward,ghost - acoustic,Ghost (Acoustic),149610,55,0.9240,0.4200,0.1660,0.000006,0.1010,-17.235,1,0.0763,77.489,4,0.2670
2,Ingrid Michaelson;ZAYN,to begin again,To Begin Again,210826,57,0.2100,0.4380,0.3590,0.000000,0.1170,-9.734,1,0.0557,76.332,4,0.1200
3,Kina Grannis,can't help falling in love,Crazy Rich Asians (Original Motion Picture Sou...,201933,71,0.9050,0.2660,0.0596,0.000071,0.1320,-18.515,1,0.0363,181.740,3,0.1430
4,Chord Overstreet,hold on,Hold On,198853,82,0.4690,0.6180,0.4430,0.000000,0.0829,-9.681,1,0.0526,119.949,4,0.1670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Ludovico Einaudi,drop,Elements (Deluxe),300293,49,0.9920,0.4650,0.0342,0.939000,0.1030,-28.257,0,0.0401,152.054,4,0.0590
4996,Air,kelly watch the stars,Moon Safari,226293,55,0.2820,0.5160,0.7430,0.637000,0.1110,-8.470,1,0.0292,109.837,4,0.2260
4997,Ludovico Einaudi,passagio,Islands - Essential Einaudi,357773,50,0.9950,0.2220,0.0200,0.948000,0.0945,-27.114,1,0.0464,85.860,4,0.1190
4998,Liquid Mind,blue seven,Liquid Mind II: Slow World,760053,24,0.9030,0.0576,0.1690,0.966000,0.1170,-20.216,1,0.0484,50.838,4,0.0310


In [10]:
# Columns kept for the recommender: the three descriptive columns first,
# followed by the numeric track attributes.
features = [
    'artists', 'track_name', 'album_name',
    'duration_ms', 'popularity',
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'mode', 'speechiness',
    'tempo', 'time_signature', 'valence',
]
data = data[features]


In [11]:
# Preprocess the data: drop rows with missing values, keep the first row for
# each track name, then renumber the rows 0..n-1.
# The renumbering matters: the similarity matrix built later is addressed by
# row position, so index labels with gaps (left behind by the two drops)
# would no longer line up with it.
# NOTE(review): de-duplication here is case-sensitive, while track names are
# lowercased in a later step -- confirm that ordering is intended.
data = (
    data
    .dropna()
    .drop_duplicates(subset='track_name')
    .reset_index(drop=True)
)


In [12]:
# Convert the track_name column to lowercase.
# .assign() builds a new frame instead of writing a column into `data`, which
# was produced by filtering another frame; a direct column assignment on such
# a frame can raise pandas' SettingWithCopyWarning.
data = data.assign(track_name=data['track_name'].str.lower())

In [13]:
# Create a CountVectorizer object and fit it to the track_name column.
# stop_words='english' leaves scikit-learn's built-in English stop words out
# of the vocabulary; count_matrix holds the per-track term counts, one row
# per entry of data['track_name'].
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['track_name'])


In [14]:
# Compute the cosine similarity matrix: entry [i, j] compares the term-count
# vectors in rows i and j of count_matrix.
cosine_sim = cosine_similarity( count_matrix)
# Display the matrix (numpy abbreviates large arrays in the output).
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [15]:
def get_recommendations(track_name, data, cosine_sim, top_n=10):
    """Return the tracks whose names are most similar to ``track_name``.

    Parameters
    ----------
    track_name : str
        Name to look up; compared for exact equality against
        ``data['track_name']``.
    data : pandas.DataFrame
        Must contain 'track_name', 'artists' and 'album_name' columns. Its
        row order must match the row/column order of ``cosine_sim``; its
        index labels may be arbitrary.
    cosine_sim : array-like of shape (len(data), len(data))
        Pairwise similarity scores, addressed by row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data`` (columns 'track_name', 'artists',
        'album_name') ordered by descending similarity; ties keep their
        original row order. An empty DataFrame is returned, and a message is
        printed, when ``track_name`` is not present.
    """
    # Row *positions* of the matching tracks. The similarity matrix and the
    # .iloc lookup below are both positional, so index labels must not be
    # used here: they stop matching positions once rows have been dropped.
    matches = np.flatnonzero((data['track_name'] == track_name).to_numpy())
    if matches.size == 0:
        print(f'Track "{track_name}" not found in the dataset.')
        return pd.DataFrame()

    # Use the first occurrence if the name appears more than once.
    idx = int(matches[0])

    # Pair every row position with its similarity to the query track and sort
    # by score, highest first (sorted() is stable, so ties keep row order).
    sim_scores = sorted(
        enumerate(cosine_sim[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query track itself and keep the best `top_n` positions.
    limit = max(int(top_n), 0)
    track_indices = [pos for pos, _ in sim_scores if pos != idx][:limit]

    return data.iloc[track_indices][['track_name', 'artists', 'album_name']]


In [23]:
# Pick the query track: the track_name of the row at position 4.
# The column is selected by name and the row by position with .iloc; integer
# `[]` access into a label-indexed row Series (e.g. `row[1]`) is deprecated in
# pandas and also depends on the column order.
# A random pick would be: data['track_name'].sample().iloc[0]
random_track_name = data['track_name'].iloc[4]

# get recommendations for the chosen track
recommendations = get_recommendations(random_track_name, data, cosine_sim)

# print the recommendations
print(f'Recommended tracks for "{random_track_name}":\n{recommendations}')


Recommended tracks for "hold on":
                              track_name                       artists  \
14                       hold on - remix      Chord Overstreet;Deepend   
233                         hand to hold                     JJ Heller   
234                  hold you in my arms                Ray LaMontagne   
325                   hold on - acoustic              Chord Overstreet   
709   hold each other (feat. futuristic)  A Great Big World;Futuristic   
4603            versailles (hold) - edit             Christian Löffler   
0                                 comedy                   Gen Hoshino   
1                       ghost - acoustic                  Ben Woodward   
2                         to begin again        Ingrid Michaelson;ZAYN   
3             can't help falling in love                  Kina Grannis   

                                             album_name  
14                                      Hold On (Remix)  
233                                

In [22]:
# Track name of the row at position 4. The column is selected by name rather
# than by integer `[]` on the row Series, which pandas has deprecated for
# label-indexed Series.
random_track_name = data['track_name'].iloc[4]
random_track_name

'hold on'