In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sb
 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

import pickle

from gensim.models import FastText

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
df.shape

(114000, 21)

In [4]:
df = df.drop('Unnamed: 0',axis=1)

In [5]:
df.isnull().sum()

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df['track_id'].nunique(), df.shape

(89740, (113999, 20))

In [8]:
df[df.duplicated(subset = df.columns.difference(['album_name','track_id']))]

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
28,5QAMZTM5cmLg3fHX9ZbTZi,Jason Mraz,Christmas Time,Winter Wonderland,0,131760,False,0.620,0.309,5,-9.209,1,0.0495,0.788000,0.000000,0.1460,0.6640,145.363,4,acoustic
29,2qESE1ZeWly7I3YjyTXmXh,Jason Mraz,Perfect Christmas Hits,Winter Wonderland,0,131760,False,0.620,0.309,5,-9.209,1,0.0495,0.788000,0.000000,0.1460,0.6640,145.363,4,acoustic
30,3EQV1ZHtHvq9OnVRYIdbg3,Jason Mraz,Merry Christmas,Winter Wonderland,0,131760,False,0.620,0.309,5,-9.209,1,0.0495,0.788000,0.000000,0.1460,0.6640,145.363,4,acoustic
31,3ax0rfGb7exLtl02LL08U9,Jason Mraz,Christmas Music - Holiday Hits,Winter Wonderland,0,131760,False,0.620,0.309,5,-9.209,1,0.0495,0.788000,0.000000,0.1460,0.6640,145.363,4,acoustic
34,0xbMRcMFqxJq1Wa7tvWPtn,Brandi Carlile;Sam Smith,Feeling Good - Adult Pop Favorites,Party of One,0,259558,False,0.296,0.206,0,-11.799,1,0.0412,0.782000,0.000225,0.0959,0.2020,165.400,4,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113041,5izpR8PlJCjgJexf7JkMa9,Kari Jobe,Sad Covers 2022,In The Name Of Love - Recorded At Spotify Stud...,2,205738,False,0.675,0.375,6,-8.985,1,0.0452,0.542000,0.000003,0.0951,0.1870,134.023,4,world-music
113345,5WaioelSGekDk3UNQy8zaw,Matt Redman,Sing Like Never Before: The Essential Collection,Our God - New Recording,34,265373,False,0.487,0.895,11,-5.061,1,0.0413,0.000183,0.000000,0.3590,0.3840,105.021,4,world-music
113644,5uXMiTPXw21xFvyeyqxyIw,Hillsong Worship;Benjamin William Hastings,That's The Power (Live),That's The Power - Live,43,274533,False,0.454,0.635,10,-8.286,1,0.0331,0.010300,0.000000,0.2330,0.0931,148.169,4,world-music
113786,37Ms9rdo26omDpCINsHhui,Chris Tomlin;Steffany Gretzinger,"O Lord, You're Beautiful","O Lord, You’re Beautiful (with Steffany Gretzi...",43,235599,False,0.406,0.121,5,-11.650,1,0.0373,0.806000,0.000000,0.1150,0.3870,118.071,4,world-music


In [9]:
df.drop_duplicates(subset = ['track_id'], keep = 'first', inplace=True)
df.drop_duplicates(subset = df.columns.difference(['album_name','track_id']), keep = 'first', inplace=True)

In [10]:
df = df.sort_values(by=['popularity'], ascending=False).reset_index(drop=True)

In [11]:
df

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,3nqQXoyQOWXiESFLlDF1hG,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),100,156943,False,0.714,0.472,2,-7.375,1,0.0864,0.01300,0.000005,0.266,0.238,131.121,4,dance
1,2tTmW7RDtMQtBk7m2rYeSw,Bizarrap;Quevedo,"Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",99,198937,False,0.621,0.782,2,-5.548,1,0.0440,0.01250,0.033000,0.230,0.550,128.033,4,hip-hop
2,5ww2BF9slyYgNOk37BlC4u,Manuel Turizo,La Bachata,La Bachata,98,162637,False,0.835,0.679,7,-5.329,0,0.0364,0.58300,0.000002,0.218,0.850,124.980,4,latin
3,4uUG5RXrOk84mYEfFvj3cK,David Guetta;Bebe Rexha,I'm Good (Blue),I'm Good (Blue),98,175238,True,0.561,0.965,7,-3.673,0,0.0343,0.00383,0.000007,0.371,0.304,128.040,4,dance
4,1IHWl5LamUGEuP4ozKQSXZ,Bad Bunny,Un Verano Sin Ti,Tití Me Preguntó,97,243716,False,0.650,0.715,5,-5.198,0,0.2530,0.09930,0.000291,0.126,0.187,106.672,4,latin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86090,1VCZcKzxff1EmN8eNcFVZy,The Supremes,Best R&B Tunes,If My Friends Could See Me Now/Nothing Can Sto...,0,198965,False,0.300,0.579,0,-9.583,0,0.0391,0.54000,0.000073,0.981,0.707,77.150,4,disco
86091,1G2foJrzF9F4PcLx1bpSmu,The Supremes,60s Christmas Party,Little Bright Star - Stereo,0,146000,False,0.711,0.774,3,-6.769,1,0.0564,0.54900,0.000000,0.103,0.553,115.007,4,disco
86092,03Gkwcxnqw20G27uEvIxSM,The Supremes,A Very 60s Christmas,Born Of Mary - Stereo,0,168493,False,0.445,0.364,8,-11.775,1,0.0263,0.68100,0.000000,0.331,0.127,100.755,3,disco
86093,4t8FOtt2xPRrxlhiqW1sKz,The Supremes,Christmas Soul Classics,Children's Christmas Song - Stereo,0,170506,False,0.674,0.666,10,-9.749,1,0.0381,0.39200,0.000000,0.231,0.734,115.868,4,disco


In [12]:
df['artists'] = df['artists'].apply(lambda x:x.replace(";"," "))

In [13]:
df['tags'] = df['artists'] + " " + df['album_name'] + " " + df['track_name'] + " " + df['track_genre']

In [14]:
def data_text_preprocess(total_text, ind, col):
    # Remove int values from text data as that might not be important
    if type(total_text) is not int:
        string = ""
        # replacing all special char with space
        total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replacing multiple spaces with single space
        total_text = re.sub(r'\s+', ' ', str(total_text))
        # bring whole text to same lower-case scale
        total_text = total_text.lower()
        
        for word in total_text.split():
            string += word + " "
        
        df[col][ind] = string

In [15]:
for index, row in df.iterrows():
    if type(row['tags']) is str:
        data_text_preprocess(row['tags'], index, 'tags')

In [16]:
df.drop(['artists', 'album_name', 'track_genre'], axis=1, inplace = True)

In [17]:
df = df.head(10000)

In [18]:
df['processed_tags'] = df['tags'].apply(lambda x : x.split())
fasttext_model = FastText(sentences=df['processed_tags'], vector_size=100, window=5, min_count=1, sg=1)

In [19]:
def sentence_to_vector(sentence, model):
    words = sentence
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(word_vectors, axis=0)

In [20]:
def combined_vector(df, model):
    word_to_vec = sentence_to_vector(df['processed_tags'], fasttext_model) 
    
    numerical_features = np.array([df['popularity'], df['duration_ms'], df['danceability'], df['loudness'], df['energy'], df['speechiness'], df['instrumentalness'], df['liveness'], df['valence'], df['tempo']])
    
    scaler = StandardScaler()
    normalized_numerical_features = scaler.fit_transform(numerical_features.reshape(-1, 1))

    normalized_numerical_features = normalized_numerical_features.flatten()

    final_vector = np.concatenate([word_to_vec, normalized_numerical_features])

    return final_vector

In [21]:
df['vectors'] = df.apply(lambda x: combined_vector(x, fasttext_model), axis=1)

In [22]:
vector_matrix = np.vstack(df['vectors'].values)
similar = cosine_similarity(vector_matrix)

In [23]:
def recommender(track_name):
    idx = df[df['track_name'] == track_name].index[0]
    distances = sorted(list(enumerate(similar[idx])), reverse = True, key = lambda x:x[1])
    track_list=[]
    for i in distances[1:6]:
        track_list.append(df.iloc[i[0]].track_name)
    return track_list

In [24]:
 for i in recommender("Shape of You"):
     print(i)

Perfect
Pain
Take My Breath Away - Love Theme from "Top Gun"
Rise Above This
Send Me An Angel


In [25]:
pickle.dump(df, open('music.pkl', 'wb'))
pickle.dump(similar, open('similarity.pkl', 'wb'))