In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from main.choices import genre_weightage, tags_weightage, instrument_weightage, feature_weights
import numpy as np
import regex as re
from unidecode import unidecode
from main.choices import mdata_outliers, normalizing_factor, balancing_factor, normalizing_factor_meta
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
def get_similar_songs_id(id):
    """Return the IDs of the 15 songs most similar to the song with the given ID.

    Two pairwise cosine-similarity matrices are built from the CSV files under
    ``data/`` and summed:

    * features: the weighted feature columns of the merged song table
      (including ``year``), raised to the power ``normalizing_factor``;
    * metadata: TF-IDF over the artist / album-artist / album names from
      ``metadata.csv``, raised to ``normalizing_factor_meta`` and scaled by
      ``balancing_factor``.

    NOTE(review): the two matrices are combined by row position, which assumes
    ``metadata.csv``, ``songs_db.csv`` and the merged frame list the same
    songs in the same order -- confirm against the data files.

    Parameters
    ----------
    id
        A value from the ``ID`` column of the merged song table.

    Returns
    -------
    list
        ``ID`` values of up to 15 other songs, most similar first. The query
        song itself is never part of the result.

    Raises
    ------
    KeyError
        If ``id`` is not present in the merged song table.
    """
    df = pd.read_csv('data/songs_db.csv')
    df_features = pd.read_csv('data/song_dataset_final.csv')
    meta_df = pd.read_csv('data/metadata.csv')
    # Index-aligned assignment: row i of metadata.csv supplies the year of row i of songs_db.csv.
    df['year'] = meta_df['year']
    merged_df = pd.merge(df, df_features, on='song name', how='inner')

    genre_columns = [col for col in merged_df.columns if col.startswith("Genre")]
    tag_columns = [col for col in merged_df.columns if col.startswith("Tag")]
    instrument_columns = [col for col in merged_df.columns if col.startswith("Instrument")]

    X = merged_df.iloc[:, 3:-2].copy()
    # A year of 0 is treated as missing. The median is computed AFTER masking the
    # zeros so the placeholders do not drag the fill value down (or make it 0).
    year = X['year'].replace(0, np.nan)
    year_fill = year.median()
    if pd.isna(year_fill):
        # No usable year at all: fall back to 0 rather than leaving NaNs behind.
        year_fill = 0
    X['year'] = year.fillna(year_fill)
    minmax = MinMaxScaler(feature_range=(0,1))
    # Only the first 14 columns are rescaled to [0, 1].
    X.iloc[:,:14] = minmax.fit_transform(X.iloc[:,:14])

    # voice_male must be derived before voice_female is overwritten on the next line.
    X['voice_male'] = (1 - X['voice_female']) * X['overall_voice']
    X['voice_female'] = X['voice_female'] * X['overall_voice']

    for key in feature_weights:
        X[key] = X[key] * feature_weights[key]
    for col in genre_columns:
        X[col] = X[col] * genre_weightage
    for col in instrument_columns:
        X[col] = X[col] * instrument_weightage
    for col in tag_columns:
        X[col] = X[col] * tags_weightage

    # Vectorised power instead of an element-wise applymap lambda.
    df_cosine = pd.DataFrame(np.power(cosine_similarity(X, dense_output=True), normalizing_factor))
    indices = pd.Series(merged_df.index, index = merged_df['ID'])

    def remove_punct(text):
        """Lower-case, drop everything except word chars, whitespace and commas, then transliterate."""
        if not isinstance(text, str):
            if pd.isna(text):
                return ''
            text = str(text)
        if text == 'nan':
            return ''
        return unidecode(re.sub(r'[^\w\s\,]', '', text.lower()))

    def remove_outliers_and_extra_space(text):
        """Remove every substring listed in mdata_outliers and collapse whitespace runs."""
        for substring in mdata_outliers:
            text = text.replace(substring, '')
        clean_text = re.sub(r'\s+', ' ', text).strip()
        return clean_text

    artists = meta_df['artist'].apply(lambda x: remove_punct(x).split(', '))
    albums = meta_df['album'].apply(lambda x: remove_punct(x).split(', '))
    album_artist = meta_df['album artist'].apply(lambda x: remove_punct(x).split(', '))

    # Per-song pool of cleaned, de-duplicated names; spaces inside a name become
    # underscores before the names are joined into one string per song.
    meta_df['artists_album'] = artists + album_artist + albums
    meta_df['artists_album'] = meta_df['artists_album'].apply(lambda x: [remove_outliers_and_extra_space(i) for i in x])
    meta_df['artists_album'] = meta_df['artists_album'].apply(lambda x: list(set(x)))
    meta_df['artists_album_final'] = meta_df['artists_album'].apply(lambda x: " ".join([text.replace(" ", "_") for text in x]))

    tfidf = TfidfVectorizer(stop_words = "english")
    tfidf_matrix = tfidf.fit_transform(meta_df['artists_album_final'])

    df_cosine_meta = pd.DataFrame(
        np.power(cosine_similarity(tfidf_matrix, tfidf_matrix), normalizing_factor_meta) * balancing_factor
    )
    resultant_cosine = df_cosine.add(df_cosine_meta)

    index = indices[id]
    # Exclude the query song explicitly: with tied scores it is not guaranteed to
    # sort first, so slicing off element 0 could drop a real neighbour instead.
    similarity_scores = sorted(
        (pair for pair in enumerate(resultant_cosine[index]) if pair[0] != index),
        key = lambda x: x[1],
        reverse = True,
    )[:15]
    res_indices = [i[0] for i in similarity_scores]
    return merged_df['ID'].iloc[res_indices].to_list()

In [51]:
# Load the song catalogue that is re-ordered by recommendation rank in the next cell.
df = pd.read_csv('data/songs_db.csv')

In [52]:
# Turn the recommended IDs into an ordered categorical so that sorting follows
# recommendation rank; IDs outside the recommendation list become NaN and sort last.
df['sort'] = pd.Categorical(
    df['ID'],
    categories=get_similar_songs_id(1841),
    ordered=True,
)
df_sorted = df.sort_values(by='sort')
df_sorted.head(15)

Unnamed: 0,ID,song name,file_path,sort
53,1801,Hamari Adhuri Kahani - Arijit Singh,audio/Hamari_Adhuri_Kahani_-_Arijit_Singh.m4a,1801
168,1854,Tum Hi Ho - Arijit Singh,audio/Tum_Hi_Ho_-_Arijit_Singh.m4a,1854
174,1880,Uska Hi Banana - Arijit Singh,audio/Uska_Hi_Banana_-_Arijit_Singh.m4a,1880
98,1778,"Meri Aashiqui - Palak Muchhal, Arijit Singh",audio/Meri_Aashiqui_-_Palak_Muchhal_Arijit_Sin...,1778
60,1717,"Humdard - Mithoon, Arijit Singh",audio/Humdard_-_Mithoon_Arijit_Singh.m4a,1717
21,1762,"Chahun Main Ya Naa - Palak Muchhal, Arijit Singh",audio/Chahun_Main_Ya_Naa_-_Palak_Muchhal_Ariji...,1762
58,1788,"Hum Mar Jayenge - Tulsi Kumar, Arijit Singh",audio/Hum_Mar_Jayenge_-_Tulsi_Kumar_Arijit_Sin...,1788
48,1900,"Gerua - Pritam, Arijit Singh, Antara Mitra",audio/Gerua_-_Pritam_Arijit_Singh_Antara_Mitra...,1900
145,1860,"Sawan Aaya Hai - Tony Kakkar, Mithoon, Arijit ...",audio/Sawan_Aaya_Hai_-_Tony_Kakkar_Mithoon_Ari...,1860
143,1867,"Sanam Re - Arijit Singh, Mithoon",audio/Sanam_Re_-_Arijit_Singh_Mithoon.m4a,1867


# Metadata Similarity

In [7]:
# Load the per-song metadata; the cells below read its 'artist', 'album',
# 'album artist' and 'title' columns.
mdf = pd.read_csv('data/metadata.csv')

In [52]:
def remove_punct(text):
    """Normalise one metadata field to lower-case, transliterated text.

    Everything except word characters, whitespace and commas is removed
    (commas are kept because callers split the result on ', '), then the
    text is passed through ``unidecode``.

    Missing values (NaN, None, pd.NA) and the literal string 'nan' yield ''.
    Other non-string values are converted with ``str()`` first instead of
    raising AttributeError on ``.lower()``.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)
    if text == 'nan':
        return ''
    return unidecode(re.sub(r'[^\w\s\,]', '', text.lower()))

# Clean each raw metadata field and split it on ", " so that every row holds a list of names.
artists = mdf['artist'].map(lambda raw: remove_punct(raw).split(', '))
albums = mdf['album'].map(lambda raw: remove_punct(raw).split(', '))
album_artist = mdf['album artist'].map(lambda raw: remove_punct(raw).split(', '))

In [54]:
def remove_outliers_and_extra_space(text):
    """Delete every substring listed in ``mdata_outliers`` from ``text``, then
    collapse each run of whitespace to a single space and trim both ends."""
    stripped = text
    for noise in mdata_outliers:
        stripped = stripped.replace(noise, '')
    collapsed = re.sub(r'\s+', ' ', stripped)
    return collapsed.strip()

In [55]:
# Pool artist, album-artist and album names per song (row-wise list concatenation),
# strip the configured noise substrings from every name and de-duplicate.
mdf['artists_album'] = (artists + album_artist + albums).map(
    lambda names: list({remove_outliers_and_extra_space(name) for name in names})
)
# One space-separated string per song; spaces inside a name become underscores,
# presumably so a multi-word name survives tokenisation as a single term.
mdf['artists_album_final'] = mdf['artists_album'].map(
    lambda names: " ".join(name.replace(" ", "_") for name in names)
)

In [56]:
# Vectorise the per-song name strings and compare every song with every other one.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(mdf['artists_album_final'])

similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
# NOTE(review): the exponent here is normalizing_factor, whereas the first
# get_similar_songs_id in this notebook uses normalizing_factor_meta for the
# metadata matrix -- confirm which one is intended.
tweaked_similarity = np.power(similarity, normalizing_factor) * balancing_factor
# Lookup table: song title -> row label in mdf.
indices = pd.Series(mdf.index, index=mdf['title'])

In [50]:
# DataFrame variant of the metadata similarity matrix.
similarity2 = pd.DataFrame(cosine_similarity(tfidf_matrix, tfidf_matrix))
# Vectorised replacement for the element-wise applymap lambda (applymap is
# deprecated in recent pandas and calls Python once per cell); the result is
# still a DataFrame with the same values.
tweaked_similarity = np.power(similarity2, normalizing_factor) * balancing_factor

In [57]:
def song_rec_sys(name, similarity = similarity):
    """Return the 10 songs whose metadata is most similar to the song titled ``name``.

    Parameters
    ----------
    name
        A title present in the index of the notebook-level ``indices`` lookup.
    similarity
        Square pairwise similarity matrix (ndarray or DataFrame) ordered like
        ``mdf``. The default is bound to the notebook-level ``similarity`` at
        the moment this cell is executed.

    Returns
    -------
    pandas.DataFrame
        Column ``title`` with the recommended titles and column ``0`` with
        their similarity scores, most similar first.

    Raises
    ------
    KeyError
        If ``name`` is not a known title.

    NOTE(review): assumes titles are unique; a duplicated title makes
    ``indices[name]`` return several rows -- confirm against the data.
    """
    index = indices[name]
    # Drop the query song by position rather than slicing off the first sorted
    # element: songs with identical metadata tie with the query, so the query
    # is not guaranteed to sort first.
    similarity_scores = sorted(
        (pair for pair in enumerate(similarity[index]) if pair[0] != index),
        key = lambda x: x[1],
        reverse = True,
    )[:10]
    res_indices = [i[0] for i in similarity_scores]
    return pd.concat([mdf['title'].iloc[res_indices], pd.Series([i[1] for i in similarity_scores], index=res_indices)], axis=1)

In [58]:
# Query by title, passing the powered-and-scaled matrix instead of the default raw cosine matrix.
song_rec_sys("Sanam Re", tweaked_similarity)

Unnamed: 0,title,0
5,Agar Tum Saath Ho,0.386051
61,Humdard,0.344676
145,Sawan Aaya Hai,0.29294
54,Hamari Adhuri Kahani,0.271277
60,Hum Naa Rahein Hum,0.231665
168,Tum Hi Ho,0.214178
184,Zaroorat,0.210127
174,Uska Hi Banana,0.202296
22,Chahun Main Ya Naa,0.177882
98,Meri Aashiqui,0.177882


# Breakdown

In [2]:
# Reload the catalogue and the feature table, keeping only songs present in both
# (inner join on the shared 'song name' column).
df = pd.read_csv('data/songs_db.csv')
df_features = pd.read_csv('data/song_dataset_final.csv')
merged_df = df.merge(df_features, on='song name', how='inner')

In [3]:
# Group the one-per-category columns by their name prefix; each group gets its own weight later.
genre_columns = [name for name in merged_df.columns if name.startswith("Genre")]
tag_columns = [name for name in merged_df.columns if name.startswith("Tag")]
instrument_columns = [name for name in merged_df.columns if name.startswith("Instrument")]

In [4]:
# Feature matrix: every column from position 3 up to (but excluding) the last two.
X = merged_df.iloc[:, 3:-2].copy()
minmax = MinMaxScaler(feature_range=(0,1))
# Rescale only the first 13 columns to [0, 1]; the remaining columns are left as they are.
# NOTE(review): 13 is a positional magic number tied to the CSV column order
# (the first get_similar_songs_id uses 14 because it adds 'year') -- confirm if the schema changes.
X.iloc[:,:13] = minmax.fit_transform(X.iloc[:,:13])

In [5]:
# Split overall_voice into a male and a female share.
# Order matters: voice_male reads the original voice_female, which the second line overwrites.
# NOTE(review): presumably voice_female is a 0-1 proportion at this point -- confirm.
X['voice_male'] = (1- X['voice_female']) * X['overall_voice']
X['voice_female'] = X['voice_female'] * X['overall_voice']

In [6]:
# Scale each column by its configured weight so that it contributes more or less
# to the cosine similarity: per-column weights first, then one shared weight per
# column group (genre, instrument, tag).
for key in feature_weights:
    X[key] = X[key].mul(feature_weights[key])
for col in genre_columns:
    X[col] = X[col].mul(genre_weightage)
for col in instrument_columns:
    X[col] = X[col].mul(instrument_weightage)
for col in tag_columns:
    X[col] = X[col].mul(tags_weightage)

In [162]:
# Pairwise cosine similarity between all rows of the weighted feature matrix
# (dense output is the scikit-learn default).
df_cosine = pd.DataFrame(cosine_similarity(X))
# Lookup table: song ID -> row label in merged_df.
indices = pd.Series(merged_df.index, index=merged_df['ID'])

In [173]:
def get_similar_songs_id(id):
    """Return the names of the 15 songs most similar to the song with the given ID.

    Uses the notebook-level ``indices`` (ID -> row), ``df_cosine`` (pairwise
    feature similarity) and ``merged_df``. Unlike the function of the same
    name defined at the top of this notebook, this version ignores metadata
    and returns 'song name' values instead of IDs; running this cell shadows
    the earlier definition.

    Parameters
    ----------
    id
        A value from the ``ID`` column of ``merged_df``.

    Returns
    -------
    list
        'song name' values of up to 15 other songs, most similar first. The
        query song itself is never included.

    Raises
    ------
    KeyError
        If ``id`` is not present in ``indices``.
    """
    index = indices[id]
    # Exclude the query song by position: with tied scores it is not guaranteed
    # to sort first, so dropping element 0 could remove a real neighbour.
    similarity_scores = sorted(
        (pair for pair in enumerate(df_cosine[index]) if pair[0] != index),
        key = lambda x: x[1],
        reverse = True,
    )[:15]
    res_indices = [i[0] for i in similarity_scores]
    return merged_df['song name'].iloc[res_indices].to_list()

In [174]:
# Feature-only recommendations for song ID 706 (uses the definition in the cell above, which returns song names).
get_similar_songs_id(706)

['Helig - John Lunn, Eivør',
 'Destiny Is All - John Lunn, Eivør',
 'My England - John Lunn, Eivør',
 'Space Song (Beach House Cover) - Alles Club',
 'Goodbye Brother - Ramin Djawadi',
 'Lívstræðrir - John Lunn, Eivør',
 'These Are My Feelings - MrFIRA',
 'Goodbye - Apparat',
 'Chop Shop - Tyler Bates, Joel J Richard',
 'No Time for Caution - Hans Zimmer',
 "Don't Let Me Go - Cigarettes After Sex",
 'Mehabooba - Ananya Bhat',
 'Apocalypse - Cigarettes After Sex',
 'Time - Hans Zimmer',
 'Nee Himamazhayayi - Kailas Menon, K S Hari Shankar, Nithya Mammen']

# Testing individual features

In [141]:
# Songs with the highest voice_male value in the unweighted merged data.
merged_df[['song name', 'voice_male']].sort_values(by='voice_male', ascending=False).head(15)

Unnamed: 0,song name,voice_male
123,Oru Dinam - Anand Bhaskar,0.944735
31,"Despacito - Luis Fonsi, Daddy Yankee",0.927577
84,Lungi Dance - Yo Yo Honey Singh,0.845536
159,Take Me To Church - Hozier,0.84067
145,"Sanam Re - Arijit Singh, Mithoon",0.836867
2,"Ae Ajnabi - Udit Narayan, Mahalakshmi Iyer",0.824657
0,"1, 2, 3 (feat Jason Derulo & De La Ghetto) - S...",0.820218
38,Dura - Daddy Yankee,0.804959
125,Oru Vakkum Mindathe - Vineeth Sreenivasan,0.801271
82,Let Her Go - Passenger,0.785261


In [142]:
# Songs with the lowest voice_male value in the unweighted merged data.
merged_df[['song name', 'voice_male']].sort_values(by='voice_male', ascending=True).head(15)

Unnamed: 0,song name,voice_male
169,Trust Each Other - Ramin Djawadi,0.0
107,"My England - John Lunn, Eivør",0.0
167,Time - Hans Zimmer,0.004259
156,Still DRE (Instrumental) - Dr Dre,0.021924
53,HITMAN 2 Soundtrack - Results Screen,0.030313
26,"Chop Shop - Tyler Bates, Joel J Richard",0.040529
51,Goodbye Brother - Ramin Djawadi,0.042135
116,No Time for Caution - Hans Zimmer,0.059489
64,I Don't Do Personals - Ramin Djawadi,0.066082
19,Can We Kiss Forever (Instrumental) - Kinà,0.068232


# Testing cosine similarity on zero values (raw vs. min-max scaled data)

In [17]:
# Tiny three-row, two-column frame used to compare cosine similarity before and after scaling.
data = {
    'col 1': [1, 0.5, 1.5],
    'col2': [0.6, 0.7, 0.4],
}
df_test = pd.DataFrame.from_dict(data)

In [20]:
# Min-max scale the toy frame. This re-fits the shared `minmax` scaler created in the
# Breakdown section, so its previously fitted parameters are replaced.
df_test_scaled = minmax.fit_transform(df_test)

In [19]:
# Pairwise cosine similarity of the raw (unscaled) toy rows.
pd.DataFrame(cosine_similarity(df_test))

Unnamed: 0,0,1,2
0,1.0,0.91707,0.961106
1,0.91707,1.0,0.771281
2,0.961106,0.771281,1.0


In [22]:
# Same comparison after min-max scaling: each column's minimum becomes 0, and rows
# whose non-zero entries do not overlap end up with similarity 0.
pd.DataFrame(cosine_similarity(df_test_scaled))

Unnamed: 0,0,1,2
0,1.0,0.8,0.6
1,0.8,1.0,0.0
2,0.6,0.0,1.0
