# Importing dependencies

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


# Read CSV

In [2]:
artist = pd.read_csv('artists.csv')
artist.head()

Unnamed: 0,id_artists,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,,Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,,ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,,Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,,Tragruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,,Ioannis Panoutsopoulos,0


In [3]:
artist.isna().sum()

id_artists         0
followers         11
genres        856500
name               3
popularity         0
dtype: int64

In [4]:
songs = pd.read_csv('tracks.csv')
songs.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


#### Cleaning data by removing null values

In [5]:
songs.isna().sum()

id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists             71
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64

In [6]:
artist.dropna(inplace=True)
artist.isna().sum()

id_artists    0
followers     0
genres        0
name          0
popularity    0
dtype: int64

In [7]:
songs.dropna(inplace=True)
songs.isna().sum()

id                  0
name                0
popularity          0
duration_ms         0
explicit            0
artists             0
id_artists          0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
dtype: int64

##### Dropping unnecessary columns

In [8]:
columns_to_drop = ['release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                   'speechiness', 'acousticness', 'instrumentalness', 'liveness', 
                   'valence', 'tempo', 'time_signature']

songs = songs.drop(columns=columns_to_drop)

songs.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su


In [9]:
artist.shape


(305589, 5)

In [10]:
songs.shape

(586601, 7)

Combining the data

In [11]:
df = pd.merge(artist, songs, left_on='id_artists', right_on='id_artists', how='inner')
df.tail()

Unnamed: 0,id_artists,followers,genres,name_x,popularity_x,id,name_y,popularity_y,duration_ms,explicit,artists
432223,3ts767KZu2G9NuvOUdGoaE,141.0,zolo,Rascal Reporters,2,2tQ2WhLWGacmIgdc8Crx0o,For the Years 2002 and 2003 Which Were Lost to Me,0,185754,0,Rascal Reporters
432224,55AlphE9y02SH7btUJJfQF,6263.0,"geek folk, neo-pagan",Heather Alexander,32,4VtNnhhAdqsveft3dFLrH5,March Of Cambreadth,40,287067,0,Heather Alexander
432225,10aXVE8RSUCeMzaFvBnZ2i,23145.0,"alternative metal, industrial metal, industria...",Pitchshifter,38,4GtKwWPKlVWrhmCmOGFb51,Genius,48,246507,0,Pitchshifter
432226,6ok7bEDf9CZ0448D59AaNL,48046.0,"pop punk, tulsa indie",Ludo,48,0YWpTcWjDsbogUMjkE1PiJ,Love Me Dead,42,260533,0,Ludo
432227,6ok7bEDf9CZ0448D59AaNL,48046.0,"pop punk, tulsa indie",Ludo,48,3IlYkyctlcTqXAZOA4ZPzz,Love Me Dead,54,260533,0,Ludo


## Data Preprocessing

#### creating new feature combining title and artist name

In [12]:
df['song'] = df['name_x']+' - '+df['name_y']
# df['song'] = df['name_y'] + '(' + df['genres'] + ')'
df.head()

Unnamed: 0,id_artists,followers,genres,name_x,popularity_x,id,name_y,popularity_y,duration_ms,explicit,artists,song
0,72578usTM6Cj5qWsi471Nc,248568.0,"filmi, indian folk, indian rock, kannada pop",Raghu Dixit,52,0WzODQag2H6Ob1JPOpSYSM,Eno Ide,29,342230,0,Raghu Dixit,Raghu Dixit - Eno Ide
1,72578usTM6Cj5qWsi471Nc,248568.0,"filmi, indian folk, indian rock, kannada pop",Raghu Dixit,52,4uqs4He9JdU4ztAQtsgmyF,Ee Tanuvu Ninnade,28,310753,0,Raghu Dixit,Raghu Dixit - Ee Tanuvu Ninnade
2,72578usTM6Cj5qWsi471Nc,248568.0,"filmi, indian folk, indian rock, kannada pop",Raghu Dixit,52,555NYWQ1PbmLJiTW6XFwtr,Munjaane Manjalli,42,340245,0,Raghu Dixit,Raghu Dixit - Munjaane Manjalli
3,72578usTM6Cj5qWsi471Nc,248568.0,"filmi, indian folk, indian rock, kannada pop",Raghu Dixit,52,7yhvflcXyqw1pQgCSnelZ4,Gudugudiya Sedi Nodo,38,295010,0,Raghu Dixit,Raghu Dixit - Gudugudiya Sedi Nodo
4,72578usTM6Cj5qWsi471Nc,248568.0,"filmi, indian folk, indian rock, kannada pop",Raghu Dixit,52,6J8qcTesEfzpGhZjBa6kfV,Ambar,38,476241,0,Raghu Dixit,Raghu Dixit - Ambar


In [13]:
df.sample(250000, replace=True).to_csv('newdataset.csv')

Taking random 30,000 samples

In [14]:
df = pd.read_csv('newdataset.csv')

Dropping duplicates

In [15]:
df.drop_duplicates(inplace=True)

In [16]:
df = df.head(30000)

In [17]:
song_grouped = df.groupby(['song']).agg({'genres':'first'}).reset_index()
song_grouped.head()

Unnamed: 0,song,genres
0,"""Weird Al"" Yankovic - Hooked On Polkas","comedy rock, comic, parody"
1,"""Weird Al"" Yankovic - White & Nerdy (Parody of...","comedy rock, comic, parody"
2,$uicideboy$ - Carrollton,"dark trap, new orleans rap, underground hip hop"
3,$uicideboy$ - Paris,"dark trap, new orleans rap, underground hip hop"
4,(G)I-DLE - Uh-Oh,"k-pop, k-pop girl group"


## Creating recommendation engin

Create a CountVectorizer and fit it to the combined features

In [18]:
vectorizer = CountVectorizer().fit(df['song'])

Transform the combined features into a matrix of token counts

In [19]:
count_matrix = vectorizer.transform(df['song'])

Compute the cosine similarity matrix from the count matrix

In [20]:
cosine_sim = cosine_similarity(count_matrix)

In [21]:
df[df['song'] == 'Eminem - Lose Yourself']

Unnamed: 0.1,Unnamed: 0,id_artists,followers,genres,name_x,popularity_x,id,name_y,popularity_y,duration_ms,explicit,artists,song


In [22]:
cosine_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [23]:
def recommend_songs(title):
    if title not in df['song'].values:
        return 'Song not found in the dataset'
    idx = df[df['song'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    song_indices = [i[0] for i in sim_scores[1:21]]
    return df['song'].iloc[song_indices]


# Printing Recommendation

In [26]:
recommend_songs(title='Mohammed Rafi - Tu Hindu Banega Na Musalman')

6974     Mohammed Rafi - Man Re Tu Kahe Na Dheer Dhare
2077      Mohammed Rafi - Bachpan Ke Din Bhula Na Dena
10385    Mohammed Rafi - Na Kisi Ki Aankh Ka Noor Hoon
27054                     Mohammed Rafi - Aye Gulbadan
620                                 Buty - Na Na Na Na
10731                         One Direction - Na Na Na
21762                   Kabah - Esperanto (Na, Na, Na)
1431          Mohammed Rafi - Kalikaal Karal Mahasagar
26666     Mohammed Rafi - Gori Zara Hans De Tu Hans De
422            Mohammed Rafi - Lal Chhadi Maidan Khadi
1950           Mohammed Rafi - Aasman Se Aaya Farishta
3228             Mohammed Rafi - Chaudhvin Ka Chand Ho
3234             Mohammed Rafi - Dekhi Zamane Ki Yaari
7039            Mohammed Rafi - Yahan Main Ajnabi Hoon
7463                 Mohammed Rafi - Apni Azadi Ko Hum
10373           Mohammed Rafi - Tumse Achchha Kaun Hai
11219           Mohammed Rafi - Mere Mehboob Kahin Aur
14481        Mohammed Rafi - Maine Rakkha Hai Mohabbat
21273     

In [27]:
import pickle
pickle.dump(cosine_sim,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))