# **Youtube Bollywood Music Recommend System**

#### **Importing Dependencies**

In [220]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

#### **Reading the Dataset**

In [198]:
songs=pd.read_csv('../Dataset/Bollywood-Songs-Dataset(2017-23).csv')

In [199]:
songs.head()

Unnamed: 0,music_name,singer,release,lyrics,thumbnail
0,Ek Haseena Thi Ek Deewana Tha,Yasser Desai,2017,Ek haseena thi ek deewana tha Kya kahun tumse ...,https://i.ytimg.com/vi/6obiArkHwAk/hqdefault.jpg
1,Hue Bechain,"Palak Muchhal, Yasser Desai",2017,Hue bechain pehli bar hmny raaz ye jana Mohabb...,https://i.ytimg.com/vi/6obiArkHwAk/hqdefault.jpg
2,Hanste Hanste,"Palak Muchhal, Yasser Desai",2017,Hanste hanste ro diye tum Kis mushkil mein kho...,https://i.ytimg.com/vi/rVKX19fNV0A/hqdefault.jpg
3,Nain,"Palak Muchhal, Yasser Desai",2017,Kuchh sawaal pyar ke lab pe hai ruke ruke Jaad...,https://i.ytimg.com/vi/ZHbbcy7u9Rk/hqdefault.jpg
4,Aankhon Mein Aansoon,"Palak Muchhal, Yasser Desai",2017,Aankhon mein aansoo leke hoton se muskuraye Aa...,https://i.ytimg.com/vi/izP81UySf0Y/hqdefault.jpg


#### **Data Preprocessing**

In [200]:
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   music_name  974 non-null    object
 1   singer      974 non-null    object
 2   release     974 non-null    int64 
 3   lyrics      974 non-null    object
 4   thumbnail   974 non-null    object
dtypes: int64(1), object(4)
memory usage: 38.2+ KB


In [201]:
songs['release'].value_counts()

release
2018    242
2019    200
2022    156
2020    133
2021    125
2023     51
2017     47
2014      2
1998      2
2009      2
2010      2
2006      1
2003      1
2000      1
2008      1
2015      1
2007      1
2012      1
1969      1
1986      1
1992      1
1971      1
1999      1
Name: count, dtype: int64

In [202]:
#Checking for Null values
songs.isna().sum()

music_name    0
singer        0
release       0
lyrics        0
thumbnail     0
dtype: int64

In [203]:
#Checking for duplicate values
songs.duplicated().sum()

0

In [204]:
#copy dataset into another variable
songs_modified=songs.iloc[:,:]

In [205]:
songs_modified.head()

Unnamed: 0,music_name,singer,release,lyrics,thumbnail
0,Ek Haseena Thi Ek Deewana Tha,Yasser Desai,2017,Ek haseena thi ek deewana tha Kya kahun tumse ...,https://i.ytimg.com/vi/6obiArkHwAk/hqdefault.jpg
1,Hue Bechain,"Palak Muchhal, Yasser Desai",2017,Hue bechain pehli bar hmny raaz ye jana Mohabb...,https://i.ytimg.com/vi/6obiArkHwAk/hqdefault.jpg
2,Hanste Hanste,"Palak Muchhal, Yasser Desai",2017,Hanste hanste ro diye tum Kis mushkil mein kho...,https://i.ytimg.com/vi/rVKX19fNV0A/hqdefault.jpg
3,Nain,"Palak Muchhal, Yasser Desai",2017,Kuchh sawaal pyar ke lab pe hai ruke ruke Jaad...,https://i.ytimg.com/vi/ZHbbcy7u9Rk/hqdefault.jpg
4,Aankhon Mein Aansoon,"Palak Muchhal, Yasser Desai",2017,Aankhon mein aansoo leke hoton se muskuraye Aa...,https://i.ytimg.com/vi/izP81UySf0Y/hqdefault.jpg


In [206]:
#converting release and singer into list and combine them
songs_modified['singer']=songs_modified['singer'].apply(lambda x: x.split(','))
songs_modified['release']=songs_modified['release'].apply(lambda x: str(x).split(','))

songs_modified['tags']=songs_modified['singer']+songs_modified['release']
songs_modified['tags']=songs_modified['tags'].apply(lambda x: ' '.join(x))

In [207]:
songs_modified.sample()

Unnamed: 0,music_name,singer,release,lyrics,thumbnail,tags
470,Akhiyaan Milavanga,"[Arijit Singh, Sruthy Sasidharan]",[2019],Ho sadde kol beh ja mere dil da tu haal sun le...,https://i.ytimg.com/vi/O9zBPcU5g60/hqdefault.jpg,Arijit Singh Sruthy Sasidharan 2019


In [208]:
#dropping singer release and lyrics
songs_modified.drop(columns=['singer', 'release', 'lyrics'], inplace=True)

In [209]:
songs_modified.sample(5)

Unnamed: 0,music_name,thumbnail,tags
814,Ji Huzoor,https://i.ytimg.com/vi/FwH4ruvzjxQ/hqdefault.jpg,Aditya Narayan 2022
870,Thaar Maar Thakkar Maar,https://i.ytimg.com/vi/QA0HMPwQfMA/hqdefault.jpg,Shreya Goshal 2022
917,Khairiyat,https://i.ytimg.com/vi/qTAegUy7mo4/hqdefault.jpg,Arijit Singh Mithoon 2023
776,Dholna Ve Dholna,https://i.ytimg.com/vi/FUBhMm5jL_A/hqdefault.jpg,Raj Ranjodh 2022
158,Holi Biraj Ma,https://i.ytimg.com/vi/2tL1xaMhKD4/hqdefault.jpg,Jubin Nautiyal 2018


#### **Text Processing**

In [210]:
#convert to lower case
songs_modified['tags']=songs_modified['tags'].apply(lambda x: x.lower())


In [211]:
songs_modified.sample(3)

Unnamed: 0,music_name,thumbnail,tags
65,Sehmi Hai Dhadkan,https://i.ytimg.com/vi/6BxjYvg9KMw/hqdefault.jpg,atif aslam 2018
237,Yaadon Ki Almari,https://i.ytimg.com/vi/4EfOdE4GjYU/hqdefault.jpg,palomi ghosh 2018
841,Saathiya,https://i.ytimg.com/vi/X0Akv2AYvbw/hqdefault.jpg,zahrah s. khan nikhil d'souza 2022


In [212]:
#vectorization
cv=CountVectorizer()
vector=cv.fit_transform(songs_modified['tags']).toarray()

In [213]:
vector.shape

(974, 840)

#### **Calculating Cosine Similarity**

In [214]:
similarity=cosine_similarity(vector)

In [215]:
similarity[3]

array([0.77459667, 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.16903085, 0.25819889, 0.25819889, 0.13483997,
       0.25819889, 0.25819889, 0.25819889, 0.25819889, 0.25819889,
       0.25819889, 0.25819889, 0.25819889, 0.2       , 0.31622777,
       0.25819889, 0.16903085, 0.6       , 0.25819889, 0.6       ,
       0.25819889, 0.25819889, 0.25819889, 0.25819889, 0.2       ,
       0.25819889, 0.25819889, 0.47434165, 0.1490712 , 0.2236068 ,
       0.25819889, 0.25819889, 0.16903085, 0.31622777, 0.31622777,
       0.2       , 0.25819889, 0.25819889, 0.25819889, 0.25819889,
       0.25819889, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.4       , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

#### **Recommendation**

In [227]:
def recommend(song):
    song_index=songs[songs['music_name']==song].index[0]
    distances=similarity[song_index]
    song_list=sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])
    song_details={
        'music_name':[],
        'release':[],
        'singer':[],
        'thumbnail':[]
    }
    c=0
    for i in song_list:
        if(i[1]>=0.5 and i[1]!=1 and c<5):
            c+=1
            song_details['music_name'].append(songs.iloc[i[0]].music_name)
            song_details['release'].append(songs.iloc[i[0]].release)
            song_details['singer'].append(songs.iloc[i[0]].singer)
            song_details['thumbnail'].append(songs.iloc[i[0]].thumbnail)
    return song_details

In [228]:
recommend('Saathiya')

{'music_name': ['Saathiya', 'Ishq Kamaal', 'Guzarish', 'Tu Dis Da', 'Angana'],
 'release': [2022, 2020, 2014, 2022, 2020],
 'singer': ['Javed Ali',
  'Javed Ali',
  'Javed Ali',
  'Hamid Ali Naqeebi',
  'Shreya Ghoshal, Javed Ali'],
 'thumbnail': ['https://i.ytimg.com/vi/O79ciJI312g/hqdefault.jpg',
  'https://i.ytimg.com/vi/iBSTR8HpjNA/hqdefault.jpg',
  'https://i.ytimg.com/vi/ztPa6vkM-yY/hqdefault.jpg',
  'https://i.ytimg.com/vi/_8JiKTGsiP4/hqdefault.jpg',
  'https://i.ytimg.com/vi/DUaUlIXQt38/hqdefault.jpg']}

#### **Extract Similarity Array**

In [226]:
with open('../Pickle/similarity.pkl', 'wb') as file:
    pickle.dump(similarity, file)