In [116]:

#Loading required libraries.
import pandas as pd
import numpy as np
import re
import string
from pprint import pprint
from collections import Counter
import keras
import os
import json
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer




In [117]:
#Directory that holds the Hindi song lyrics dataset files.
glove_dir = '/content/drive/My Drive/Colab_Datasets/Hindi_Songs_Lyrics'
#NOTE(review): embeddings_index is not used by any visible cell; kept in case a later cell reads it.
embeddings_index = {}

#Parse the lyrics JSON. The context manager closes the file handle as soon as
#parsing finishes (the handle was previously left open for the kernel's lifetime).
with open(os.path.join(glove_dir, 'lyrics.json'), encoding = 'utf-8') as f:
  lyrics_data_dict = json.load(f)

#Load the two CSV tables from the same directory.
meta_data = pd.read_csv(os.path.join(glove_dir, 'combined.csv'))
lyrics_data_pddf = pd.read_csv(os.path.join(glove_dir, 'lyrics.csv'))




In [118]:


def bow_extractor(dataframe, variable_name, ngram_range = (1,3)):
  """Fit a CountVectorizer on one column of a dataframe.

  Args:
    dataframe: DataFrame holding the text documents.
    variable_name: Label of the column whose entries are vectorized.
    ngram_range: (min_n, max_n) tuple forwarded to CountVectorizer.

  Returns:
    Tuple (count_vectorizer, term_counts): the fitted CountVectorizer and
    the result of its fit_transform on the selected column.
  """
  count_vectorizer = CountVectorizer(ngram_range = ngram_range, min_df = 1)
  documents = dataframe.loc[:, variable_name]
  term_counts = count_vectorizer.fit_transform(documents)
  return count_vectorizer, term_counts


def tfidf_transformer(bow_matrix):
  """Fit a TfidfTransformer on a bag-of-words matrix and apply it.

  Args:
    bow_matrix: Count matrix, e.g. the features returned by bow_extractor.

  Returns:
    Tuple (tfidf_model, weighted): the fitted TfidfTransformer (l2 norm,
    smoothed idf) and the transformed matrix.
  """
  tfidf_model = TfidfTransformer(use_idf = True, smooth_idf = True, norm = 'l2')
  weighted = tfidf_model.fit_transform(bow_matrix)
  return tfidf_model, weighted


def query_vectorization(query, vectorizer, transformer):
  """Turn a single query string into count features and TF-IDF weights.

  Args:
    query: The query text; it is wrapped in a one-element list before
      being handed to the vectorizer.
    vectorizer: Object with a transform(list_of_documents) method.
    transformer: Object with a transform(features) method.

  Returns:
    Tuple (query_tfidf, query_features): the transformer output followed by
    the raw vectorizer output it was computed from.
  """
  counts = vectorizer.transform([query])
  return transformer.transform(counts), counts


def cos_similarity(search_query_weights, tfidf_weights_matrix):
  """Score every document against the query with cosine similarity.

  Args:
    search_query_weights: Weight vector(s) for the query.
    tfidf_weights_matrix: Weight matrix for the documents.

  Returns:
    The first row of the cosine_similarity result for the two inputs.
  """
  # Only row 0 of the pairwise result is kept.
  pairwise = cosine_similarity(search_query_weights, tfidf_weights_matrix)
  return pairwise[0]


def most_similar(similarity_list, min_talks = 1):
  """Return the indices of the highest similarity scores, best first.

  The input is left untouched, and an index is never returned more than once.
  Ties are ordered by lowest index first, which matches the order that
  repeated np.argmax calls produce.

  Args:
    similarity_list: 1-D sequence or array of similarity scores.
    min_talks: Number of indices wanted. Values <= 0 yield an empty list.

  Returns:
    List of at most min_talks indices into similarity_list, sorted by
    descending score. Shorter than min_talks only when fewer scores exist.
  """
  scores = np.asarray(similarity_list)
  if min_talks <= 0 or scores.size == 0:
    return []
  # Stable sort on the negated scores gives descending order with ties kept
  # in index order, without zeroing entries of the caller's array.
  ranked = np.argsort(-scores, kind = 'stable')
  return list(ranked[:min_talks])





In [120]:


# Fit the bag-of-words model on the 'Lyrics' column, then derive TF-IDF weights from it.
bow_vectorizer, bow_features = bow_extractor(dataframe = lyrics_data_pddf, variable_name = 'Lyrics')
transformer, tfidf_matrix = tfidf_transformer(bow_matrix = bow_features)



In [122]:

# Alternative multi-line query kept for experimentation:
#query = 'nOdha Jo Tune Mujhko\nSaans Laut Aayi\nChandni Ne Tan Pe Mere\nChaadar Bhichayee\nOdha Jo Tune Mujhko\nSaans Laut Aayi'
query = 'ki inakaa aashiq, mein ban gayaa hoon'
query_tfidf, query_features = query_vectorization(query = query, vectorizer = bow_vectorizer, transformer = transformer)

# Score every lyrics row against the query and keep the ten best row indices.
similarity_list = cos_similarity(search_query_weights = query_tfidf, tfidf_weights_matrix = tfidf_matrix)
sim_songs = most_similar(similarity_list = similarity_list, min_talks = 10)
print(sim_songs)

# NOTE(review): the indices were computed from lyrics_data_pddf but are looked up
# in meta_data; this assumes both frames share the same row labels -- confirm.
for i in sim_songs:
  print('\n', "Song: ", meta_data['Song Name'][i])
  for caption, column in (("Lyricist: ", 'Lyricists_1'), ("Singer: ", 'Singer_1')):
    #print("Music Director: ", meta_data['Music_Director_1'][i])
    print(caption, meta_data[column][i])
  print('\n', "Lyrics: ", meta_data['Lyrics'][i])
  print("-----------------------------------------------------------------------------------------", '\n')



[5, 1168, 670, 265, 382, 4, 1114, 1094, 1238, 744]

 Song:  Maula Mere Maula Mere
Lyricist:  -
Singer:  Roop Kumar Rathod

 Lyrics:  Maula mere, maula mere 
maula mere, maula mere - 4
aankhein teri - 2, kitni haseen
ki inkaa aashiq, mein ban gayaa hoon
mujhako basaa le, iname tu
(ishq hai
maula mere, maula mere maula mere, maula mere - 2) - 3

ki inakaa aashiq, mein ban gayaa hoon
mujhako basaa le, iname tu

mujhase yeh har ghadi, meraa dil kahe
tum hi ho usaki aarzoo
mujhase yeh har ghadi, mere lab kahe
teri hi ho sab guftagoo
baatein teri itni haseen, main yaad inko jab kartaa hoon
phoolon si aaye, khushaboo

rakh loon chhupaa ke main kahin tujhako
saayaa bhi teraa naa main doon
rakh loon banaa ke kahin ghar, main tujhe
saath tere, main hi rahoon
julfen teri, itni ghani
dekh ke inako, yeh sochataa hoon
saaye me, inake main jiyoon
(ishq hai
maula mere, maula mere maula mere, maula mere - 2) - 3

(meraa dil yahi bolaa, meraa dil yahi bolaa,
yaara raaj yeh usane hai mujh par kholaa
ki h

In [112]:

# Spot-check one row: print its lyrics, then the song name stored under the same label.
num = 13
sample_lyrics = lyrics_data_pddf['Lyrics'][num]
print(sample_lyrics)
sample_title = meta_data['Song Name'][num]
print(sample_title)


Ab duniya se shikwa nahi hai
Na raha ab zindagi se gila
Kis ka karam hai, kis ki dua hai
Ya naseebon se tu mujh ko mila
Ab jo mile ho, door na jaana
Tere bina dil lagta nahi
Mera jeena hai kya, marna hai kya, jab saath tera nahi
Paake rab kya karun, jab tu hi mera nahi
Mera jeena hai kya, marna hai kya, jab saath tera nahi
Paake rab kya karun, jab tu hi mera nahi

I never gonna live without you baby now
You'll be only one who'll always be my fantasy
You'll be only one to make me reach my ecstacy
I don't want nobody else but you
I'm lovin you, I'm lovin you

Berang se din the, tanha thi raatein
Kuch na badalta, jo tum na aate
Berang se din the, tanha thi raatein
Kuch na badalta, jo tum na aate
Kehne ko yun toh rishte kayi hain
Rishton mein koi apna nahi
Mera jeena hai kya, marna hai kya, jab saath tera nahi
Paake rab kya karun, jab tu hi mera nahi
Mera jeena hai kya, marna hai kya, jab saath tera nahi
Paake rab kya karun, jab tu hi mera nahi

Kyun chuph gaya hai, akhiyon ko meechay
Shar

In [None]:

def tf_idf(search_keys, dataframe, label):
  """Build TF-IDF weights for a document column and for a search query.

  Args:
    search_keys: Query text; wrapped in a one-element list before transforming.
    dataframe: DataFrame holding the documents.
    label: Label of the column the TfidfVectorizer is fitted on.

  Returns:
    Tuple (query_weights, corpus_weights): the transformed query and the
    fit_transform result for the selected column, in that order.
  """
  vectorizer = TfidfVectorizer()
  corpus = dataframe.loc[:, label]
  corpus_weights = vectorizer.fit_transform(corpus)
  # The query reuses the vocabulary fitted on the corpus above.
  query_weights = vectorizer.transform([search_keys])
  return query_weights, corpus_weights

# NOTE(review): a function with this same name is already defined in an earlier
# cell; once this cell runs, this definition is the one in effect.
def cos_similarity(search_query_weights, tfidf_weights_matrix):
  """Compute cosine similarity between the query weights and each document.

  Args:
    search_query_weights: Weight vector(s) for the query.
    tfidf_weights_matrix: Weight matrix for the documents.

  Returns:
    Row 0 of the cosine_similarity output for the two inputs.
  """
  similarity_rows = cosine_similarity(search_query_weights, tfidf_weights_matrix)
  return similarity_rows[0]

# NOTE(review): a function with this same name is already defined in an earlier
# cell; once this cell runs, this definition is the one in effect.
def most_similar(similarity_list, min_talks = 1):
  """Return the indices of the highest similarity scores, best first.

  The input is left untouched, and an index is never returned more than once.
  Ties are ordered by lowest index first, which matches the order that
  repeated np.argmax calls produce.

  Args:
    similarity_list: 1-D sequence or array of similarity scores.
    min_talks: Number of indices wanted. Values <= 0 yield an empty list.

  Returns:
    List of at most min_talks indices into similarity_list, sorted by
    descending score. Shorter than min_talks only when fewer scores exist.
  """
  scores = np.asarray(similarity_list)
  if min_talks <= 0 or scores.size == 0:
    return []
  # Stable sort on the negated scores gives descending order with ties kept
  # in index order, without zeroing entries of the caller's array.
  ranked = np.argsort(-scores, kind = 'stable')
  return list(ranked[:min_talks])



# Smoke test of the TfidfVectorizer-based pipeline with a longer query.
search_keys = 'ki inakaa aashiq, mein ban gayaa hoon mujhako basaa le, iname tu mujhase yeh har ghadi, meraa dil kahe tum hi ho usaki aarzoo mujhase yeh har ghadi, mere lab kahe teri hi ho sab guftagoo baatein teri itni haseen, main yaad inko jab kartaa hoon phoolon si aaye, khushaboo'
dataframe = lyrics_data_pddf
label = 'Lyrics'

# Weights for the query and corpus, then per-document scores.
search_query_weights, tfidf_weights_matrix = tf_idf(search_keys = search_keys, dataframe = dataframe, label = label)
similarity_list = cos_similarity(search_query_weights = search_query_weights, tfidf_weights_matrix = tfidf_weights_matrix)
# Last expression of the cell, so the ten best indices are displayed as its output.
most_similar(similarity_list = similarity_list, min_talks = 10)



