In [4]:

#Loading required libraries.
import pandas as pd
import numpy as np
import re
import string
from pprint import pprint
from collections import Counter
import keras
import os
import json
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer




In [5]:

#Path for the dataset directory; every input file of this notebook is read from here.
glove_dir = '/content/drive/My Drive/Colab_Datasets/Hindi_Songs_Lyrics'
#Not used by the cells visible here; kept so that any other cell relying on the name still works.
embeddings_index = {}

#Loading the lyrics JSON into a dict.
#The context manager closes the file handle as soon as parsing finishes
#(a bare open() left the handle open for the lifetime of the kernel).
with open(os.path.join(glove_dir, 'lyrics.json'), encoding = 'utf-8') as f:
  lyrics_data_dict = json.load(f)

#Loading the song metadata table and the lyrics table from the same directory.
meta_data = pd.read_csv(os.path.join(glove_dir, 'combined.csv'))
lyrics_data_pddf = pd.read_csv(os.path.join(glove_dir, 'lyrics.csv'))

#Disabled one-off step, wrapped in a string literal so that it does not execute:
#it joins every value of lyrics_data_dict with spaces and writes the result to hindi_song_vocab.txt.
#Because the string is the last expression of the cell, the notebook echoes it as the cell output.
#NOTE(review): the spelling-correction cell reads hindi_song_vocab.txt, so this step presumably
#has to be run once on a fresh setup - TODO confirm.
'''
#Writing a text file for song vocab.
keys = list(lyrics_data_dict.keys())

temp = ''
for i in keys:
  temp = temp + ' ' + lyrics_data_dict[i]

text_file = open(r"/content/drive/My Drive/Colab_Datasets/Hindi_Songs_Lyrics/hindi_song_vocab.txt", "w+")
text_file.write(temp)
text_file.close()
'''



'\n#Writing a text file for song vocab.\nkeys = list(lyrics_data_dict.keys())\n\ntemp = \'\'\nfor i in keys:\n  temp = temp + \' \' + lyrics_data_dict[i]\n\ntext_file = open(r"/content/drive/My Drive/Colab_Datasets/Hindi_Songs_Lyrics/hindi_song_vocab.txt", "w+")\ntext_file.write(temp)\ntext_file.close()\n'

In [6]:

#Tokenizer: lower-cases the input and returns every run of word characters (regex \w+) as a list.
#Basically, a sentence is converted into a list of lower-case words; punctuation and whitespace are dropped.
def words(text):
    "List of the lower-case word tokens found in `text`."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

#Reading the vocabulary file and counting how often each word occurs in it.
#The file is read inside a context manager so the handle is closed right after reading
#(the previous one-liner never closed it).
with open('/content/drive/My Drive/Colab_Datasets/Hindi_Songs_Lyrics/hindi_song_vocab.txt', encoding = 'utf-8') as vocab_file:
    WORDS = Counter(words(vocab_file.read()))

#Function for the relative frequency of a word in the vocabulary.
def P(word, N = sum(WORDS.values())):
    """Probability of `word`.

    WORDS[word] is the number of times `word` occurs in the vocabulary (0 if it
    never occurs) and N is the total number of word occurrences. The default
    for N is computed once, at the moment this function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

#Entry point for spelling correction: pass one word, get back the best-scoring candidate.
def correction(word):
    "Most probable spelling correction for word."
    # Rank the candidate spellings by their vocabulary probability P.
    options = candidates(word)
    return max(options, key = P)

#Function for generating all possible real words from the given string.
def candidates(word):
    """Generate possible spelling corrections for word.

    Tried in order, returning the first non-empty result: the word itself if it
    is known, known words one edit away, known words two edits away, and
    finally the unchanged word wrapped in a list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away
    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away
    return [word]

def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    # Set comprehension: duplicates in the input collapse to a single entry.
    return {candidate for candidate in words if candidate in WORDS}

#Function for editing the given string. It returns every string that is one simple edit away from the input.
#A simple edit is a deletion (remove one letter), a transposition (swap two adjacent letters),
#a replacement (change one letter to another) or an insertion (add a letter).
#The strings returned are not necessarily real words, and the set can be big: for a word of length n there are
#n deletions, n-1 transpositions, 26n replacements and 26(n+1) insertions, 54n+25 in total (a few are duplicates).
def edits1(word):
    "All edits that are one edit away from `word`."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    edited = set()
    #Walk over every split point of the word, including the position after the last letter.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        for letter in alphabet:
            #Insertion of a letter at the split point.
            edited.add(head + letter + tail)
            if tail:
                #Replacement of the first letter after the split point.
                edited.add(head + letter + tail[1:])
        if tail:
            #Deletion of the first letter after the split point.
            edited.add(head + tail[1:])
        if len(tail) > 1:
            #Transposition of the two letters after the split point.
            edited.add(head + tail[1] + tail[0] + tail[2:])
    return edited

#Function for generating the strings which are two edits away from the input string.
def edits2(word):
    "All edits that are two edits away from `word`."
    #The first round of edits is computed immediately; the second round is produced lazily
    #by the returned generator (duplicates are not removed).
    first_round = edits1(word)
    return (twice_edited for once_edited in first_round for twice_edited in edits1(once_edited))


In [21]:

#Function for bag-of-words vectorization of one text column of a dataframe.
def bow_extractor(dataframe, variable_name, ngram_range = (1,3)):
  """Fit a CountVectorizer on the lower-cased texts of `dataframe[variable_name]`.

  Returns the fitted vectorizer together with the count matrix it produced.
  `ngram_range` is passed to CountVectorizer unchanged.
  """
  lowered_docs = [document.lower() for document in dataframe.loc[:, variable_name]]
  vectorizer = CountVectorizer(min_df = 1, ngram_range = ngram_range)
  features = vectorizer.fit_transform(lowered_docs)
  return vectorizer, features


#Function for turning a bag-of-words count matrix into a TF-IDF matrix.
def tfidf_transformer(bow_matrix):
  """Fit a TfidfTransformer (l2 norm, smoothed idf) on `bow_matrix`.

  Returns the fitted transformer and the transformed matrix.
  """
  tfidf_model = TfidfTransformer(use_idf = True, smooth_idf = True, norm = 'l2')
  weighted_matrix = tfidf_model.fit_transform(bow_matrix)
  return tfidf_model, weighted_matrix


#Function for vectorizing a query and transforming it into a TF-IDF matrix.
def query_vectorization(query, vectorizer, transformer):
  """Run `query` through an already fitted vectorizer and transformer.

  The query is wrapped in a one-element list before vectorizing.
  Returns (TF-IDF weights of the query, raw vectorizer output for the query).
  """
  raw_counts = vectorizer.transform([query])
  return transformer.transform(raw_counts), raw_counts


#Function for scoring every document against the query.
def cos_similarity(search_query_weights, tfidf_weights_matrix):
  """Cosine similarity between the query weights and the document weights.

  Returns the first row of the pairwise similarity matrix, i.e. the scores
  belonging to the first row of `search_query_weights`.
  """
  pairwise_scores = cosine_similarity(search_query_weights, tfidf_weights_matrix)
  return pairwise_scores[0]


#Function for suggesting the most similar documents.
def most_similar(similarity_list, min_talks = 1):
  """Indices of the `min_talks` highest scores, best first.

  similarity_list: 1-D sequence/array of similarity scores (not modified).
  min_talks: number of indices wanted; capped at the number of scores.
  Returns a list of distinct integer indices. Ties are resolved in favour of
  the lower index, which is the order repeated np.argmax would give.

  The earlier implementation overwrote each picked score with 0 in the
  caller's array and, once the remaining scores were all <= 0, kept returning
  index 0; it also raised on an empty input.
  """
  scores = np.asarray(similarity_list, dtype = float).ravel()
  wanted = max(0, min(int(min_talks), scores.size))
  #Stable sort on the negated scores gives descending order with ties kept in index order.
  ranking = np.argsort(-scores, kind = 'stable')
  return ranking[:wanted].tolist()


#Vectorizing all songs from the dataset: bag-of-words counts first, TF-IDF weights second.
#The fitted vectorizer and transformer are kept at module level so queries can reuse them.
bow_vectorizer, bow_features = bow_extractor(dataframe = lyrics_data_pddf, variable_name = 'Lyrics')
transformer, tfidf_matrix = tfidf_transformer(bow_matrix = bow_features)


#Function for searching.
def search_song(query, number_of_results = 1):
  """Print the songs whose lyrics are most similar to `query`.

  query: free text, typically a lyric fragment.
  number_of_results: how many songs to print.
  Returns None; results are printed.
  """
  #Correcting the query word by word to the nearest word of the local vocab.
  #correction() handles a single word: given the whole sentence it finds no candidate,
  #returns the text unchanged and still pays for two rounds of edits on a long string.
  query = ' '.join(correction(word) for word in words(query))
  #Vectorizing the query.
  query_tfidf, query_features = query_vectorization(query, bow_vectorizer, transformer)
  #List of similarity with all songs.
  similarity_list = cos_similarity(query_tfidf, tfidf_matrix)
  #Without any positive score there is nothing to rank; picking the maximum would just print song 0.
  if not np.any(similarity_list > 0):
    print("No matching song found.")
    return
  #Result of top matching songs.
  sim_songs = most_similar(similarity_list, min_talks = number_of_results)
  #Printing all the results.
  #NOTE(review): the indices come from the matrix built on lyrics_data_pddf but are looked up in
  #meta_data - assumes both tables list the songs in the same row order; TODO confirm.
  for i in sim_songs:
    print('\n', "Song: ", meta_data['Song Name'][i])
    print("Lyricist: ", meta_data['Lyricists_1'][i])
    #print("Music Director: ", meta_data['Music_Director_1'][i])
    print("Singer: ", meta_data['Singer_1'][i])
    print('\n', "Lyrics: ", meta_data['Lyrics'][i])
    print("-----------------------------------------------------------------------------------------", '\n')





In [22]:

#Sample queries kept for manual testing; they are commented out and only the call below runs.
#query = 'nOdha Jo Tune Mujhko\nSaans Laut Aayi\nChandni Ne Tan Pe Mere\nChaadar Bhichayee\nOdha Jo Tune Mujhko\nSaans Laut Aayi'
#query = 'ki inakaa aashiq, mein ban gayaa hoon'
#query = 'Dil ki diwali hai ishq'
#Active demo: a short lyric fragment, asking for the single best match.
#NOTE(review): 'legaye' is presumably a deliberately inexact spelling of "le gaye" - confirm.
search_song(query = 'mere dil ka legaye', number_of_results = 1)



 Song:  Tere Mast Mast Do Nain
Lyricist:  Faaiz Anwaar
Singer:  Rahat Fateh Ali Khan

 Lyrics:  

Taakte rehte tujhko saanjh savere
Nainon mein haye
Nainon mein haye...

Taakte rehte tujhko saanjh savere
Nainon mein basiya jaise nain yeh tere
Nainon mein basiya jaise nain yeh tere
Tere mast mast do nain
Mere dil ka le gaye chain
Mere dil ka le gaye chain
Tere mast mast do nain
[Tere mast mast do nain
Mere dil ka le gaye chain
Mere dil ka le gaye chain
Tere mast mast do nain]

Pehle pehal tujhe dekha toh dil mera
Dhadka haye dhadka dhadka haye
Pehle pehal tujhe dekha toh dil mera
Dhadka haye dhadka dhadka haye

Jal jal utha hoon main shola jo pyaar ka
Bhadka haye bhadka bhadka haye
Neendhon mein ghul gaye hain sapne jo tere
Badle se lag rahe hain andaaz mere
Badle se lag rahe hain andaaz mere

Tere mast mast do nain
Mere dil ka le gaye chain
Mere dil ka le gaye chain
Tere mast mast do nain
[Tere mast mast do nain
Mere dil ka le gaye chain
Mere dil ka le gaye chain
Tere mast mast do nai

In [23]:

#Combining all songs in one lower-cased string.
#str.join builds the result in a single pass instead of re-copying the growing string for every song.
#The songs are joined without any separator, exactly as before, so the length below is unchanged.
combined_text = ''.join(lyrics_data_pddf['Lyrics']).lower()

len(combined_text)


1694152

In [24]:

#Unique characters (not words) of the combined text, sorted, plus lookup tables in both directions:
#uchar_index maps character -> position, index_uchar maps position -> character.
unique_alpha = sorted(set(combined_text))
uchar_index = {char: position for position, char in enumerate(unique_alpha)}
index_uchar = dict(enumerate(unique_alpha))



In [25]:

#Cutting the text into overlapping windows of SEQUENCE_LENGTH characters, one every `step` characters.
#sentences holds the windows; next_chars holds the character that follows each window.
SEQUENCE_LENGTH = 40
step = 3
window_starts = range(0, len(combined_text) - SEQUENCE_LENGTH, step)
sentences = [combined_text[start: start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [combined_text[start + SEQUENCE_LENGTH] for start in window_starts]
print(f'num training examples: {len(sentences)}')


num training examples: 564704


In [28]:


#One-hot encoding of the training windows.
#X[i, t, c] is set when character c is at position t of window i;
#y[i, c] is set when character c is the one following window i.
#dtype is the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed in 1.24,
#where dtype = np.bool raises AttributeError.
X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(unique_alpha)), dtype = bool)
y = np.zeros((len(sentences), len(unique_alpha)), dtype = bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, uchar_index[char]] = 1
    y[i, uchar_index[next_chars[i]]] = 1



In [30]:
#Displaying one training window as a sanity check of the slicing above.
sentences[20]

'tha sa ishq laage\nkadvi judayi\nyaar mera'

In [34]:
#NOTE(review): the stripped string is not assigned, so combined_text itself stays unchanged;
#this line only asks the notebook to display the whole stripped text. Confirm whether an
#assignment was intended or whether the cell can be dropped.
combined_text.strip()



In [None]:

#Function for creating the TF-IDF weight matrices of the documents and of the query.
def tf_idf(search_keys, dataframe, label):
  """Fit a default TfidfVectorizer on `dataframe[label]` and apply it to the query.

  Returns (weights of `search_keys`, weights of all documents).
  """
  tfidf_vectorizer = TfidfVectorizer()
  documents = dataframe.loc[:, label]
  #The vectorizer has to be fitted on the documents before the query can be transformed.
  tfidf_weights_matrix = tfidf_vectorizer.fit_transform(documents)
  search_query_weights = tfidf_vectorizer.transform([search_keys])
  return search_query_weights, tfidf_weights_matrix

#Function for scoring every document against the query.
#Note: this redefines cos_similarity from the search cell with the same behavior.
def cos_similarity(search_query_weights, tfidf_weights_matrix):
  """Cosine similarity between the query weights and the document weights.

  Returns the first row of the pairwise similarity matrix, i.e. the scores
  belonging to the first row of `search_query_weights`.
  """
  pairwise_scores = cosine_similarity(search_query_weights, tfidf_weights_matrix)
  return pairwise_scores[0]

#Function for suggesting the most similar documents.
def most_similar(similarity_list, min_talks = 1):
  """Indices of the `min_talks` highest scores, best first.

  similarity_list: 1-D sequence/array of similarity scores (not modified).
  min_talks: number of indices wanted; capped at the number of scores.
  Returns a list of distinct integer indices. Ties are resolved in favour of
  the lower index, which is the order repeated np.argmax would give.

  The earlier implementation overwrote each picked score with 0 in the
  caller's array and, once the remaining scores were all <= 0, kept returning
  index 0; it also raised on an empty input.
  """
  scores = np.asarray(similarity_list, dtype = float).ravel()
  wanted = max(0, min(int(min_talks), scores.size))
  #Stable sort on the negated scores gives descending order with ties kept in index order.
  ranking = np.argsort(-scores, kind = 'stable')
  return ranking[:wanted].tolist()



#Manual test of tf_idf, cos_similarity and most_similar on a long lyric fragment.
#The names assigned here are module-level and stay defined after the cell has run.
search_keys = 'ki inakaa aashiq, mein ban gayaa hoon mujhako basaa le, iname tu mujhase yeh har ghadi, meraa dil kahe tum hi ho usaki aarzoo mujhase yeh har ghadi, mere lab kahe teri hi ho sab guftagoo baatein teri itni haseen, main yaad inko jab kartaa hoon phoolon si aaye, khushaboo'
dataframe = lyrics_data_pddf
label = 'Lyrics'

#Fit TF-IDF on the lyrics column, score the query against every song, then display the
#indices of the 10 best matches (the last expression is the cell output).
search_query_weights, tfidf_weights_matrix = tf_idf(search_keys, dataframe, label)
similarity_list = cos_similarity(search_query_weights, tfidf_weights_matrix)
most_similar(similarity_list, min_talks = 10)



