In [1]:
import os
import sys
import string
import numpy as np
import pandas as pd
from tensorflow import keras
from tqdm import tqdm
import pickle

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Load the lyrics table; the file is tab-separated despite its .csv extension.
df = pd.read_csv("Data/musicoset_songfeatures/lyrics.csv", delimiter="\t")

In [4]:
df.head()

Unnamed: 0,song_id,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,3KkXRkHbMCARz0aVfEt68P,
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20404 entries, 0 to 20403
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   song_id  20404 non-null  object
 1   lyrics   19663 non-null  object
dtypes: object(2)
memory usage: 318.9+ KB


In [6]:
df.isna().sum()

song_id      0
lyrics     741
dtype: int64

In [7]:
# Drop rows with a missing value (only `lyrics` has any) and renumber the rows
# from 0 in a single, re-runnable step. reset_index(drop=True) discards the old
# index instead of turning it into an 'index' column that then has to be dropped
# (re-running that separate drop cell raised a KeyError).
df = df.dropna().reset_index(drop=True)

In [10]:
# `sep` only separates multiple positional arguments, so it had no effect here.
print(df.lyrics)

0        ['[Verse 1]\nThought I\'d end up with Sean\nBu...
1        ["[Verse 1]\nFound you when your heart was bro...
2        ['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun...
3        ["[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t...
4        ["[Intro]\nI-I-I don't want a lot for Christma...
                               ...                        
19658    ['[Verse 1: Big Boi]\nWell, it\'s the M-I-croo...
19659    ['[Intro]\nThere are times when I look above a...
19660    ["[Intro: Prodigy and Havoc]\nWord up son, wor...
19661    ["[Chorus]\nWee-ooh wim-o-weh. Wee-ooh wim-o-w...
19662    ['[Intro: Shaq]\nYo Jef, why don\'t you give m...
Name: lyrics, Length: 19663, dtype: object


* We need to remove the punctuation and the square-bracket section tags (`[...]`) from this text.

### Preprocessing examples that we need to do
* **1. Remove Punctuations**

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Define translator

In [12]:
# Translation table mapping every ASCII punctuation character to None (i.e. delete it).
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [13]:
# Example of removing punctuation.
# Raw string: "\ " is not a valid escape sequence, so the plain literal triggers
# an invalid-escape warning on recent Python versions. The value is unchanged.
xyz = r"next\ Thank you,@ next\ Thank you. next Yeah, yee'"
xyz.translate(translator)

'next Thank you next Thank you next Yeah yee'

Note: all punctuation has been removed from the text above.

* **Split the text into verses, pre-chorus, and intro**

In [14]:
# Break the lyrics at row 1 into sections on the literal "\n\n" separator
# (backslash-n text, not real newlines) and look at the first two.
second_song = df["lyrics"][1]
xyz = second_song.split('\\n\\n')
xyz[:2]

['["[Verse 1]\\nFound you when your heart was broke\\nI filled your cup until it overflowed\\nTook it so far to keep you close (Keep you close)\\nI was afraid to leave you on your own',
 "[Pre-Chorus]\\nI said I'd catch you if you fall (Fall)\\nAnd if they laugh, then fuck 'em all (All)\\nAnd then I got you off your knees\\nPut you right back on your feet\\nJust so you could take advantage of me"]

In [15]:
# Slice out the text between the first '[' and the first ']' of the first section.
# NOTE(review): the first '[' is the opening bracket of the stringified list, not
# the section header, so the result keeps a leading quote and bracket.
key= xyz[0][xyz[0].find('[') + 1:xyz[0].find(']')].strip()
key

'"[Verse 1'

Some section headers, such as the intro, carry a subheading after the section name inside the square brackets (e.g. `[Intro: Shaq]`); we need to keep only the section name (`Intro`).

In [16]:
# Same split for the last row, whose first header carries an artist name.
last_song = df["lyrics"][19662]
xyz = last_song.split('\\n\\n')
xyz[:2]

["['[Intro: Shaq]\\nYo Jef, why don\\'t you give me a hoopa beat or something\\nSomething I can go to the park to\\nYeah, there you go, allright, I like that, I like that\\nIt sound dope",
 "(Bust \\'em in the eye Shaq)"]

In [17]:
# Text between the first '[' and the first ']' of the first section.
# NOTE(review): as before, the first '[' belongs to the stringified list, so the
# key starts with a quote and a bracket rather than with the section name.
key= xyz[0][xyz[0].find('[')+1:  xyz[0].find(']')].strip()
key

"'[Intro: Shaq"

In [18]:
# Keep only the part of the header before the first ':'.
# partition() hands back the whole string unchanged when there is no ':'.
key = key.partition(':')[0]

key

"'[Intro"

In [19]:
# Take everything after the first closing square bracket.
# The +1 skips the ']' character itself.
xyz[0][xyz[0].find(']')+1:]

"\\nYo Jef, why don\\'t you give me a hoopa beat or something\\nSomething I can go to the park to\\nYeah, there you go, allright, I like that, I like that\\nIt sound dope"

# Clean the text as in the examples above

In [20]:
df.head()

Unnamed: 0,song_id,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."
4,0bYg9bo50gSsH3LtXe2SQn,"[""[Intro]\nI-I-I don't want a lot for Christma..."


In [21]:
df.lyrics

0        ['[Verse 1]\nThought I\'d end up with Sean\nBu...
1        ["[Verse 1]\nFound you when your heart was bro...
2        ['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun...
3        ["[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t...
4        ["[Intro]\nI-I-I don't want a lot for Christma...
                               ...                        
19658    ['[Verse 1: Big Boi]\nWell, it\'s the M-I-croo...
19659    ['[Intro]\nThere are times when I look above a...
19660    ["[Intro: Prodigy and Havoc]\nWord up son, wor...
19661    ["[Chorus]\nWee-ooh wim-o-weh. Wee-ooh wim-o-w...
19662    ['[Intro: Shaq]\nYo Jef, why don\'t you give m...
Name: lyrics, Length: 19663, dtype: object

In [22]:
# Flatten every song into its sections, splitting on the literal "\n\n" separator.
# Building the list in one expression keeps the cell idempotent: re-running it
# rebuilds `text` instead of appending a second copy of every section.
text = [verse for lyrics in df.lyrics for verse in lyrics.split("\\n\\n")]

In [24]:
# Section headers whose lines are kept for training.
replace_list = [
    '[Verse 1]',
    '[Verse 2]',
    '[Verse 3]',
    '[Verse 4]',
    '[Chorus]',
]

In [25]:
text[0]

'[\'[Verse 1]\\nThought I\\\'d end up with Sean\\nBut he wasn\\\'t a match\\nWrote some songs about Ricky\\nNow I listen and laugh\\nEven almost got married\\nAnd for Pete, I\\\'m so thankful\\nWish I could say, "Thank you" to Malcolm\\n\\\'Cause he was an angel'

In [26]:
# Collect the lyric lines of every section that carries one of the tags in
# replace_list, lower-cased and with punctuation removed.
# The list is created in this cell so that re-running it does not append duplicates.
cleaned_text = []
for section in tqdm(text):
    # any(): a section mentioning several tags is added once, not once per matching tag.
    if not any(tag in section for tag in replace_list):
        continue
    # Drop everything up to and including the first ']' (the section header).
    body = section[section.find(']') + 1:]
    # Lines are separated by the literal text "\n"; lines of length <= 1 are skipped.
    # The translator already deletes '(' and ')', so no separate replace() is needed.
    cleaned_text.extend(
        line.lower().translate(translator)
        for line in body.split('\\n')
        if len(line) > 1
    )


100%|██████████████████████████████████████████████████████████████████████| 486430/486430 [00:01<00:00, 251227.58it/s]


In [28]:
len(cleaned_text)


237764

In [29]:
text[0:5]

['[\'[Verse 1]\\nThought I\\\'d end up with Sean\\nBut he wasn\\\'t a match\\nWrote some songs about Ricky\\nNow I listen and laugh\\nEven almost got married\\nAnd for Pete, I\\\'m so thankful\\nWish I could say, "Thank you" to Malcolm\\n\\\'Cause he was an angel',
 "[Pre-Chorus]\\nOne taught me love\\nOne taught me patience\\nAnd one taught me pain\\nNow, I\\'m so amazing\\nSay I\\'ve loved and I\\'ve lost\\nBut that\\'s not what I see\\nSo, look what I got\\nLook what you taught me\\nAnd for that, I say",
 "[Chorus]\\nThank you, next (Next)\\nThank you, next (Next)\\nThank you, next\\nI\\'m so fuckin\\' grateful for my ex\\nThank you, next (Next)\\nThank you, next (Next)\\nThank you, next (Next)\\nI\\'m so fuckin\\'—",
 "[Verse 2]\\nSpend more time with my friends\\nI ain\\'t worried \\'bout nothin\\'\\nPlus, I met someone else\\nWe havin\\' better discussions\\nI know they say I move on too fast\\nBut this one gon\\' last\\n\\'Cause her name is Ari\\nAnd I\\'m so good with that (So 

In [30]:
cleaned_text[:10]

['thought id end up with sean',
 'but he wasnt a match',
 'wrote some songs about ricky',
 'now i listen and laugh',
 'even almost got married',
 'and for pete im so thankful',
 'wish i could say thank you to malcolm',
 'cause he was an angel',
 'thank you next next',
 'thank you next next']

In [31]:
# Persist the cleaned lines (pickle format, despite the .txt suffix).
# The directory is spelled "Data" to match the folder the CSV is read from;
# the lower-case spelling only resolves on case-insensitive file systems.
with open("Data/cleaned_text.txt", 'wb') as fp:
    pickle.dump(cleaned_text, fp)

# Now we will convert text into vectors

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


### Tokenize the texts

In [34]:
# Fit a Keras Tokenizer on the cleaned lines; the fitted object exposes the
# word_index and word_counts used in the cells below.
tokenizer= Tokenizer()
tokenizer.fit_on_texts(cleaned_text)

In [35]:
tokenizer.word_index

{'i': 1,
 'you': 2,
 'the': 3,
 'to': 4,
 'and': 5,
 'a': 6,
 'me': 7,
 'my': 8,
 'it': 9,
 'in': 10,
 'that': 11,
 'on': 12,
 'im': 13,
 'your': 14,
 'of': 15,
 'love': 16,
 'all': 17,
 'be': 18,
 'dont': 19,
 'like': 20,
 'know': 21,
 'for': 22,
 'so': 23,
 'but': 24,
 'we': 25,
 'is': 26,
 'just': 27,
 'up': 28,
 'with': 29,
 'its': 30,
 'when': 31,
 'no': 32,
 'got': 33,
 'this': 34,
 'do': 35,
 'what': 36,
 'baby': 37,
 'oh': 38,
 'if': 39,
 'can': 40,
 'yeah': 41,
 'get': 42,
 'now': 43,
 'out': 44,
 'youre': 45,
 'go': 46,
 'was': 47,
 'down': 48,
 'she': 49,
 'one': 50,
 'cause': 51,
 'cant': 52,
 'never': 53,
 'they': 54,
 'time': 55,
 'want': 56,
 'say': 57,
 'way': 58,
 'see': 59,
 'make': 60,
 'let': 61,
 'her': 62,
 'come': 63,
 'back': 64,
 'aint': 65,
 'girl': 66,
 'not': 67,
 'wanna': 68,
 'take': 69,
 'have': 70,
 'are': 71,
 'how': 72,
 'at': 73,
 'ill': 74,
 'from': 75,
 'right': 76,
 'were': 77,
 'he': 78,
 'night': 79,
 'feel': 80,
 'been': 81,
 'gonna': 82,
 'need

In [36]:
# Per-word occurrence counts collected by the tokenizer.
frequency=tokenizer.word_counts

### We will drop words that occur fewer than 7 times

In [37]:
# Words counted fewer than this many times are treated as uncommon.
MIN_FREQUENCY = 7

In [38]:
# Partition the vocabulary at MIN_FREQUENCY: the rare words as a set, the kept
# words as an alphabetically sorted list.
uncommon_words = {word for word, count in frequency.items() if count < MIN_FREQUENCY}

words = sorted(word for word, count in frequency.items() if count >= MIN_FREQUENCY)


In [39]:
# Spot-check the occurrence counts of a few rare words.
for rare_word in ('loop', 'ageless', 'mounted', 'gusto', 'educations', 'gumpin', 'mich'):
    print(frequency[rare_word])

3
3
2
1
1
1
3


**Note: these words will not have much impact because they are used very rarely.**

In [40]:
# Number of words kept after the MIN_FREQUENCY filter.
VOCAB_SIZE= len(words)
VOCAB_SIZE

7233

In [41]:
# Word <-> id lookup tables for the kept vocabulary.
# The ids are taken from tokenizer.word_index because that is the numbering
# texts_to_sequences() emits. Numbering `words` with enumerate() (alphabetical
# position) gave ids unrelated to the encoded text, so decoding a predicted id
# through indices_word returned the wrong word.
# NOTE: Keras never assigns id 0 to a word, so these ids start at 1.
# NOTE(review): the largest id can therefore equal len(words) — confirm that the
# model's output size downstream allows for it.
word_indices = {w: tokenizer.word_index[w] for w in words}

indices_word = {i: w for w, i in word_indices.items()}

**Note: save these dictionaries; we will need them at text-generation time.**

In [42]:
# Persist the word -> id table with pickle.
# "Data" matches the spelling of the folder the CSV is read from; the lower-case
# spelling only resolves on case-insensitive file systems.
with open('Data/word_indices','wb') as fp:
    pickle.dump(word_indices, fp)

In [43]:
# Persist the id -> word table with pickle.
# "Data" matches the spelling of the folder the CSV is read from; the lower-case
# spelling only resolves on case-insensitive file systems.
with open('Data/indices_word','wb') as fp:
    pickle.dump(indices_word, fp)

#### Convert text into numbers

In [44]:
# Encode each cleaned line as a list of the tokenizer's integer word ids.
encoded_text= tokenizer.texts_to_sequences(cleaned_text)


In [45]:
# Show the first five encoded lines.
print(encoded_text[:5])

[[219, 162, 264, 28, 29, 7234], [24, 78, 542, 6, 1638], [1063, 110, 795, 95, 6671], [43, 1, 421, 5, 689], [164, 748, 33, 1298]]


### Here we create the features and labels: each feature is a sequence of 4 words, and its label is the word that follows

In [58]:
# x collects the input windows; y collects the word id that follows each window.
x=[]
y=[]

In [47]:
cleaned_text[:5]


['thought id end up with sean',
 'but he wasnt a match',
 'wrote some songs about ricky',
 'now i listen and laugh',
 'even almost got married']

In [49]:
# Number of preceding word ids that make up one input window.
MIN_SEQ= 4

In [59]:
# Build (window, next-word) training pairs from every encoded line.
# x and y are (re)created here so that re-running this cell does not append a
# second copy of every pair.
x, y = [], []
for line_ids in tqdm(encoded_text):
    # Lines with MIN_SEQ ids or fewer give an empty range and contribute nothing.
    for pos in range(MIN_SEQ, len(line_ids)):
        # Only the label is checked: a pair is skipped when its next-word id is
        # not a key of indices_word. Ids inside the window are not filtered.
        if line_ids[pos] in indices_word:
            x.append(line_ids[pos - MIN_SEQ:pos])
            y.append(line_ids[pos])

100%|██████████████████████████████████████████████████████████████████████| 237764/237764 [00:01<00:00, 194838.87it/s]


In [51]:
x[:5]

[[219, 162, 264, 28],
 [24, 78, 542, 6],
 [1063, 110, 795, 95],
 [43, 1, 421, 5],
 [5, 22, 7235, 13]]

In [52]:
y[:10]

[29, 1638, 6671, 689, 23, 4725, 806, 2, 4, 564]

### Save preprocessed features and labels

In [136]:
# Persist the feature windows with pickle (binary content despite the .txt suffix).
with open("Data/x.txt", mode='wb') as out_file:
    pickle.dump(x, out_file)

In [137]:
# Read the pickle written by the previous cell back in, for a round-trip check.
with open("Data/x.txt", mode='rb') as in_file:
    loaded_x = pickle.load(in_file)

In [138]:
loaded_x == x

True

In [53]:
# Persist the labels with pickle (binary content despite the .txt suffix).
# The path is spelled "Data/y.txt" so that it is the same file the next cell
# reads back; "data/y.txt" only matched on case-insensitive file systems.
with open("Data/y.txt", 'wb') as fp:
  pickle.dump(y, fp)

In [54]:
# Read the pickled labels back in, for a round-trip check.
with open("Data/y.txt", mode='rb') as in_file:
    loaded_y = pickle.load(in_file)

In [55]:
loaded_y == y

True