In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('data/tcc_ceds_music.csv')

In [3]:
df = df[['artist_name', 'track_name', 'genre', 'len',
       'lyrics', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
       'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
       'topic']]

In [4]:
df = df[df['len'] >= 75]

In [31]:
import nltk
nltk.download('words')

from nltk.corpus import words

def is_english_word(word):
    word = word.lower()
    english_words = set(words.words())
    return word in english_words

# Example usage
word1 = 'hello'
word2 = 'hyde'

print(f'{word1} is an English word: {is_english_word(word1)}')
print(f'{word2} is an English word: {is_english_word(word2)}')


hello is an English word: True
hyde is an English word: False


[nltk_data] Downloading package words to /Users/shri/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [37]:
import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import words

def is_english(row):
    # Tokenize the text into words
    tokens = wordpunct_tokenize(row['lyrics'].lower())
    
    # Get a set of English stopwords
    english_words = set(words.words())
    
    # Calculate the ratio of English words to total words
    english_word_ratio = sum(1 for word in tokens if word.lower() in english_words) / len(tokens)
    
    # If more than 50% of words are in English stopwords, classify as English
    if english_word_ratio > 0.8:
        return 'English'
    else:
        return "Not English"

In [39]:
tqdm.pandas()

In [40]:
df['language'] = df.progress_apply(is_english, axis=1)

100%|██████████| 11008/11008 [09:03<00:00, 20.25it/s]


In [59]:
df.to_csv('data/tcc_ceds_music.csv', index=False)

In [58]:
df = df[df['language'] == 'English']

In [45]:
df['language'] == 'English'

0         True
1         True
2        False
3         True
4        False
         ...  
11003    False
11004    False
11005    False
11006     True
11007    False
Name: language, Length: 11008, dtype: bool

In [None]:
chord = pd.read_csv('data/archive-3/chords_and_lyrics.csv')

In [238]:
def get_intro_data(row):
    if 'Intro' in str(row['chords&lyrics']):
        return True
    else:
        return False

In [239]:
chord['Intro'] = chord.progress_apply(get_intro_data, axis=1)

100%|██████████| 135783/135783 [00:00<00:00, 293633.70it/s]


In [240]:
chord = chord[chord['Intro']]

In [241]:
import re

def Extract_Chords(row):
  """Removes all punctuations except for space, alphanumerics, and ,."""
  allowed_chars = set("abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,./#")
  text = row['chords']
  text_without_punctuation =  ''.join([char for char in text if char in allowed_chars])
  start_index = text_without_punctuation.find("Intro") + len("Intro")
  end_condition = False
  start_condition = True
  for i in range(start_index, len(text_without_punctuation)):
    if text_without_punctuation[i] in set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") and start_condition:
        start_condition = False
        start_point = i
        end_condition = True
    if text_without_punctuation[i] in ".?!,;" and end_condition:
        end_point = i
        break
  try:
    output = re.sub(r"\s+", " ", text_without_punctuation[start_point: end_point]).strip(' ').split(' ')
  except:
    output = None
  return output


In [242]:
chord['progression'] = chord.progress_apply(Extract_Chords, axis=1)

100%|██████████| 65293/65293 [00:02<00:00, 24262.88it/s]


In [243]:
def extract_4_chords(row):
    try:
        if len(row['progression']) == 4:
            return True
        else:
            return False
    except:
        return False

In [244]:
chord['four_progression'] = chord.progress_apply(extract_4_chords, axis=1)

100%|██████████| 65293/65293 [00:00<00:00, 353153.25it/s]


In [245]:
chord = chord[chord['four_progression']]

In [246]:
chord.columns

Index(['Unnamed: 0', 'artist_name', 'song_name', 'chords&lyrics', 'chords',
       'lyrics', 'tabs', 'lang', 'artist_id', 'followers', 'genres',
       'popularity', 'name_e_chords', 'Intro', 'progression',
       'four_progression'],
      dtype='object')

In [247]:
chord['progression']

10         [F#m, A, C#m, B]
21        [E, C#m, G#m, F#]
22          [D, Bm, E, F#m]
34           [C, Em, Am, G]
44           [Am, G, F, Dm]
                ...        
135720        [B, E, A, F#]
135728        [D, C, A, 2x]
135733        [D, G, C, Am]
135753       [F#m, E, A, E]
135767       [ttG, C, G, D]
Name: progression, Length: 16872, dtype: object

In [248]:
chord['lang'].unique()

array(['en', 'es', 'id', 'et', 'af', 'it', 'pt', 'tl', 'nl', 'sw', 'so',
       'hr', 'cy', 'de', 'fi', 'fr', 'sv', 'hu', nan, 'tr', 'ca', 'da',
       'no'], dtype=object)

In [249]:
chord = chord[['lyrics', 'lang', 'genres', 'progression']]

In [250]:
chord['lang'].unique()

array(['en', 'es', 'id', 'et', 'af', 'it', 'pt', 'tl', 'nl', 'sw', 'so',
       'hr', 'cy', 'de', 'fi', 'fr', 'sv', 'hu', nan, 'tr', 'ca', 'da',
       'no'], dtype=object)

In [251]:
chord = chord[chord['lang'] == 'en']

In [252]:
chord

Unnamed: 0,lyrics,lang,genres,progression
10,"{1: '', 3: 'We were inseparable ', 5: ' Everyt...",en,"['canadian pop', 'pop', 'post-teen pop']","[F#m, A, C#m, B]"
21,"{1: '', 3: 'Show you off, tonight I wanna show...",en,"['canadian pop', 'pop', 'post-teen pop']","[E, C#m, G#m, F#]"
22,"{0: '\nCapo on 1st fret\n\t \t\t ', 1: ' ', 2...",en,"['canadian pop', 'pop', 'post-teen pop']","[D, Bm, E, F#m]"
34,"{0: '\n\t \t\t', 2: '', 4: 'Come home to me '...",en,"['canadian pop', 'pop', 'post-teen pop']","[C, Em, Am, G]"
44,"{0: '\n\t \t\t', 3: '', 4: 'Verse 1:', 5: '',...",en,"['canadian pop', 'pop', 'post-teen pop']","[Am, G, F, Dm]"
...,...,...,...,...
135720,"{2: ' ', 4: ""You don't have to take the bar ex...",en,"['alternative rock', 'athens indie', 'classic ...","[B, E, A, F#]"
135728,"{0: '\n\t \t\t', 1: '', 3: '', 5: 'Empty brea...",en,"['alternative rock', 'athens indie', 'classic ...","[D, C, A, 2x]"
135733,"{11: ' ', 13: 'This here is the place where I ...",en,"['alternative rock', 'athens indie', 'classic ...","[D, G, C, Am]"
135753,"{0: '\n\t \t\t', 2: '', 3: 'Verse: ', 4: '', ...",en,"['classic rock', 'folk', 'folk rock', 'melanch...","[F#m, E, A, E]"


In [253]:
def progression_key(row):
    return row['progression'][0]

In [254]:
chord['start_key'] = chord.progress_apply(progression_key, axis=1)

100%|██████████| 9959/9959 [00:00<00:00, 311919.96it/s]


In [256]:
chord

Unnamed: 0,lyrics,lang,genres,progression,start_key
10,"{1: '', 3: 'We were inseparable ', 5: ' Everyt...",en,"['canadian pop', 'pop', 'post-teen pop']","[F#m, A, C#m, B]",F#m
21,"{1: '', 3: 'Show you off, tonight I wanna show...",en,"['canadian pop', 'pop', 'post-teen pop']","[E, C#m, G#m, F#]",E
22,"{0: '\nCapo on 1st fret\n\t \t\t ', 1: ' ', 2...",en,"['canadian pop', 'pop', 'post-teen pop']","[D, Bm, E, F#m]",D
34,"{0: '\n\t \t\t', 2: '', 4: 'Come home to me '...",en,"['canadian pop', 'pop', 'post-teen pop']","[C, Em, Am, G]",C
44,"{0: '\n\t \t\t', 3: '', 4: 'Verse 1:', 5: '',...",en,"['canadian pop', 'pop', 'post-teen pop']","[Am, G, F, Dm]",Am
...,...,...,...,...,...
135720,"{2: ' ', 4: ""You don't have to take the bar ex...",en,"['alternative rock', 'athens indie', 'classic ...","[B, E, A, F#]",B
135728,"{0: '\n\t \t\t', 1: '', 3: '', 5: 'Empty brea...",en,"['alternative rock', 'athens indie', 'classic ...","[D, C, A, 2x]",D
135733,"{11: ' ', 13: 'This here is the place where I ...",en,"['alternative rock', 'athens indie', 'classic ...","[D, G, C, Am]",D
135753,"{0: '\n\t \t\t', 2: '', 3: 'Verse: ', 4: '', ...",en,"['classic rock', 'folk', 'folk rock', 'melanch...","[F#m, E, A, E]",F#m


In [257]:
def adjust_progression(row):
    return row['progression'][1:]

In [324]:
import re

# Input string with number: occurrences
input_string = "This is a 1: test 2: string 3: with 4: occurrences."

# Remove all occurrences of number: using regex
output_string = re.sub(r'\d+:', '', input_string)

# Print the result
print(output_string)


This is a  test  string  with  occurrences.


In [329]:
import re

# Input string with number: occurrences
input_string = lyrics

# Remove all occurrences of number: using regex
output_string = re.sub(r'\d+:', '', input_string)
output_string = output_string.split('{')[1].split('}')[0]
temp_list = output_string.split(',')


In [336]:
temp_list[0]

" '\\nCapo on 3rd fret\\n\\t  \\t\\t '"

In [258]:
chord['progression'] = chord.progress_apply(adjust_progression, axis=1)

100%|██████████| 9959/9959 [00:00<00:00, 272489.94it/s]


In [261]:
chord.to_csv('data/golden_data.csv', index=False)

In [263]:
golden_data = pd.read_csv('data/golden_data.csv')

In [314]:
unescaped_string = bytes(lyrics, 'utf-8').decode('unicode_escape')
unescaped_string

'{0: \'\nCapo on 3rd fret\n\t  \t\t \', 1: \' \', 2: \'\', 4: \'\', 5: \'verse 1 \', 7: \'  Ended up on a crossroad \', 9: \'Try to figure out which way to go \', 11: "It\'s like you\'re stuck on a treadmill ", 12: \'Running in the same place \', 14: \'  You got your hazard lights on now \', 16: \'Hoping that somebody would slow down \', 17: \'Praying for a miracle \', 19: "Who\'ll show you grace? ", 21: \'Had a couple dollars and a quarter tank of gas \', 23: \'With a long journey ahead \', 25: \'Seen a truck pull over \', 27: \'God sent an angel to help you out \', 29: \'He gave you direction \', 30: \'Showed you how to read a map \', 32: \'With a long journey ahead \', 34: "Said it ain\'t over ", 36: \'Oh, even in the midst of doubt \', 37: \'\', 38: \'\', 40: \'Life is worth living \', 42: \'Life is worth living, so live another day \', 44: \'The meaning of forgiveness \', 46: "People make mistakes, doesn\'t mean you have to give in ", 48: \' Life is worth living again \', 49: \'\'

In [1]:
import pandas as pd
golden_data = pd.read_csv('data/golden_data.csv')

In [39]:
import re
def process_lyrics(row):
    lyrics = row['lyrics']
    output_string = re.sub(r'\d+:', '', lyrics)
    output_string = output_string.split('{')[1].split('}')[0]

    stripped_list = []
    
    for string in output_string.split(','):
        process = False
        for char in string:
            if char.isalpha():
                process =  True
                break
        if process:
            stripped_list.append(string.strip().strip("'").strip('"'))
        else:
            stripped_list.append('')
    return '\n'.join(stripped_list)

In [40]:
from tqdm import tqdm
tqdm.pandas()

In [41]:
golden_data['processed_lyrics'] = golden_data.progress_apply(process_lyrics, axis=1)

100%|██████████| 9959/9959 [00:00<00:00, 20239.24it/s]


In [42]:
golden_data['processed_lyrics'][7]

"\nYou know you can call me if you need someone \nI'll pick up the pieces if you come undone \n\nPre-Chorus\nPainting stars up on your ceiling 'cause you \nWish that you could find some feeling\nyeah\nyou \nYou know you can call me if you need someone      \n\n\n\nI need you to hold on \nHeaven is a place not too far away \nWe all know I should be the one to say we all make mistakes  \n(We all make mistakes) \nTake my hand and hold on \nTell me everything that you need to say \n'Cause I know how it feels to be someone \nFeels to be someone who loses their way \n\nYou're looking for answers in a place unknown \nYou need the connection but you can't get close (Can't get close) \n\nPre-Chorus\nPainting stars up on your ceiling 'cause you \nWish that you could find some feeling\nyeah\nyou \nYou know you can call me if you need someone    \n\n\n\nI need you to hold on \nHeaven is a place not too far away \nWe all know I should be the one to say we all make mistakes  \n(We all make mistakes)

In [45]:
print(golden_data['processed_lyrics'][180])

\n\t  \t\t



verse 1 
I was good on my own
that's the way it was
that's the way it was 
You was good on the low for a faded fuck
on some faded love 
Shit
what the fuck you complaining for? Feeling jaded huh? 
Used to trip off that shit I was kickin' to you 
Had some fun on the run though I give it to you 


Pre-Chorus 
But baby
don't get it twisted 
You was just another nigga on the hit list 
Tryna fix your inner issues with a bad bitch 
Didn't they tell you that I was a savage 
Fuck your white horse and a carriage 
Bet you never could imagine 
Never told you you could have it 


You nee-deeded me 
Oooh
you nee-deeded me 
To feel a little more
and give a little less 
Know you're here to confess 
But baby who
you nee-deeded me 


verse 2 
You been rollin' around
shit I'm rollin up 
Light and roll it up 
Break it down like a pound
shit was never us 
Shit was never us 
That's the real on the real
are you serious? 
How you feel
how you feel? 
Used to trip off that shit I was kickin' to ya

In [30]:
import json

In [290]:
golden_data['lyrics'][0]

'{1: \'\', 3: \'We were inseparable \', 5: \' Everything I had to do I did it next to you \', 7: \'And the memories we made were so incredible \', 9: \'Then our love was interrupted by my schedule \', 11: \'There was nothing that I could do \', 13: \'Cause you fell into the deepest depression baby \', 15: "And I hate to know I\'m responsible ", 17: \'And your heart filled up with so much aggression baby \', 19: \'You got used to \', 20: \'\', 22: \'Being alone, alone \', 24: \'You adapted, now your use to \', 26: \'Being alone, all alone \', 28: \'Ooo you got used too being on your own \', 29: \'\', 31: "I saw it happenin\' ", 33: "But I didn\'t accept the truth, I couldn\'t fathom it ", 35: "There was so much going on you couldn\'t handle it ", 37: \'Could have divided my time, I should have fractioned it \', 39: \'But there was nothing that I could do \', 41: \'Cause you fell into the deepest depression baby \', 43: "And I hate to know I\'m responsible ", 45: \'And your heart filled 

In [272]:
print(golden_data['lyrics'][0])

{1: '', 3: 'We were inseparable ', 5: ' Everything I had to do I did it next to you ', 7: 'And the memories we made were so incredible ', 9: 'Then our love was interrupted by my schedule ', 11: 'There was nothing that I could do ', 13: 'Cause you fell into the deepest depression baby ', 15: "And I hate to know I'm responsible ", 17: 'And your heart filled up with so much aggression baby ', 19: 'You got used to ', 20: '', 22: 'Being alone, alone ', 24: 'You adapted, now your use to ', 26: 'Being alone, all alone ', 28: 'Ooo you got used too being on your own ', 29: '', 31: "I saw it happenin' ", 33: "But I didn't accept the truth, I couldn't fathom it ", 35: "There was so much going on you couldn't handle it ", 37: 'Could have divided my time, I should have fractioned it ', 39: 'But there was nothing that I could do ', 41: 'Cause you fell into the deepest depression baby ', 43: "And I hate to know I'm responsible ", 45: 'And your heart filled up with so much aggression baby ', 47: 'You 

In [291]:
data_string = golden_data['lyrics'][0]

In [294]:
golden_data['lyrics'][0].split(',')

["{1: ''",
 " 3: 'We were inseparable '",
 " 5: ' Everything I had to do I did it next to you '",
 " 7: 'And the memories we made were so incredible '",
 " 9: 'Then our love was interrupted by my schedule '",
 " 11: 'There was nothing that I could do '",
 " 13: 'Cause you fell into the deepest depression baby '",
 ' 15: "And I hate to know I\'m responsible "',
 " 17: 'And your heart filled up with so much aggression baby '",
 " 19: 'You got used to '",
 " 20: ''",
 " 22: 'Being alone",
 " alone '",
 " 24: 'You adapted",
 " now your use to '",
 " 26: 'Being alone",
 " all alone '",
 " 28: 'Ooo you got used too being on your own '",
 " 29: ''",
 ' 31: "I saw it happenin\' "',
 ' 33: "But I didn\'t accept the truth',
 ' I couldn\'t fathom it "',
 ' 35: "There was so much going on you couldn\'t handle it "',
 " 37: 'Could have divided my time",
 " I should have fractioned it '",
 " 39: 'But there was nothing that I could do '",
 " 41: 'Cause you fell into the deepest depression baby '",
 '

In [299]:
import re

def extract_text_regex(line):
  """
  Extracts text from a line using regular expressions.

  Args:
      line: The line to extract text from.

  Returns:
      The string containing the extracted text.
  """
  # Define a pattern to match only alphanumeric characters and spaces
  pattern = r"[a-zA-Z\s\-_,']+"
  # Find all occurrences of the pattern in the line
  matches = re.findall(pattern, line)
  # Join the matches to form a single string
  return ''.join(matches)

In [300]:
extract_text_regex(golden_data['lyrics'][0].split(',')[1])

"  'We were inseparable '"

In [292]:
# Split the data string by lines
data_lines = data_string.splitlines()

# Create an empty dictionary to store the formatted data
formatted_data = {}

# Loop through each line
for line in data_lines:
  # Split the line by colon (":")
  key, value = line.split(": ", 1)
  # Remove extra spaces from the value
  value = value.strip()
  # Add the key-value pair to the dictionary
  formatted_data[int(key)] = value

# Print the formatted dictionary
print(formatted_data)

ValueError: invalid literal for int() with base 10: '{1'

In [289]:
golden_data

Unnamed: 0,lyrics,lang,genres,progression,start_key
0,"{1: '', 3: 'We were inseparable ', 5: ' Everyt...",en,"['canadian pop', 'pop', 'post-teen pop']","['A', 'C#m', 'B']",F#m
1,"{1: '', 3: 'Show you off, tonight I wanna show...",en,"['canadian pop', 'pop', 'post-teen pop']","['C#m', 'G#m', 'F#']",E
2,"{0: '\nCapo on 1st fret\n\t \t\t ', 1: ' ', 2...",en,"['canadian pop', 'pop', 'post-teen pop']","['Bm', 'E', 'F#m']",D
3,"{0: '\n\t \t\t', 2: '', 4: 'Come home to me '...",en,"['canadian pop', 'pop', 'post-teen pop']","['Em', 'Am', 'G']",C
4,"{0: '\n\t \t\t', 3: '', 4: 'Verse 1:', 5: '',...",en,"['canadian pop', 'pop', 'post-teen pop']","['G', 'F', 'Dm']",Am
...,...,...,...,...,...
9954,"{2: ' ', 4: ""You don't have to take the bar ex...",en,"['alternative rock', 'athens indie', 'classic ...","['E', 'A', 'F#']",B
9955,"{0: '\n\t \t\t', 1: '', 3: '', 5: 'Empty brea...",en,"['alternative rock', 'athens indie', 'classic ...","['C', 'A', '2x']",D
9956,"{11: ' ', 13: 'This here is the place where I ...",en,"['alternative rock', 'athens indie', 'classic ...","['G', 'C', 'Am']",D
9957,"{0: '\n\t \t\t', 2: '', 3: 'Verse: ', 4: '', ...",en,"['classic rock', 'folk', 'folk rock', 'melanch...","['E', 'A', 'E']",F#m


In [288]:
print(golden_data['lyrics'][0].replace("\'", "'"))

{1: '', 3: 'We were inseparable ', 5: ' Everything I had to do I did it next to you ', 7: 'And the memories we made were so incredible ', 9: 'Then our love was interrupted by my schedule ', 11: 'There was nothing that I could do ', 13: 'Cause you fell into the deepest depression baby ', 15: "And I hate to know I'm responsible ", 17: 'And your heart filled up with so much aggression baby ', 19: 'You got used to ', 20: '', 22: 'Being alone, alone ', 24: 'You adapted, now your use to ', 26: 'Being alone, all alone ', 28: 'Ooo you got used too being on your own ', 29: '', 31: "I saw it happenin' ", 33: "But I didn't accept the truth, I couldn't fathom it ", 35: "There was so much going on you couldn't handle it ", 37: 'Could have divided my time, I should have fractioned it ', 39: 'But there was nothing that I could do ', 41: 'Cause you fell into the deepest depression baby ', 43: "And I hate to know I'm responsible ", 45: 'And your heart filled up with so much aggression baby ', 47: 'You 

In [271]:
json.loads(golden_data['lyrics'][0])

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

In [48]:
golden_data.columns

Index(['lyrics', 'lang', 'genres', 'progression', 'start_key',
       'processed_lyrics'],
      dtype='object')

In [50]:
golden_data = golden_data[['genres', 'progression', 'start_key',
       'processed_lyrics']]

In [51]:
golden_data.to_csv('data/golden_data.csv', index=False)