In [1]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import numpy as np
from subprocess import Popen, PIPE
import re

In [2]:
# Load the raw lyrics table. The CSV has no header row, so column names are
# supplied explicitly; rows with a missing value in any column are discarded.
lyrics = (
    pd.read_csv(
        'lyrics.csv',
        header=None,
        names=['Album', 'Artist', 'Line', 'NextLine', 'Song', 'Year'],
    )
    .dropna()
)

In [3]:
# Regex alternation of single IPA vowel symbols; get_phonemes() uses it with
# re.findall to pick the vowels out of espeak's IPA transcription.
VOWELS = r'i|ɪ|e|ɛ|æ|a|ə|ɑ|ɒ|ɔ|ʌ|o|ʊ|u|y|ʏ|ø|œ|ɐ|ɜ|ɞ|ɘ|ɵ|ʉ|ɨ|ɤ|ɯ'

In [4]:
def get_phonemes(line):
    """Return the vowel symbols of ``line`` in reverse order.

    The text is transcribed to IPA by the external ``espeak`` program
    (``--ipa -q``: print phonemes, produce no audio). Only the characters
    matching ``VOWELS`` are kept, and the resulting string is reversed so the
    last vowel of the line comes first.

    Args:
        line: Text to transcribe. Non-string values are converted with
            ``str`` first, mirroring ``word_count``.

    Returns:
        str: The reversed vowel string; empty when the espeak output contains
        no character matching ``VOWELS``.

    Raises:
        FileNotFoundError: If the ``espeak`` executable cannot be found.
    """
    # Double quotes are still removed so espeak receives the same text as it
    # did when the line was wrapped in "..." on a shell command line.
    text = re.sub(r'"', '', str(line))
    # Pass the text as its own argv entry instead of formatting it into a
    # shell command: with shell=True, lyric lines containing $(...), backticks,
    # $VAR or backslashes were interpreted (and could be executed) by the shell.
    process = Popen(['espeak', '--ipa', '-q', text], stdout=PIPE)
    output, _ = process.communicate()
    vowels = re.findall(VOWELS, output.decode('utf-8'))
    return ''.join(vowels)[::-1]

In [5]:
def word_count(line):
    """Count the whitespace-separated tokens in ``line``.

    Args:
        line: Value to measure; it is converted with ``str`` before splitting,
            so non-string inputs are counted by their string representation.

    Returns:
        int: Number of tokens produced by splitting on runs of whitespace.
    """
    tokens = str(line).split()
    return len(tokens)

In [6]:
# Derive one feature column per extractor from the 'Line' text, showing a
# labelled progress bar for each pass, then persist the enriched table.
for column, extractor, label in (
    ('Vowels', get_phonemes, "Extracting Phonemes"),
    ('Length', word_count, "Extracting Word Counts"),
):
    # tqdm.pandas() registers progress_apply with the given bar description.
    tqdm.pandas(desc=label)
    lyrics[column] = lyrics['Line'].progress_apply(extractor)

print('Writing to CSV')
lyrics.to_csv('lyrics_features.csv', index=False)

Extracting Phonemes: 100%|██████████| 463713/463713 [2:49:21<00:00, 45.63it/s]   
Extracting Word Counts: 100%|██████████| 463713/463713 [00:02<00:00, 174004.66it/s]


Writing to CSV


In [32]:
# Sort lines by their 'Vowels' string, then keep only the rows whose vowel
# string is non-empty. Filtering happens after the sort so the row order of the
# surviving lines is exactly the sorted order.
df = (
    lyrics
    .sort_values(by='Vowels')
    .loc[lambda frame: frame['Vowels'] != '']
)
df.to_csv('lyrics_sorted.csv', index=False)