Parse Text Data, make Train Split
---

In [3]:
import glob
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack, vstack
import numpy as np
from sklearn.preprocessing import normalize
#import pronouncing
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag_sents
from collections import Counter

def parse_lyrics():
    parsed = {"unique_line_ratios": [], "unique_word_ratios_song": [], "artists": [],
             "titles": [], "genres": [], "raw_lyrics": [], "total_word_counts": [], 
              "average_character_count_word":[], "average-words-per-line":[],
              "count_nouns":[], "count_verbs":[], "count_adjectives":[],
              "count_pronouns":[], "count_adverbs": []}
    
    #iterate through files
    for filename in glob.iglob('data/*.txt'):
        genre = filename[5:len(filename)-4]
        print(genre)
        numberOfSongs = 0
        file = open(filename)
        lines = file.readlines()
        i = 0

        while i < len(lines) and not lines[i].startswith("###"):
            i = i + 1
        while i < len(lines):
            numberOfSongs = numberOfSongs + 1
            parsed["artists"].append(lines[i][3:].strip())
            parsed["titles"].append(lines[i+1][3:].strip())
            parsed["genres"].append(genre) 
            i = i + 2
            #extract features from song
            
            lyrics = []
            while i < len(lines) and not lines[i].startswith("###"):
                line = lines[i].strip()                
                if line != "":
                    lyrics.append(line)
                i = i + 1
                
            lyric_string = make_lyric_string(lyrics)
            
            parsed["raw_lyrics"].append(lyric_string)
            
            parsed["unique_line_ratios"].append(calc_unique_line_ratio(lyrics))
            parsed["unique_word_ratios_song"].append(calc_unique_word_ratio_song(lyric_string))
            
            parsed["total_word_counts"].append(count_words(lyric_string))
            parsed["average_character_count_word"].append(count_characters(lyric_string)/count_words(lyric_string))
            parsed["average-words-per-line"].append(count_words(lyric_string)/len(lyrics))
            
            parsed["count_nouns"].append(count_nouns(lyrics))
            parsed["count_verbs"].append(count_verbs(lyrics))
            parsed["count_adjectives"].append(count_adjectives(lyrics))
            parsed["count_pronouns"].append(count_pronouns(lyrics))
            parsed["count_adverbs"].append(count_adverbs(lyrics))
            
               
    return parsed;

def calc_unique_line_ratio(lyrics):
    return len(set(lyrics)) / len(lyrics)

def make_lyric_string(lyric_lines):
    lyric_string = ''
    for l in lyric_lines:
        lyric_string += l+' '
    return lyric_string 

def count_words(lyric_string):
    return len(lyric_string.split())

def calc_unique_word_ratio_song(lyric_string):
    return len(set(lyric_string.split())) / len(lyric_string.split())

def count_characters(lyrics_string):
    return len(lyrics_string)-lyrics_string.count(' ')

def find_rhymes(lyrics_string):
    n_rhymes = 0
    for index in range(0, len(lyrics_string)):
        if index > 0:
            if lyrics_string[index].split()[-1] in pronouncing.rhymes(lyrics_string[index-1].split()[-1]):
                n_rhymes += 1
    return n_rhymes

def count_nouns(lyrics_string):
    tokenized = []
    for sent in lyrics_string:
        tokenized.append(word_tokenize(sent))
    tagged = pos_tag_sents(tokenized,tagset='universal')
    concat = [item for sublist in tagged for item in sublist]
    counts = Counter([j for i,j in concat])
    return counts['NOUN']
    

def count_verbs(lyrics_string):
    tokenized = []
    for sent in lyrics_string:
        tokenized.append(word_tokenize(sent))
    tagged = pos_tag_sents(tokenized,tagset='universal')
    concat = [item for sublist in tagged for item in sublist]
    counts = Counter([j for i,j in concat])
    return counts['VERB']

def count_adjectives(lyrics_string):
    tokenized = []
    for sent in lyrics_string:
        tokenized.append(word_tokenize(sent))
    tagged = pos_tag_sents(tokenized,tagset='universal')
    concat = [item for sublist in tagged for item in sublist]
    counts = Counter([j for i,j in concat])
    return counts['ADJ']

def count_pronouns(lyrics_string):
    tokenized = []
    for sent in lyrics_string:
        tokenized.append(word_tokenize(sent))
    tagged = pos_tag_sents(tokenized,tagset='universal')
    concat = [item for sublist in tagged for item in sublist]
    counts = Counter([j for i,j in concat])
    return counts['PRON']

def count_adverbs(lyrics_string):
    tokenized = []
    for sent in lyrics_string:
        tokenized.append(word_tokenize(sent))
    tagged = pos_tag_sents(tokenized,tagset='universal')
    concat = [item for sublist in tagged for item in sublist]
    counts = Counter([j for i,j in concat])
    return counts['AVD']

##Parse The Data
parsed = parse_lyrics()
##Prepare "Dumb" Features
vectorizer = TfidfVectorizer(stop_words="english")
data_bow = vectorizer.fit_transform(parsed["raw_lyrics"]).toarray()

##Make More intelligent Features
data_feature = vectorizer.fit_transform(parsed["raw_lyrics"]).toarray()

data_feature = np.append(data_feature, np.asarray(parsed["unique_line_ratios"]).reshape(-1, 1), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["unique_word_ratios_song"]).reshape(-1, 1)), 1)

data_feature = np.append(data_feature, normalize(np.asarray(parsed["total_word_counts"]).reshape(-1, 1)), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["average_character_count_word"]).reshape(-1, 1)), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["average-words-per-line"]).reshape(-1, 1)), 1)

data_feature = np.append(data_feature, normalize(np.asarray(parsed["count_nouns"]).reshape(-1, 1)), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["count_verbs"]).reshape(-1, 1)), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["count_adjectives"]).reshape(-1, 1)), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["count_pronouns"]).reshape(-1, 1)), 1)
data_feature = np.append(data_feature, normalize(np.asarray(parsed["count_adverbs"]).reshape(-1, 1)), 1)

titles_train, titels_test, raw_lyrics_train, raw_lyrics_test, X_train_feature, X_test_feature, X_train_bow, X_test_bow, y_train, y_test = train_test_split(parsed['titles'], parsed['raw_lyrics'], data_feature, data_bow, parsed['genres'], test_size=0.20, random_state=46)



Blues
Country
EDM
Metal
Rap
Rock
[]


ValueError: empty vocabulary; perhaps the documents only contain stop words

Classify
---

In [3]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LassoCV

cls = OneVsRestClassifier(SVC(kernel="linear", random_state=0))
gkf = GroupKFold(n_splits = 10)
lasso_cv = LassoCV(alphas=alphas, random_state=0)
alphas = np.logspace(-4, -0.5, 30)

for k, (train,test) in enumerate(gkf.split(parsed['raw_lyrics'],parsed['genre'],parsed['artists'])):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))

cls.fit(X_train_feature, y_train)
pred_feature = cls.predict(X_test_feature)
#pred_props = cls.predict_proba(X_test_feature)

cls.fit(X_train_bow, y_train)
pred_bow = cls.predict(X_test_bow)

Evaluate
---

In [4]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools
import numpy as np

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    
print("### BOW Classifier ###\n")    
plot_confusion_matrix(confusion_matrix(y_test, pred_bow), ["Blues","Country", "EDM", "Metal", "Rap", "Rock"])
print ("F1 micro: " + str(f1_score(pred_bow, y_test, average='micro')))
print("F1 macro: " + str(f1_score(pred_bow, y_test, average='macro')))

print("\n\n### Feature Classifier ###\n")    
plot_confusion_matrix(confusion_matrix(y_test, pred_feature), ["Blues","Country", "EDM", "Metal", "Rap", "Rock"])
print ("F1 micro: " + str(f1_score(pred_feature, y_test, average='micro')))
print("F1 macro: " + str(f1_score(pred_feature, y_test, average='macro')))

### BOW Classifier ###

F1 micro: 0.677685950413
F1 macro: 0.659404616632


### Feature Classifier ###

F1 micro: 0.710743801653
F1 macro: 0.696013205649


Examine Mistakes in More Detail
---

In [5]:
for idx, genre in enumerate(y_test):
    if (genre != pred_feature[idx]):
        print ("\n###########################################################\n")
        print ("Song: "+titels_test[idx]+"\n")
        print ("Was "+genre+" Predicted "+pred_feature[idx] + "\n")
        #print ("confidence: ", pred_props[idx])
        print ("\n"+raw_lyrics_test[idx])
        


###########################################################

Song: Bad

Was EDM Predicted Blues


I say why does it feel so good? So good to be bad Getting what I want, boy Why does that make you so mad? You see, why does it feel so good? So good to be bad 'Cause if it's trouble that you're looking for Oh baby, here I am Oh baby, here I am So why does it feel so good? So good to be bad Bad I say why does it feel so good? So good to be bad Getting what I want, boy Why does that make you so mad? You see, why does it feel so good? So good to be bad 'Cause if it's trouble that you're looking for Oh baby, here I am Oh baby, here I am 

###########################################################

Song: Sassy

Was Rock Predicted Rap


We're rolling, you guys Why? Why? Why? Why? Why? Why? Why am I here? Why am I there? I don't care CARE! Oh, why I'm old Sit back down Sit back down Sit back down 'CAUSE I SAID SO! [Inger Lorre:] "You're so stupid I would have done anything for you Honey, your r

Look At The Most Informative Words
---

In [6]:
def print_top10(vectorizer, clf, class_labels):
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-100:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))
        
print_top10(vectorizer, cls,  ["Blues","Country", "EDM", "Metal", "Rap", "Rock"])

Blues: nights brand anytime stormy good boogie bury looks beer bobby change walkin gulfport bills child hearted road uumh michigan detroit ooh treats sox ah stuck lesson sit jaw slow legged hey drag hmm vain station switch luck clothes eve babe shotgun got drunken girl automobile natural mood sittin dimples little lee lord fun pretty oww mean save scotch bourbon drivin payin poor know wimmen lovin early starter times crawlin lodi roll rooster whoo hooo sure ain mm short long spoon glamour rain treat tell crossroad man cryin nough christmas goin want dog mornin gonna blues yes spoonful mmm woman baby
Country: hits brought kettle whiskey filled old shelter cradle folks leave hands today run dew tells forgotten ramblers dixie comfort corner birmingham saddle freight sweetwater keeps proud started new mem nah hair aching haven memory prison played wish talk shines wind dear watching free past walked care changes comforts left em hardly grass booth ev expecting line haunted rock hurtin toni