In [1]:
import pandas as pd
import numpy as np

song_lyrics = pd.read_csv('../data/lyr_df.csv')
song_lyrics.head(10)

Unnamed: 0,song,artist,hip_hop_rnb,genre,lyrics
0,(if you're not in it for love) i'm outta here!,Shania Twain,0,Country / Rock,"three, four, five mind if i sit down? can i bu..."
1,(this ain't) no thinkin' thing,Trace Adkins,0,Country / Rock,i been thinkin' about our love situation all t...
2,1 thing,Amerie,1,Hip Hop / RnB,"woo! uh woo! na-na-na-na-na, oh (uh, uh) na-na..."
3,1000hp,Godsmack,0,Country / Rock,time to rewind back to 1995 when we were nothi...
4,16,Highly Suspect,0,Country / Rock,it took me sixteen years to find ya one secon...
5,19 somethin',Mark Wills,0,Country / Rock,woo oh yeah i saw star wars at least eight tim...
6,911 is a joke,Public Enemy,1,Hip Hop / RnB,"hit me! ""somebody call an ambulance!"" ""there's..."
7,a better man,Clint Black,0,Country / Rock,what do you say when it's over? i don't know i...
8,a broken wing,Martina McBride,0,Country / Rock,she loved him like he was the last man on eart...
9,a good run of bad luck,Clint Black,0,Country / Rock,a high roller even when the chips are down to ...


In [18]:
from sklearn.utils import shuffle
from nltk.corpus import stopwords

genres = [
    'Country/Rock','Hiphop/RnB'
]
genres

['Country/Rock', 'Hiphop/RnB']

In [19]:
LYRIC_LEN = 400 # each song has to be > 400 characters
N = 300 # number of records to pull from each genre
RANDOM_SEED = 200 # random seed to make results repeatable

train_df = pd.DataFrame()
test_df = pd.DataFrame()
for genre in genres: # loop over each genre
    subset = song_lyrics[ # create a subset 
        (song_lyrics.song_genre==genre) & 
        (song_lyrics.lyrics.str.len() > LYRIC_LEN)
    ]
    train_set = subset.sample(n=N, random_state=RANDOM_SEED)
    test_set = subset.drop(train_set.index)
    train_df = train_df.append(train_set) # append subsets to the master sets
    test_df = test_df.append(test_set)

In [20]:
train_df

Unnamed: 0,song,artist,hip_hop_rnb,genre,lyrics,song_genre
339,good as you,Kane Brown,0,Country / Rock,see the way you're taking care of your mama th...,Country/Rock
1081,torn to pieces,Pop Evil,0,Country / Rock,here i sit all alone like an airplane on the e...,Country/Rock
54,angel eyes,Love and Theft,0,Country / Rock,she likes whiskey with her water she starts da...,Country/Rock
1042,the world,Brad Paisley,0,Country / Rock,to the teller down at the bank you're just ano...,Country/Rock
42,alone with you,Jake Owen,0,Country / Rock,i don't see you laugh you don't call me back b...,Country/Rock
...,...,...,...,...,...,...
95,be without you,Mary J. Blige,1,Hip Hop / RnB,"i wanna be with you, gotta be with you, need t...",Hiphop/RnB
1126,we belong together,Mariah Carey,1,Hip Hop / RnB,"sweet love, yeah i didn't mean it when i said ...",Hiphop/RnB
735,nobody's supposed to be here,Deborah Cox,1,Hip Hop / RnB,how did you get here? nobody's supposed to be ...,Hiphop/RnB
699,my prerogative,Bobby Brown,1,Hip Hop / RnB,get busy! everybody's talkin' all this stuff a...,Hiphop/RnB


In [21]:
test_df

Unnamed: 0,song,artist,hip_hop_rnb,genre,lyrics,song_genre
0,(if you're not in it for love) i'm outta here!,Shania Twain,0,Country / Rock,"three, four, five mind if i sit down? can i bu...",Country/Rock
3,1000hp,Godsmack,0,Country / Rock,time to rewind back to 1995 when we were nothi...,Country/Rock
4,16,Highly Suspect,0,Country / Rock,it took me sixteen years to find ya one secon...,Country/Rock
8,a broken wing,Martina McBride,0,Country / Rock,she loved him like he was the last man on eart...,Country/Rock
9,a good run of bad luck,Clint Black,0,Country / Rock,a high roller even when the chips are down to ...,Country/Rock
...,...,...,...,...,...,...
1095,u remind me,Usher,1,Hip Hop / RnB,"yo, i ain't seen you in a minute but i got som...",Hiphop/RnB
1101,understanding,Xscape,1,Hip Hop / RnB,what i need from you is understanding how can ...,Hiphop/RnB
1187,wobble wobble,504 Boyz,1,Hip Hop / RnB,"yo', this jay-tweezie keepin it live off the h...",Hiphop/RnB
1222,you remind me,Mary J. Blige,1,Hip Hop / RnB,"ooh you remind me, yeah you remind me of such ...",Hiphop/RnB


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# define our model
text_clf = Pipeline(
    [('vect', CountVectorizer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)

0.9025157232704403

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# define our model
text_clf = Pipeline(
    [('vect', TfidfVectorizer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)

0.910377358490566

In [None]:
#import nltk
#nltk.download('punkt')
#nltk.download('wordnet')

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

stop = list(set(stopwords.words('english'))) # stopwords
wnl = WordNetLemmatizer() # lemmatizer

def tokenizer(x): # custom tokenizer
    return (
        wnl.lemmatize(w) 
        for w in word_tokenize(x) 
        if len(w) > 2 and w.isalnum() # only words that are > 2 characters
    )                                 # and is alpha-numeric

# define our model
text_clf = Pipeline(
    [('vect', TfidfVectorizer(
        ngram_range=(1, 2), # include bigrams
        tokenizer=tokenizer,
        stop_words=stop,
        max_df=0.4, # ignore terms that appear in more than 40% of documents
        min_df=4)), # ignore terms that appear in less than 4 documents
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)



0.8333333333333334