In [2]:
import pandas as pd
import numpy as np

song_lyrics = pd.read_csv('lyr_df.csv')
song_lyrics['song_genre'] = np.where(song_lyrics['hip_hop_rnb'] == 0, 'Country/Rock','Hiphop/RnB')
song_lyrics.head(10)

Unnamed: 0,song,artist,hip_hop_rnb,genre,lyrics,song_genre
0,(if you're not in it for love) i'm outta here!,Shania Twain,0,Country / Rock,"three, four, five mind if i sit down? can i bu...",Country/Rock
1,(this ain't) no thinkin' thing,Trace Adkins,0,Country / Rock,i been thinkin' about our love situation all t...,Country/Rock
2,1 thing,Amerie,1,Hip Hop / RnB,"woo! uh woo! na-na-na-na-na, oh (uh, uh) na-na...",Hiphop/RnB
3,1000hp,Godsmack,0,Country / Rock,time to rewind back to 1995 when we were nothi...,Country/Rock
4,16,Highly Suspect,0,Country / Rock,it took me sixteen years to find ya one secon...,Country/Rock
5,19 somethin',Mark Wills,0,Country / Rock,woo oh yeah i saw star wars at least eight tim...,Country/Rock
6,911 is a joke,Public Enemy,1,Hip Hop / RnB,"hit me! ""somebody call an ambulance!"" ""there's...",Hiphop/RnB
7,a better man,Clint Black,0,Country / Rock,what do you say when it's over? i don't know i...,Country/Rock
8,a broken wing,Martina McBride,0,Country / Rock,she loved him like he was the last man on eart...,Country/Rock
9,a good run of bad luck,Clint Black,0,Country / Rock,a high roller even when the chips are down to ...,Country/Rock


In [23]:
from sklearn.utils import shuffle
from nltk.corpus import stopwords

genres = [
    'Country/Rock','Hiphop/RnB'
]
genres

['Country/Rock', 'Hiphop/RnB']

In [24]:
LYRIC_LEN = 400 # each song has to be > 400 characters
N = 100 # number of records to pull from each genre
RANDOM_SEED = 200 # random seed to make results repeatable

train_df = pd.DataFrame()
test_df = pd.DataFrame()
for genre in genres: # loop over each genre
    subset = song_lyrics[ # create a subset 
        (song_lyrics.song_genre==genre) & 
        (song_lyrics.lyrics.str.len() > LYRIC_LEN)
    ]
    train_set = subset.sample(n=N, random_state=RANDOM_SEED)
    test_set = subset.drop(train_set.index)
    train_df = train_df.append(train_set) # append subsets to the master sets
    test_df = test_df.append(test_set)

In [25]:
train_df

Unnamed: 0,song,artist,hip_hop_rnb,lyrics,song_genre
145,Lola Montez,Volbeat,0.0,Feel the fire where she walks Lola Montez so b...,Country/Rock
162,Panic Attack,The Glorious Sons,0.0,"I wanna be normal, I wanna be sane I wanna loo...",Country/Rock
189,Still Counting,Volbeat,0.0,"Counting all the assholes in the room Well, I'...",Country/Rock
191,Stone,Alice in Chains,0.0,I know you think I'm wrong But I'm not your to...,Country/Rock
104,Fall to Pieces,Velvet Revolver,0.0,It's been a long year Since you've been gone I...,Country/Rock
...,...,...,...,...,...
463,The Hills,The Weeknd,1.0,"Yeah Yeah Yeah Your man on the road, he doin' ...",Hiphop/RnB
473,Twisted,Keith Sweat,1.0,Oh yeah baby You got to make your mind up Yeah...,Hiphop/RnB
540,Say Something,Timbaland,,"This shit was all I knew, you and me only I di...",Hiphop/RnB
301,Fancy,Iggy Azalea,1.0,"First things first, I'm the realest (Realest) ...",Hiphop/RnB


In [26]:
test_df

Unnamed: 0,song,artist,hip_hop_rnb,lyrics,song_genre
0,Beautiful Crazy,Luke Combs,0.0,Her day starts with a coffee and ends with a w...,Country/Rock
1,Best Shot,Jimmie Allen,0.0,"I'm just flesh and bone, heart and soul And I'...",Country/Rock
2,Blue Tacoma,Russell Dickerson,0.0,"Blue Tacoma, California Rays of gold are shini...",Country/Rock
3,Broken Halos,Chris Stapleton,0.0,Seen my share of broken halos Folded wings tha...,Country/Rock
6,Eyes on You,Chase Rice,0.0,We've been to both Carolinas Seen a big Montan...,Country/Rock
...,...,...,...,...,...
714,Wow,Post Malone,,"Said she tired of little money, need a big boy...",Hiphop/RnB
715,Truth Hurts,Lizzo,,Why men great 'til they gotta be great? Woo! I...,Hiphop/RnB
716,Highest in the Room,Travis Scott,,I got room in my fumes (Yeah) She fill my mind...,Hiphop/RnB
717,Truth Hurts,Lizzo,,Why men great 'til they gotta be great? Woo! I...,Hiphop/RnB


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# define our model
text_clf = Pipeline(
    [('vect', CountVectorizer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)

0.8285163776493256

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# define our model
text_clf = Pipeline(
    [('vect', TfidfVectorizer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)

0.8766859344894027

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/tesacs/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/tesacs/nltk_data...


True

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

stop = list(set(stopwords.words('english'))) # stopwords
wnl = WordNetLemmatizer() # lemmatizer

def tokenizer(x): # custom tokenizer
    return (
        wnl.lemmatize(w) 
        for w in word_tokenize(x) 
        if len(w) > 2 and w.isalnum() # only words that are > 2 characters
    )                                 # and is alpha-numeric

# define our model
text_clf = Pipeline(
    [('vect', TfidfVectorizer(
        ngram_range=(1, 2), # include bigrams
        tokenizer=tokenizer,
        stop_words=stop,
        max_df=0.4, # ignore terms that appear in more than 40% of documents
        min_df=4)), # ignore terms that appear in less than 4 documents
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)



0.8053949903660886