In [1]:
pwd

'/Users/tesacs/Documents/GitHub/cypher/notebooks'

In [16]:
import pandas as pd
import numpy as np

lyrics_by_song = pd.read_csv('lyr_df.csv')
lyrics_by_song['song_genre'] = np.where(lyrics_by_song['hip_hop_rnb'] == 0.0, 'Country/Rock', 'Hiphop/RnB')
lyrics_by_song.head(10)

Unnamed: 0,song,artist,hip_hop_rnb,lyrics,song_genre
0,Beautiful Crazy,Luke Combs,0.0,Her day starts with a coffee and ends with a w...,Country/Rock
1,Best Shot,Jimmie Allen,0.0,"I'm just flesh and bone, heart and soul And I'...",Country/Rock
2,Blue Tacoma,Russell Dickerson,0.0,"Blue Tacoma, California Rays of gold are shini...",Country/Rock
3,Broken Halos,Chris Stapleton,0.0,Seen my share of broken halos Folded wings tha...,Country/Rock
4,Even Though I'm Leaving,Luke Combs,0.0,"Daddy, I'm afraid, won't you stay a little whi...",Country/Rock
5,Every Little Thing,Russell Dickerson,0.0,"Who! Huh, yeah My baby, she's Alabama A dixie ...",Country/Rock
6,Eyes on You,Chase Rice,0.0,We've been to both Carolinas Seen a big Montan...,Country/Rock
7,For the First Time,Darius Rucker,0.0,You say you never danced to a dashboard singin...,Country/Rock
8,Get Along,Kenny Chesney,0.0,"Met a man wearin' a T-shirt, says ""Virginia is...",Country/Rock
9,Girl,Maren Morris,0.0,"Man, this shit's unflatterin', all up in my he...",Country/Rock


In [17]:
from sklearn.utils import shuffle
from nltk.corpus import stopwords

genres = [
    'Country/Rock','Hiphop/RnB'
]
genres

['Country/Rock', 'Hiphop/RnB']

In [18]:
LYRIC_LEN = 400 # each song has to be > 400 characters
N = 100 # number of records to pull from each genre
RANDOM_SEED = 200 # random seed to make results repeatable

train_df = pd.DataFrame()
test_df = pd.DataFrame()
for genre in genres: # loop over each genre
    subset = lyrics_by_song[ # create a subset 
        (lyrics_by_song.song_genre==genre) & 
        (lyrics_by_song.lyrics.str.len() > LYRIC_LEN)
    ]
    train_set = subset.sample(n=N, random_state=RANDOM_SEED)
    test_set = subset.drop(train_set.index)
    train_df = train_df.append(train_set) # append subsets to the master sets
    test_df = test_df.append(test_set)

In [19]:
train_df

Unnamed: 0,song,artist,hip_hop_rnb,lyrics,song_genre
145,Lola Montez,Volbeat,0.0,Feel the fire where she walks Lola Montez so b...,Country/Rock
162,Panic Attack,The Glorious Sons,0.0,"I wanna be normal, I wanna be sane I wanna loo...",Country/Rock
189,Still Counting,Volbeat,0.0,"Counting all the assholes in the room Well, I'...",Country/Rock
191,Stone,Alice in Chains,0.0,I know you think I'm wrong But I'm not your to...,Country/Rock
104,Fall to Pieces,Velvet Revolver,0.0,It's been a long year Since you've been gone I...,Country/Rock
...,...,...,...,...,...
463,The Hills,The Weeknd,1.0,"Yeah Yeah Yeah Your man on the road, he doin' ...",Hiphop/RnB
473,Twisted,Keith Sweat,1.0,Oh yeah baby You got to make your mind up Yeah...,Hiphop/RnB
540,Say Something,Timbaland,,"This shit was all I knew, you and me only I di...",Hiphop/RnB
301,Fancy,Iggy Azalea,1.0,"First things first, I'm the realest (Realest) ...",Hiphop/RnB


In [20]:
test_df

Unnamed: 0,song,artist,hip_hop_rnb,lyrics,song_genre
0,Beautiful Crazy,Luke Combs,0.0,Her day starts with a coffee and ends with a w...,Country/Rock
1,Best Shot,Jimmie Allen,0.0,"I'm just flesh and bone, heart and soul And I'...",Country/Rock
2,Blue Tacoma,Russell Dickerson,0.0,"Blue Tacoma, California Rays of gold are shini...",Country/Rock
3,Broken Halos,Chris Stapleton,0.0,Seen my share of broken halos Folded wings tha...,Country/Rock
6,Eyes on You,Chase Rice,0.0,We've been to both Carolinas Seen a big Montan...,Country/Rock
...,...,...,...,...,...
714,Wow,Post Malone,,"Said she tired of little money, need a big boy...",Hiphop/RnB
715,Truth Hurts,Lizzo,,Why men great 'til they gotta be great? Woo! I...,Hiphop/RnB
716,Highest in the Room,Travis Scott,,I got room in my fumes (Yeah) She fill my mind...,Hiphop/RnB
717,Truth Hurts,Lizzo,,Why men great 'til they gotta be great? Woo! I...,Hiphop/RnB


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# define our model
text_clf = Pipeline(
    [('vect', CountVectorizer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)

0.8285163776493256

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# define our model
text_clf = Pipeline(
    [('vect', TfidfVectorizer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)

0.8766859344894027

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/tesacs/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/tesacs/nltk_data...


True

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

stop = list(set(stopwords.words('english'))) # stopwords
wnl = WordNetLemmatizer() # lemmatizer

def tokenizer(x): # custom tokenizer
    return (
        wnl.lemmatize(w) 
        for w in word_tokenize(x) 
        if len(w) > 2 and w.isalnum() # only words that are > 2 characters
    )                                 # and is alpha-numeric

# define our model
text_clf = Pipeline(
    [('vect', TfidfVectorizer(
        ngram_range=(1, 2), # include bigrams
        tokenizer=tokenizer,
        stop_words=stop,
        max_df=0.4, # ignore terms that appear in more than 40% of documents
        min_df=4)), # ignore terms that appear in less than 4 documents
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB(alpha=0.1))])

# train our model on training data
text_clf.fit(train_df.lyrics, train_df.song_genre)  

# score our model on testing data
predicted = text_clf.predict(test_df.lyrics)
np.mean(predicted == test_df.song_genre)



0.8053949903660886