# Importing the required packages

In [3]:
from nltk.classify import PositiveNaiveBayesClassifier
import json
import pandas as pd
import random

# Reading the Dataset

In [35]:
df = pd.read_csv("cleaned_lyrics_with_genres_wordcount_and_vocab.csv")
df = df.drop_duplicates('song_key')
# total number of songs
allsongs= len(df)*1.0
df.head(5)

Unnamed: 0.1,Unnamed: 0,song_key,lyrics,lyrics_url,lyrics_abstract,decade,artist,title,year,band_singer,...,/wiki/Western_swing,/wiki/Witch_house,/wiki/World_music,/wiki/Worldbeat,/wiki/Worship_music,/wiki/Zydeco,wordcount,wordset,lexdiv,repetition_score
0,627,1976-86,Are you ready\nDo what you wanna do\nDo what y...,http://lyrics.wikia.com/Ohio_Players:Who%27d_S...,Are you ready\nDo what you wanna do\nDo what y...,1970,Ohio Players,Who'd She Coo?,1976,Ohio Players,...,False,False,False,False,False,False,35,26,0.742857,1.346154
1,1375,1984-59,I thought that dreams belonged to other men 'C...,http://lyrics.wikia.com/index.php?title=Mike_R...,I thought that dreams belonged to other men 'C...,1980,Mike Reno,Almost Paradise,1984,Ann Wilson,...,False,False,False,False,False,False,144,104,0.722222,1.384615
3,990,1980-61,Oh I could hide 'neath the wings of the blue b...,http://lyrics.wikia.com/Anne_Murray:Daydream_B...,Oh I could hide 'neath the wings of the blue b...,1980,Anne Murray,Daydream Believer,1980,Anne Murray,...,False,False,False,False,False,False,114,79,0.692982,1.443038
4,153,1971-80,Amazing grace! How sweet the sound. That saved...,http://lyrics.wikia.com/Judy_Collins:Amazing_G...,Amazing grace! How sweet the sound. That saved...,1970,Judy Collins,Amazing Grace,1971,Judy Collins,...,False,False,False,False,False,False,128,86,0.671875,1.488372
5,645,1977-4,"Love, soft as an easy chair. Love, fresh as th...",http://lyrics.wikia.com/Barbra_Streisand:Everg...,"Love, soft as an easy chair. Love, fresh as th...",1970,Barbra Streisand,Evergreen (Love Theme from A Star Is Born),1977,Barbra Streisand,...,False,False,False,False,False,False,128,82,0.640625,1.560976


# Getting the top 15 genres and the genre list

In [5]:
with open("songs_by_genre.json") as json_file:
    genres = json.load(json_file)
   
genrelist= genres.keys()
genres
top_genres={}
for i in genres.keys():
    top_genres[i]=len(genres[i])
d=top_genres
genre_list=[]

rank=0
for g in sorted(d, key=d.get, reverse=True):
    if rank < 15:
        genre_list.append((rank,g, d[g]/allsongs))
        rank +=1
#genre_list now holds the genre's id number, the genre, and its freq in the overall population        
genre_list

[(0, '/wiki/Pop_music', 0.7867435158501441),
 (1, '/wiki/Hip_hop_music', 0.6296829971181557),
 (2, '/wiki/Contemporary_R%26B', 0.6162343900096061),
 (3, '/wiki/Soul_music', 0.41306436119116235),
 (4, '/wiki/Rock_music', 0.3621517771373679),
 (5, '/wiki/Pop_rock', 0.3520653218059558),
 (6, '/wiki/Soft_rock', 0.24639769452449567),
 (7, '/wiki/Country_music', 0.1988472622478386),
 (8, '/wiki/Rhythm_and_blues', 0.19548511047070125),
 (9, '/wiki/Alternative_rock', 0.16234390009606148),
 (10, '/wiki/Funk', 0.15994236311239193),
 (11, '/wiki/Hard_rock', 0.15417867435158503),
 (12, '/wiki/Dance_music', 0.14169068203650337),
 (13, '/wiki/Dance-pop', 0.14169068203650337),
 (14, '/wiki/Disco', 0.13160422670509125)]

# Splitting the training and testing data 

In [6]:
dftrain = df.sample(frac=0.8)
dftest =  df.loc[~df.index.isin(dftrain.index)]
dftrain.shape, dftest.shape

((3331, 454), (833, 454))

In [7]:
def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


# Training 15 Classifiers

In [9]:
classdict={}
def trainclassifiers(prior):
    for genre_tuple in genre_list:
        g_index=genre_tuple[0]
        genre = genre_tuple[1]
        g_prior= genre_tuple[2]

        in_genre_df = dftrain[dftrain[genre]==True]
        out_genre_df = dftrain[dftrain[genre]==False]

        # concatentate the lyrics from every song into a "sentence"
        in_sentences=[]
        out_sentences=[]
        for row in in_genre_df.iterrows():
            songsents= row[1][2].split('.')
            for s in songsents:
                in_sentences.append(s)

        for row in out_genre_df.iterrows():
            songsents= row[1][2].split('.')
            for s in songsents:
                out_sentences.append(s)
        #setting the prior
        if prior == 0:
            pprior = g_prior*.0967
        else:
            pprior=  prior 
        positive_featuresets = list(map(features, in_sentences))
        unlabeled_featuresets = list(map(features, out_sentences))
        classdict[int(g_index)] = PositiveNaiveBayesClassifier.train(positive_featuresets,unlabeled_featuresets,positive_prob_prior=pprior)


# Runs the text against the 15 classifiers and prints the ones that return true

In [27]:
def show_genres(classtext):
    noresult = True
    for genre_tuple in genre_list:
        g_index=genre_tuple[0]
        genre = genre_tuple[1]
        g_prior= genre_tuple[2]
        if classdict[g_index].classify(features(classtext)):
            print(genre[6:])
            noresult = False
    if noresult:
        print("No match")


In [28]:
results={}
# this function iterates over the values it contains for the prior
# and for each iterates over the rows of the testing set,
# selecting a random sentence from each one
# then running the battery of 15 classifiers on the sentence
for p in [.01,.02,.03,.05,.1,.2,.3,0]:
    trainclassifiers(p)
    true_pos=0
    true_neg=0
    false_pos=0
    false_neg=0
    blpos=0
    bltotal=0
    # For each row in the test set
    for row in dftest.iterrows():
        # getting a sentence from the song 
        song_sentences=[]
        songsents= row[1][2].split('.')
        if len(songsents)>0:
            for s in songsents:
                song_sentences.append(s)
        songsent=random.choice(song_sentences)
        for genre_tuple in genre_list:
            g_index=genre_tuple[0]
            genre = genre_tuple[1]
            g_prior= genre_tuple[2]
            # sees if it's in the song's genre list
            observed =  genre in row[1][15]
            predicted = classdict[g_index].classify(features(songsent))
            if observed and predicted:
                true_pos += 1
            if not observed and not predicted:
                true_neg += 1
            if observed and not predicted:
                false_neg += 1
            if not observed and predicted:
                false_pos += 1
            # this gets the baseline predicting all false
            bltotal += 1
            if observed:
                blpos += 1
                
    print("Baseline pct. predicting all negative:", 1-1.0*blpos/bltotal,"\n\n")             

    print("Prior: ", p)
    print("True positive: ",true_pos)
    print("False positive: ",false_pos)
    print("False negative: ",false_neg)
    print("True negative: ",true_neg)
    total_obs= true_pos+false_pos+true_neg+false_neg
    print("Total observations: ",total_obs) 
    print("Predicted correct: ", (1.0*true_pos+true_neg)/total_obs,"\n\n") 
    results[str(p)] =[true_pos,false_pos,false_neg,true_neg]    


Baseline pct. predicting all negative: 0.8698679471788715 


Prior:  0.01
True positive:  43
False positive:  72
False negative:  1583
True negative:  10797
Total observations:  12495
Predicted correct:  0.867547018807523 


Baseline pct. predicting all negative: 0.8698679471788715 


Prior:  0.02
True positive:  60
False positive:  146
False negative:  1566
True negative:  10723
Total observations:  12495
Predicted correct:  0.8629851940776311 


Baseline pct. predicting all negative: 0.8698679471788715 


Prior:  0.03
True positive:  99
False positive:  209
False negative:  1527
True negative:  10660
Total observations:  12495
Predicted correct:  0.8610644257703082 


Baseline pct. predicting all negative: 0.8698679471788715 


Prior:  0.05
True positive:  137
False positive:  350
False negative:  1489
True negative:  10519
Total observations:  12495
Predicted correct:  0.8528211284513806 


Baseline pct. predicting all negative: 0.8698679471788715 


Prior:  0.1
True positive:  174


## In the cell below, replace the text between the quotes in any of the cells below with text of our choice. The classifier will run against each genre and report which ones it considers a match. If the matches are coming up empty, you can increase the sensitivity by de-commenting the line that runs the classifer training with a higher prior value. Ideally, choose priors which are higher than 0.5.

In [29]:
%%time
trainclassifiers(0.15)
show_genres('Im just a simple girl. In a high-tech digital')

Hip_hop_music
Country_music
Funk
Dance-pop
CPU times: user 1min 27s, sys: 1.4 s, total: 1min 28s
Wall time: 1min 29s


In [30]:
show_genres('I was bruised and battered, I couldnt ')

Pop_music
Rock_music
Country_music
Alternative_rock
Hard_rock
Dance-pop
Disco


In [31]:
show_genres('You get a shiver in the dark. Its raining ')

Soft_rock
Country_music


In [32]:
show_genres('gun shoot pistol bullet dead or alive')

Hip_hop_music
Alternative_rock
Hard_rock


In [33]:
show_genres('Ride, I used to jump my horse and ride.')

Hip_hop_music
Hard_rock


# We observe that there are no matches for the following text

In [34]:
show_genres('kdclaskjclalcckdcsjd asnckajnc adjcla')

No match
