In [1]:
import pandas as pd
import numpy as np
import sys
import json
import os
import re

In [2]:
from sklearn.feature_extraction import text      
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
SONGS_PER_GENRE=10000
SONGS_PER_TRAINING=2000
SONGS_PER_TESTING=200

In [5]:
def cleanse(text):
  result=re.sub('[^a-zA-Z0-9]',' ',text)
  return result

In [6]:
partists = pd.read_csv("artists-data.csv")
psongs = pd.read_csv("lyrics-data.csv")

In [7]:
pop_artists = partists[partists['Genres'] == 'Pop']
pop_songs = pd.merge(psongs, pop_artists, how='inner', left_on='ALink', right_on='Link')
pop_songs.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language,Artist,Genres,Songs,Popularity,Link
0,/luisa-sonza/,Café da Manhã ;P (Com Ludmilla),/luisa-sonza/cafe-da-manha-p-com-ludmilla.html,Hm\nEu tô falando sério\nSei que de manhã tu j...,pt,Luísa Sonza,Pop,46.0,8.6,/luisa-sonza/
1,/luisa-sonza/,Anaconda *o* ~~~ (Com Mariah Angeliq),/luisa-sonza/anaconda-o-com-mariah-angeliq.html,Baby\nÉ cada coisa que eu faco que você nem sa...,pt,Luísa Sonza,Pop,46.0,8.6,/luisa-sonza/
2,/luisa-sonza/,MULHER DO ANO XD,/luisa-sonza/mulher-do-ano-xd.html,"Não se emociona que eu já tô sem tempo\nBaby, ...",pt,Luísa Sonza,Pop,46.0,8.6,/luisa-sonza/
3,/luisa-sonza/,penhasco.,/luisa-sonza/penhasco.html,Sabia que a queda era grande\nMas tive que pul...,pt,Luísa Sonza,Pop,46.0,8.6,/luisa-sonza/
4,/luisa-sonza/,melhor sozinha :-)-:,/luisa-sonza/melhor-sozinha.html,Eu gosto tanto de você\nMas isso tudo me dá\nF...,pt,Luísa Sonza,Pop,46.0,8.6,/luisa-sonza/


In [8]:
pop_songs = pop_songs[['Genres', 'Artist', 'SName', 'Lyric']].rename(columns = {'Genres' : 'Genre', 'SName' : 'Song'})
pop_songs = pop_songs.dropna()
pop_songs = pop_songs[pop_songs['Lyric'] != 'Instrumental'].head(SONGS_PER_GENRE).applymap(cleanse)
pop_songs.head()

Unnamed: 0,Genre,Artist,Song,Lyric
0,Pop,Lu sa Sonza,Caf da Manh P Com Ludmilla,Hm Eu t falando s rio Sei que de manh tu j ...
1,Pop,Lu sa Sonza,Anaconda o Com Mariah Angeliq,Baby cada coisa que eu faco que voc nem sab...
2,Pop,Lu sa Sonza,MULHER DO ANO XD,N o se emociona que eu j t sem tempo Baby e...
3,Pop,Lu sa Sonza,penhasco,Sabia que a queda era grande Mas tive que pula...
4,Pop,Lu sa Sonza,melhor sozinha,Eu gosto tanto de voc Mas isso tudo me d Fri...


In [9]:
rartists = pd.read_csv("SpotifyFeatures.csv")
rsongs = pd.read_csv("labeled_lyrics_cleaned.csv")
rartists.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [10]:
rap_artists=rartists[rartists['genre']=='Rap'][['genre','artist_name']].drop_duplicates()
rap_songs = pd.merge(rsongs, rap_artists, how='inner', left_on='artist', right_on='artist_name')
rap_songs = rap_songs[['genre', 'artist', 'song', 'seq']].rename(columns={'genre':'Genre', 'artist':'Artist', 'song':'Song','seq':'Lyric'})
rap_songs = rap_songs.dropna().head(SONGS_PER_GENRE).applymap(cleanse)
rap_songs.head()

Unnamed: 0,Genre,Artist,Song,Lyric
0,Rap,Juelz Santana,Back to the Crib,Santana Chris Breezy P O L O Da Don Shawt...
1,Rap,Juelz Santana,Intro,Yo who that Me it s Ja Ja Come in man ...
2,Rap,Juelz Santana,Dipset Santana s Town Skit,Yeah let me call this bitch up hello Yo wh...
3,Rap,Juelz Santana,My Problem Jealousy,I m sorry I mean it too let me talk ...
4,Rap,Juelz Santana,Back Again,Juelz Yeah nigga I heard my man Luca was...


In [11]:
msongs = pd.read_csv("metal_lyrics.csv")
msongs["Genre"] = "Metal"
msongs.head()

Unnamed: 0,Artist,Album,Song,Lyric,SongNum,Year,Genre
0,...AAAARRGHH,aaaarrghh,_Gecenin_G__lgesi,Kara bulutlar sardГ„В± yine dГѓВјnyamГ„В±\nKГ„...,1,0,Metal
1,...AAAARRGHH,aaaarrghh,_Son___afak,Dolunay parlak gГѓВ¶rГѓВјnmГѓВјyor bu gece\nBe...,2,0,Metal
2,...AAAARRGHH,aaaarrghh,_F__rt__na_Yakla__yyor...,Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n altГ„В±nda\nYaln...,3,0,Metal
3,...AAAARRGHH,aaaarrghh,_Ebedi_Buzulun_Ortas__nda,Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n altГ„В±nda\nYaln...,4,0,Metal
4,...AAAARRGHH,aaaarrghh,_Lanetli_Diyarlar,YГѓВјrГѓВјyorum yalnГ„В±z baГ…ВџГ„В±ma\nNereye...,5,0,Metal


In [12]:
metal_songs = msongs[['Genre', 'Artist', 'Song', 'Lyric']]
metal_songs.dropna()
metal_songs.head()

Unnamed: 0,Genre,Artist,Song,Lyric
0,Metal,...AAAARRGHH,_Gecenin_G__lgesi,Kara bulutlar sardГ„В± yine dГѓВјnyamГ„В±\nKГ„...
1,Metal,...AAAARRGHH,_Son___afak,Dolunay parlak gГѓВ¶rГѓВјnmГѓВјyor bu gece\nBe...
2,Metal,...AAAARRGHH,_F__rt__na_Yakla__yyor...,Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n altГ„В±nda\nYaln...
3,Metal,...AAAARRGHH,_Ebedi_Buzulun_Ortas__nda,Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n altГ„В±nda\nYaln...
4,Metal,...AAAARRGHH,_Lanetli_Diyarlar,YГѓВјrГѓВјyorum yalnГ„В±z baГ…ВџГ„В±ma\nNereye...


In [13]:
print("Pop -",len(pop_songs), "| Rap -", len(rap_songs), "| Metal -",len(metal_songs))

Pop - 3232 | Rap - 9049 | Metal - 228288


In [14]:
training_data = pd.concat([pop_songs.head(SONGS_PER_TRAINING), rap_songs.head(SONGS_PER_TRAINING), metal_songs.head(SONGS_PER_TRAINING)])
training_data.dropna()

Unnamed: 0,Genre,Artist,Song,Lyric
0,Pop,Lu sa Sonza,Caf da Manh P Com Ludmilla,Hm Eu t falando s rio Sei que de manh tu j ...
1,Pop,Lu sa Sonza,Anaconda o Com Mariah Angeliq,Baby cada coisa que eu faco que voc nem sab...
2,Pop,Lu sa Sonza,MULHER DO ANO XD,N o se emociona que eu j t sem tempo Baby e...
3,Pop,Lu sa Sonza,penhasco,Sabia que a queda era grande Mas tive que pula...
4,Pop,Lu sa Sonza,melhor sozinha,Eu gosto tanto de voc Mas isso tudo me d Fri...
...,...,...,...,...
1995,Metal,ABUSIVENESS,Fire And Blood,"O, wild song\nYour everlasting power\nTosses l..."
1996,Metal,ABUSIVENESS,Krzyk __witu,"W sercu nocy lat dziejГѓВіw zagubionych,\nSpra..."
1997,Metal,ABUSIVENESS,Wiecznie We Mgle,"SzumiГ„В…ce wichry co serce me, prawiecznГ„В… ..."
1998,Metal,ABUSIVENESS,Mogi__a,"OgieГ…В„ spopieli me ciaГ…В‚o,\nWiatr siГ„В™ p..."


In [15]:
cv = CountVectorizer(strip_accents='ascii', lowercase=True, stop_words='english', analyzer='word')
cv.fit(training_data['Lyric'].values.astype('U'))
bow = cv.transform(training_data['Lyric'].values.astype('U')) 
print(bow.shape[0], 'samples x ',bow.shape[1],'words in vocabulary' )

6000 samples x  46215 words in vocabulary


In [16]:
models = {'Logistic Regression' : LogisticRegression(max_iter = 500), 'Linear SVC' : LinearSVC(max_iter = 10000),
          'Decision Tree' : DecisionTreeClassifier(), 'Gradient Descent' : SGDClassifier()}

In [17]:
for model in models.keys():
    print ("Training", model)
    models[model].fit(bow.toarray(), training_data['Genre'])

Training Logistic Regression
Training Linear SVC




Training Decision Tree
Training Gradient Descent


In [18]:
test_data = pd.concat([pop_songs.iloc[SONGS_PER_TRAINING:SONGS_PER_TRAINING+SONGS_PER_TESTING], rap_songs.iloc[SONGS_PER_TRAINING:SONGS_PER_TRAINING+SONGS_PER_TESTING], metal_songs.iloc[SONGS_PER_TRAINING:SONGS_PER_TRAINING+SONGS_PER_TESTING]])
test_bow = cv.transform(test_data['Lyric'].values.astype('U'))

In [19]:
accuracy = {}
pg = test_data[['Artist', 'Song','Genre']]
for model in models.keys():
    print("Evaluating", model)
    pred_genre = models[model].predict(test_bow.toarray())
    pg[model] = pred_genre
    accuracy[model] = accuracy_score(test_data['Genre'], pred_genre)
print ("---Accuracy Scores---")
print(accuracy)

Evaluating Logistic Regression
Evaluating Linear SVC
Evaluating Decision Tree
Evaluating Gradient Descent
---Accuracy Scores---
{'Logistic Regression': 0.8083333333333333, 'Linear SVC': 0.7866666666666666, 'Decision Tree': 0.6966666666666667, 'Gradient Descent': 0.8}


In [20]:
import statistics
def pred(x):
    user_input = x
    user_bow = cv.transform([cleanse(user_input)]) 
    arr = []
    for model in models.keys():
        pred = models[model].predict(user_bow)[0]
        arr.append(pred)
    print(arr)
    print(statistics.mode(arr))

In [21]:
pred(input())

To be young and in love in New York City (in New York City) To not know who I am but still know that I'm good long as you're here with me To be drunk and in love in New York City (in New York City) Midnight into morning coffee Burning through the hours talking Damn, I like me better when I'm with you I like me better when I'm with you I knew from the first time, I'd stay for a long time 'cause I like me better when I like me better when I'm with you I don't know what it is but I got that feeling (got that feeling) Waking up in this bed next to you swear the room Yeah, it got no ceiling If we lay, let the day just pass us by I might get to too much talking I might have to tell you something
['Pop', 'Pop', 'Pop', 'Pop']
Pop


In [22]:
import pickle
pickle.dump(models, open("GnereClassificationModel.pkl", "wb"))
pickle.dump(cv, open("GenreClassificationCV.pkl", "wb"))