# Predição de gênero musical
## Adequar os dados de texto ao formato de entrada no modelo
### Importar ferramentas e carregar dados

In [24]:
import re
import numpy as np
import pandas as pd
from itertools import chain
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
artists = pd.read_csv('artists-data.csv')
lyrics = pd.read_csv('lyrics-data.csv')

In [3]:
artists.head()

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/


In [4]:
lyrics.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [5]:
artists['Genres'].unique()

array(['Pop; Axé; Romântico', 'Axé', 'Axé; Romântico; Reggae', ...,
       'World Music; Indie; New Age', 'World Music; Gospel/Religioso',
       'World Music; Black Music; Blues'], dtype=object)

In [6]:
genres_list = [str(s).split(';') for s in artists['Genres'].unique()] #criar lista de listas com os valores das colunas ex. [rock, axe, pop]
res = list(chain(*genres_list))#juntar as sublistas
res = [gen.strip() for gen in res]#tirar os espaços no começo e final das strings
genres = list(set(res))#valores "planificados" listados
print(genres)

['Trip-Hop', 'Piseiro', 'Punk Rock', 'Infantil', 'New Age', 'Rockabilly', 'Pop', 'Pop/Punk', 'Tecnopop', 'Ska', 'Lo-fi', 'Forró', 'Funk', 'Kizomba', 'Folk', 'MPB', 'Romântico', 'Rock', 'Sertanejo', 'Trance', 'Disco', 'New Wave', 'Classic Rock', 'Axé', 'Electronica', 'nan', 'Fado', 'Hip Hop', 'Blues', 'Power-Pop', 'Black Music', 'Soft Rock', 'Progressivo', 'Gótico', 'Chillout', 'Heavy Metal', 'Indie', 'Dance', 'Surf Music', 'Emocore', 'Músicas Gaúchas', 'Post-Rock', 'Trap', 'Jazz', 'Hardcore', 'Samba Enredo', 'Grunge', 'J-Pop/J-Rock', 'Instrumental', 'Pop/Rock', 'Velha Guarda', 'K-Pop/K-Rock', 'Pagode', 'Regional', 'Jovem Guarda', 'Gospel/Religioso', 'Reggaeton', 'Funk Carioca', 'Pós-Punk', 'Country', 'Clássico', 'Industrial', 'Samba', 'Urban', 'Piano Rock', 'Reggae', 'Psicodelia', 'Electro Swing', 'Hard Rock', 'World Music', 'Soul Music', 'COLETÂNEA', 'Rock Alternativo', 'Trilha Sonora', 'R&B', 'Rap', 'Bossa Nova', 'House', 'Tropical House', 'Metal']


In [7]:
#transformar presença/ausencia de determinado genero na linha do artista em valores booleanos, concatenar ao dataframe artistas
for genre in genres:
    all_genres_bool = [True if re.search(genre, str(art_gen)) else False for art_gen in artists['Genres']]
    artists[genre] = all_genres_bool

In [8]:
artists.head()

Unnamed: 0,Artist,Genres,Songs,Popularity,Link,Trip-Hop,Piseiro,Punk Rock,Infantil,New Age,...,Soul Music,COLETÂNEA,Rock Alternativo,Trilha Sonora,R&B,Rap,Bossa Nova,House,Tropical House,Metal
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Trabalhar só com as músicas em inglês e do tipo Country:

In [9]:
country_artists = artists[artists['Country'] == True]['Link'].unique()
en_songs = lyrics[lyrics['language']=='en']

In [10]:
all_songs = en_songs.merge(artists, how = 'outer', left_on='ALink', right_on = 'Link')

In [11]:
all_songs.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language,Artist,Genres,Songs,Popularity,Link,...,Soul Music,COLETÂNEA,Rock Alternativo,Trilha Sonora,R&B,Rap,Bossa Nova,House,Tropical House,Metal
0,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,...,False,False,False,False,False,False,False,False,False,False
1,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,...,False,False,False,False,False,False,False,False,False,False
2,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,...,False,False,False,False,False,False,False,False,False,False
3,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,...,False,False,False,False,False,False,False,False,False,False
4,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,...,False,False,False,False,False,False,False,False,False,False


### Selecionar subconjunto do total de artistas:

In [12]:
#formar o dataset para o modelo
no_country = all_songs[all_songs['Country']==False].sample(n = 2500, random_state = 1)
country = all_songs[all_songs['Country']==True].sample(n = 2500, random_state = 1)

In [13]:
all_songs = pd.concat([no_country,country]).reset_index()
all_songs.head()

Unnamed: 0,index,ALink,SName,SLink,Lyric,language,Artist,Genres,Songs,Popularity,...,Soul Music,COLETÂNEA,Rock Alternativo,Trilha Sonora,R&B,Rap,Bossa Nova,House,Tropical House,Metal
0,105195,/foster-the-people/,Houdini (RAC Remix),/foster-the-people/houdini-rac-remix.html,"Rise above, gonna start the war\nOh, what you ...",en,Foster The People,Indie; Rock Alternativo,66.0,10.8,...,False,False,True,False,False,False,False,False,False,False
1,152794,/magazine/,Twenty Years Ago,/magazine/twenty-years-ago.html,You turn pandemonium\ninto pantomime for one\n...,en,Magazine (UK),Pós-Punk,50.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,104322,/lana-del-rey/,Pretty When You Cry,/lana-del-rey/pretty-when-you-cry.html,"All the pretty stars shine for you, my love\nA...",en,Lana Del Rey,Romântico; Indie,311.0,53.3,...,False,False,False,False,False,False,False,False,False,False
3,15543,/gary-moore/,Speak for Yourself,/gary-moore/speak-for-yourself.html,(Gary Moore/Neil Carter)\nLook around across t...,en,Gary Moore,Blues; Rock; Jazz,167.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,74615,/rainbow/,Magic,/rainbow/magic.html,There's a light in the sky that hangs on the e...,en,Rainbow,Hard Rock; Heavy Metal; Rock,75.0,0.0,...,False,False,False,False,False,False,False,False,False,True


### Tokenizar, remover stopwords e outras impurezas e vetorizar o dataset:

In [14]:
lyrics = all_songs['Lyric'].astype(str)
low =[lyr.lower() for lyr in lyrics]
tokenized = [word_tokenize(lyr) for lyr in low]

In [15]:
stop_words = set(stopwords.words('english'))
stop_vec = []
for tok in tokenized:
    filtered_stop = [w for w in tok if w not in stop_words]
    stop_vec.append(filtered_stop)

In [16]:
clean_vec = []
for lyr in stop_vec:
    clean_lyr = [word for word in lyr if word.isalpha()]
    clean_vec.append(clean_lyr)

In [17]:
lyrics = []
for lyr in clean_vec:
    lyr_text = ' '.join(lyr)
    lyrics.append(lyr_text)

vectorize = TfidfVectorizer(min_df = 5, max_df = 0.8)
vectors = vectorize.fit_transform(lyrics)
feature_names = vectorize.get_feature_names_out()
dense = vectors.todense()
dense_list = dense.tolist()
df = pd.DataFrame(dense_list, columns=feature_names)

In [18]:
df

Unnamed: 0,abandoned,abide,ability,able,aboard,abuse,accept,accident,account,accused,...,youth,yuh,yuletide,yup,zant,zero,zion,zip,zone,zoo
0,0.0,0.0,0.450677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
df['Country'] = all_songs['Country'].copy()

In [20]:
all_songs['Country'].value_counts()

False    2500
True     2500
Name: Country, dtype: int64

In [22]:
y = df['Country'].copy().astype(bool)
X = df.drop('Country',1)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.3, random_state = 42)

  X = df.drop('Country',1)


In [30]:
clf = MultinomialNB(alpha=.3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_validation)
score = accuracy_score(y_validation, predictions)
score

0.7546666666666667

In [31]:
print(confusion_matrix(y_validation, predictions))
print(classification_report(y_validation, predictions))

[[537 231]
 [137 595]]
              precision    recall  f1-score   support

       False       0.80      0.70      0.74       768
        True       0.72      0.81      0.76       732

    accuracy                           0.75      1500
   macro avg       0.76      0.76      0.75      1500
weighted avg       0.76      0.75      0.75      1500

