In [1]:
# Importando bibliotecas
import pandas as pd
import re
import nltk
from itertools import chain
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Importando Datasets
artists = pd.read_csv('Database/artists-data.csv')
lyrics = pd.read_csv('Database/lyrics-data.csv')

In [3]:
# Dataset artists
artists.head()

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/


In [4]:
# Dataset lyrics
lyrics.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [5]:
# Gêneros únicos
artists.Genres.unique()

array(['Pop; Axé; Romântico', 'Axé', 'Axé; Romântico; Reggae', ...,
       'World Music; Indie; New Age', 'World Music; Gospel/Religioso',
       'World Music; Black Music; Blues'], dtype=object)

In [6]:
# Pegando lista única de gêneros
genres_list = [str(s).split(';') for s in artists.Genres.unique()]
res = list(chain(*genres_list))
res = [gen.strip() for gen in res]
genres = list(set(res))
print(genres)

['Soul Music', 'Pós-Punk', 'Músicas Gaúchas', 'Pop/Punk', 'Funk Carioca', 'Punk Rock', 'Black Music', 'Hip Hop', 'House', 'Jazz', 'Clássico', 'Chillout', 'Infantil', 'Disco', 'Pagode', 'Kizomba', 'Rock Alternativo', 'Soft Rock', 'Piano Rock', 'Trap', 'Trip-Hop', 'Tropical House', 'Rockabilly', 'Electronica', 'World Music', 'Folk', 'COLETÂNEA', 'Romântico', 'Jovem Guarda', 'Electro Swing', 'Grunge', 'Pop/Rock', 'Bossa Nova', 'Samba', 'Surf Music', 'Power-Pop', 'Psicodelia', 'J-Pop/J-Rock', 'Gospel/Religioso', 'Samba Enredo', 'Funk', 'Emocore', 'Urban', 'Axé', 'Gótico', 'Post-Rock', 'Reggae', 'Dance', 'Country', 'Blues', 'Forró', 'Metal', 'Pop', 'New Wave', 'Ska', 'Classic Rock', 'MPB', 'R&B', 'Trance', 'Industrial', 'Sertanejo', 'Indie', 'Hard Rock', 'Progressivo', 'Regional', 'Heavy Metal', 'Hardcore', 'New Age', 'Piseiro', 'Rock', 'nan', 'Instrumental', 'Rap', 'Velha Guarda', 'K-Pop/K-Rock', 'Lo-fi', 'Reggaeton', 'Tecnopop', 'Trilha Sonora', 'Fado']


In [7]:
# Marcando na tabela à qual gênero pertence cada letra
for i in genres:
    contains = [True if re.search(i, str(art_gen)) else False for art_gen in artists.Genres]
    artists[i] = contains
    
artists.head()

Unnamed: 0,Artist,Genres,Songs,Popularity,Link,Soul Music,Pós-Punk,Músicas Gaúchas,Pop/Punk,Funk Carioca,...,nan,Instrumental,Rap,Velha Guarda,K-Pop/K-Rock,Lo-fi,Reggaeton,Tecnopop,Trilha Sonora,Fado
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Lista de artistas Pop
country_artists = artists[artists['Pop']==True]['Link'].unique()

# Dataset de músicas em inglês
en_songs = lyrics[lyrics['language']=='en']

# Unindo dataset de artistas com músicas em inglês
all_songs = en_songs.merge(artists, how='outer', left_on='ALink', right_on='Link')

# Retirando uma amostra para balancear dataset
no_pop = all_songs[all_songs['Pop']==False].sample(n=5000, random_state=123)
yes_pop = all_songs[all_songs['Pop']==True].sample(n=5000, random_state=123)

# Unindo os 2 datasets com target balanceada
all_songs = pd.concat([no_pop, yes_pop]).reset_index()
all_songs.head()

Unnamed: 0,index,ALink,SName,SLink,Lyric,language,Artist,Genres,Songs,Popularity,...,nan,Instrumental,Rap,Velha Guarda,K-Pop/K-Rock,Lo-fi,Reggaeton,Tecnopop,Trilha Sonora,Fado
0,70518,/ozzy-osbourne/,Scary Little Green Men,/ozzy-osbourne/scary-little-green-men.html,Their colors are blinding me again (I don't be...,en,Ozzy Osbourne,Rock; Hard Rock; Heavy Metal,188.0,4.4,...,False,False,False,False,False,False,False,False,False,False
1,169081,/steel-pulse/,Worth His Weight In Gold (rally Round),/steel-pulse/worth-his-weight-in-gold-rally-ro...,CHORUS\n------\n\nRally round the flag\nRally ...,en,Steel Pulse,Reggae,120.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,96303,/ja-rule/,Murder Reigns,/ja-rule/murder-reigns.html,Yea {*echoes*}\nI feel the rain comin down on ...,en,Ja Rule,Hip Hop; Rap; Black Music,168.0,4.3,...,False,False,True,False,False,False,False,False,False,False
3,16804,/john-mayalls-bluesbreakers/,Trenches,/john-mayalls-bluesbreakers/trenches.html,In the middle of summer nineteen fourteen\nThe...,en,John Mayall,Blues,53.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,25754,/hank-williams/,My Bucket's Got A Hole In It,/hank-williams/my-buckets-got-a-hole-in-it-2.html,Yeah my bucket's got a hole in it yeah my buck...,en,Hank Williams,Country,165.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Pegando todas as músicas
lyrics = all_songs['Lyric'].astype(str)

# Colocando as músicas em minúsculo
low = [i.lower() for i in lyrics]

In [10]:
# Tokenizando as músicas
tokenized = [word_tokenize(i) for i in low]

In [11]:
# Retirando as Stopwords
stopwords = set(stopwords.words('english'))
stop_vec = []
for i in tokenized:
    filtered_stop = [w for w in i if w not in stopwords]
    stop_vec.append(filtered_stop)

In [12]:
# Fazendo limpeza somente para palavras
clean_vec = []
for i in stop_vec:
    clean_lyr = [word for word in i if word.isalpha()]
    clean_vec.append(clean_lyr)

In [13]:
# Colocando palavras para a raiz
wnet = WordNetLemmatizer()
lem = []
for lyr in clean_vec:
    lem.append([wnet.lemmatize(w) for w in lyr])

In [14]:
# Criando matriz com as palavras
lyrics = []
for lyr in lem:
    lyr_text = ' '.join(lyr)
    lyrics.append(lyr_text)
    
vectorize = TfidfVectorizer(min_df=5, max_df=0.8)
vectors = vectorize.fit_transform(lyrics)
feature_names = vectorize.get_feature_names()
dense = vectors.todense()
dense_list = dense.tolist()
df = pd.DataFrame(dense_list, columns=feature_names)
df.head()

Unnamed: 0,aaah,aah,aaliyah,ab,abandon,abandoned,abc,abide,ability,ablaze,...,yup,zero,zevon,zion,zip,zombie,zone,zoo,zoom,zulu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07108,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Copiando as músicas Pop para o novo Dataframe
df['Pop'] = all_songs.Pop.copy()

In [16]:
# Pegando variável dependente e independente
X = df.drop('Pop', axis=1)
y = df.Pop

In [17]:
# Separar em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [18]:
# Criação do modelo
clf = MultinomialNB()
clf.fit(X_train, y_train.astype(bool))

MultinomialNB()

In [19]:
# varoável de predição
y_pred = clf.predict(X_test)

In [20]:
# Acurácia
print(metrics.accuracy_score(y_test.astype(bool), y_pred))

0.6283333333333333


In [22]:
# Matriz de confusão
print(metrics.confusion_matrix(y_test.astype(bool), y_pred))

[[ 700  785]
 [ 330 1185]]


In [23]:
# Reporte de classificação
print(metrics.classification_report(y_test.astype(bool), y_pred))

              precision    recall  f1-score   support

       False       0.68      0.47      0.56      1485
        True       0.60      0.78      0.68      1515

    accuracy                           0.63      3000
   macro avg       0.64      0.63      0.62      3000
weighted avg       0.64      0.63      0.62      3000

