In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Reading in the data

Combined_Lyrics = pd.read_csv('Combined_Lyrics.csv')

In [19]:
# Creating a list of stop words, creating a lemmatizer

sw = stopwords.words('english')
new_words = ['1embedshare','2embedshare','3embedshare','4embedshare','5embedshare']
sw.extend(new_words)
wn = WordNetLemmatizer()
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
# Vectorizing lyrics data

cv = CountVectorizer(stop_words=sw, min_df=50)
X_cv = cv.fit_transform(Combined_Lyrics['Lyrics'])

In [21]:
# Converting the vectorized lyrics object to a dataframe
Vectorized_df = pd.DataFrame(X_cv.toarray(), columns = cv.get_feature_names_out())
# Appending on Artist and Genre
Vectorized_df = pd.concat([Vectorized_df, Combined_Lyrics[['Artist','Genre']]], axis = 1)
# Reordering the columns
cols_to_move = ['Artist', 'Genre']
Vectorized_df = Vectorized_df[ cols_to_move + [ col for col in Vectorized_df.columns if col not in cols_to_move ] ]

In [22]:
# Tfid Vectorzing lyrics data

tf = TfidfVectorizer(max_df = 0.95, min_df = 50)
X_tf = tf.fit_transform(Combined_Lyrics['Lyrics'])

In [25]:
# Converting tfid lyrics to object to a dataframe
Tfid_df = pd.DataFrame(X_tf.toarray(), columns=tf.get_feature_names())
Tfid_df = pd.concat([Tfid_df, Combined_Lyrics[['Artist','Genre']]], axis = 1)
# Reordering the columns
cols_to_move = ['Artist', 'Genre']
Tfid_df = Tfid_df[ cols_to_move + [ col for col in Tfid_df.columns if col not in cols_to_move ] ]

In [26]:
Tfid_df

Unnamed: 0,Artist,Genre,10,100,11,12,1embedshare,20,24,2embedshare,...,yo,york,youembedshare,young,younger,youth,yuh,yup,zero,zone
0,AC/DC,Rock,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AC/DC,Rock,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AC/DC,Rock,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AC/DC,Rock,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AC/DC,Rock,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12029,Whitney Houston,RnB,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12030,Whitney Houston,RnB,0.0,0.0,0.0,0.0,0.025165,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12031,Whitney Houston,RnB,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12032,Whitney Houston,RnB,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.06064,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
text = Tfid_df.drop(columns = 'Artist')
X = text.drop(columns = 'Genre')
y = text['Genre']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=61221, test_size=.4)

In [51]:
# Import Packages

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib.pyplot import figure

# Naive Bayes

In [52]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB()

In [53]:
y_hat_nb = nb.predict(X_test)
y_prob_nb = nb.predict_proba(X_test)[:,1]

In [72]:
matrix = confusion_matrix(y_test, y_hat_nb)
matrix.diagonal()/matrix.sum(axis=0)

[[ 599    1   28  105  235]
 [  63   15   44  170  228]
 [  13    0  944   53   50]
 [  41    2   61  498  353]
 [  92    4   22  158 1035]]


array([0.74133663, 0.68181818, 0.85896269, 0.50609756, 0.54445029])

In [55]:
metrics.accuracy_score(y_test, y_hat_nb)

0.642085583714167

In [58]:
f1_score(y_test, y_hat_nb, average = 'weighted')

0.6115767657988109

In [69]:
metrics.recall_score(y_test, y_hat_nb, average = 'weighted')

0.642085583714167

In [62]:
y_hat_nb

array(['Country', 'Country', 'Rock', ..., 'Rap', 'Country', 'RnB'],
      dtype='<U7')

In [63]:
y_prob_nb

array([0.11920377, 0.1017145 , 0.05562252, ..., 0.04806196, 0.1251764 ,
       0.06777121])

Unnamed: 0,10,100,11,12,1embedshare,20,24,2embedshare,30,3embedshare,...,yo,york,youembedshare,young,younger,youth,yuh,yup,zero,zone
3717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
7502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038430,0.0,0.0,0.033527,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
10415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
7528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.062412,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
Tfid_df.columns[1513]

'nerve'