In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

# Naive Bayes Classifier for Determining Whether a Song is Explicit

Our goal is to create a Naive Bayes Classifier model for determining whether a song is explicit given its lyrics. Our data was obtained using the Musixmatch API. 

We collected the top-charting songs from several English-speaking countries. For each of those songs, we retrieved the lyrics and whether the song is flagged as explicit, and then wrote that data to a .csv file.

First, we read in the data. Then, we stem the lyrics and remove stop words to simplify the input for our classifier.

In [None]:
# Load the track data; the first CSV column is used as the row index.
tracks = pd.read_csv("data/tracks.csv", index_col=0)
# Drop rows with any missing value, then renumber the remaining rows from 0
# without keeping the old index around as an extra column.
tracks = tracks.dropna().reset_index(drop=True)

# Stemming words and removing stop words
nltk.download('stopwords')
stemmer = PorterStemmer()

# Build the stop-word collection once, as a set. stopwords.words() returns a
# list, so calling it inside the loop rebuilt that list and scanned it linearly
# for every single word of every song.
englishStopWords = set(stopwords.words('english'))

# For each song: split on single spaces, drop stop words (compared in lower
# case), stem what is left, and join the stems back into one string.
stemmedLyricsList = [
    ' '.join(
        stemmer.stem(word)
        for word in lyrics.split(" ")
        if word.lower() not in englishStopWords
    )
    for lyrics in tracks['lyrics']
]
tracks = tracks.assign(stemmed_lyrics=stemmedLyricsList)
tracks


Next, we define our input features and output classes. This will be our stemmed lyrics and the explicit flag for each song.

In [None]:
# Input features (X): the stemmed lyric text of each song.
# Output classes (y): the `explicit` flag of each song.
X, y = tracks["stemmed_lyrics"], tracks["explicit"]

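Now, we define a function that converts the lyrics into word counts, splits them into training and test sets, trains a Naive Bayes classifier, and reports its metrics.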

In [None]:
# Naive Bayes Classifier Function

def naive_bayes(X,y, showConfusionMatrix=False, random_state=None):
    """Train and evaluate a multinomial Naive Bayes classifier on text.

    The documents in ``X`` are turned into word-count vectors, randomly split
    into training and test sets, and a ``MultinomialNB`` model is fit on the
    training part. Per-class precision, recall, f-score and support on the
    test part are printed and returned.

    Parameters
    ----------
    X : iterable of str
        One text document per sample.
    y : array-like
        Class label of each sample, aligned with ``X``.
    showConfusionMatrix : bool, default False
        When true, also plot the confusion matrix of the test predictions.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an int for a reproducible
        split; ``None`` gives a different random split on each call.

    Returns
    -------
    classifier : MultinomialNB
        The classifier fit on the training split.
    metrics : dict
        Keys ``"precision"``, ``"recall"``, ``"f-score"`` and ``"support"``,
        each holding the per-class values computed on the test split.
    """
    # Convert text to numerical features (word counts).
    # NOTE(review): the vocabulary is learned from all of X, including rows
    # that end up in the test split. Kept as-is so the returned classifier's
    # feature count matches a vectorizer fit on the full corpus.
    vectorizer = CountVectorizer()
    Xvec = vectorizer.fit_transform(X)

    # Split into training and testing
    trainX, testX, trainY, testY = train_test_split(Xvec, y, random_state=random_state)

    # Train the classifier
    classifier = MultinomialNB()
    classifier.fit(trainX, trainY)

    # Predict the test data
    predictY = classifier.predict(testX)

    # Get metrics for classifier (precision, recall, fscore, support)
    precision, recall, fscore, support = precision_recall_fscore_support(testY, predictY)
    metrics = {
        "precision": precision,
        "recall": recall,
        "f-score": fscore,
        "support": support,
    }

    for metric in metrics:
        print(f"{metric}: {metrics[metric]}")

    # Plot confusion matrix
    if showConfusionMatrix:
        # Assumes the two classes sort as (not explicit, explicit), e.g. 0/1 or
        # False/True - TODO confirm against the values in the data.
        labels = ["Not Explicit", "Explicit"]
        # confusion_matrix expects (y_true, y_pred); passing them in the other
        # order transposes the matrix relative to the plot's axis titles.
        confusionMatrix = confusion_matrix(testY, predictY)
        display = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=labels)
        display.plot()

    return classifier, metrics

Run the function to obtain the classifier and its metrics.

In [None]:
# Run naive bayes function once on a single random train/test split.
# It prints the per-class metrics and returns the fitted classifier together
# with the metrics dictionary.
classifier, metrics = naive_bayes(X,y)

Use 10-fold cross-validation to evaluate our classifier:

In [None]:
# k-fold cross-validation: every sample is used for testing exactly once.
kValue = 10
kfold = KFold(kValue, shuffle=True)
metrics_averages = {"precision": 0, "recall": 0, "f-score": 0, "support": 0}

# Vectorize once, outside the loop, using the same word-count features as
# naive_bayes (vocabulary learned from the whole corpus).
Xvec = CountVectorizer().fit_transform(X)
yValues = np.asarray(y)
# Fix the label order so every fold yields metric arrays of the same shape,
# even if a class happens to be missing from one fold's test part.
classLabels = np.unique(yValues)

for train, test in kfold.split(Xvec, yValues):
    # Train on this fold's training rows and evaluate on its held-out rows.
    # The original loop ignored these indices and re-ran a fresh random
    # train/test split instead, so no cross-validation took place.
    classifier = MultinomialNB()
    classifier.fit(Xvec[train], yValues[train])
    predictY = classifier.predict(Xvec[test])

    p, r, f, s = precision_recall_fscore_support(yValues[test], predictY, labels=classLabels)
    metrics = {"precision": p, "recall": r, "f-score": f, "support": s}

    for metric in metrics:
        print(f"{metric}: {metrics[metric]}")
        metrics_averages[metric] += metrics[metric]

# Turn the per-fold sums into per-class means and report them.
for metric in metrics_averages:
    metrics_averages[metric] = metrics_averages[metric]/kValue
    print(f"average {metric}: {metrics_averages[metric]}")