## Notebook to compute the validation and test accuracy of the most common genre baseline.

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin

# Load data
df = pd.read_csv("scraped-lyrics-v2-preprocessed.csv")

# Find most common genre
# Series.mode() counts rows per category directly, so the result does not
# depend on whether some other column (e.g. `genres`) contains nulls.
# mode() returns the modal values in sorted order; taking the first one keeps
# ties deterministic.
# NOTE: this is computed over the whole file, i.e. it also sees rows that
# later end up in the validation and test splits.
most_common_genre = df["category"].mode().iloc[0]
print(f"The most common genre is: {most_common_genre}")

The most common genre is: Country


## Create most common genre baseline

In [42]:
class MostCommonGenre(BaseEstimator, ClassifierMixin):
    """Baseline classifier that predicts one fixed genre for every sample.

    Parameters
    ----------
    most_common_genre : label, optional
        Genre to predict for every sample. When left as ``None``, ``fit``
        uses the most frequent label found in ``y`` instead.

    Attributes
    ----------
    most_common_genre_ : label
        Genre chosen by ``fit`` (either the constructor value or the most
        frequent label in the training targets).
    """

    def __init__(self, most_common_genre=None):
        # Stored under the parameter's own name: scikit-learn's get_params()
        # and clone() look constructor arguments up as attributes of the
        # same name.
        self.most_common_genre = most_common_genre

    @property
    def mcg(self):
        """Alias for ``most_common_genre``, kept for code that uses ``.mcg``."""
        return self.most_common_genre

    @mcg.setter
    def mcg(self, value):
        self.most_common_genre = value

    def fit(self, X, y):
        """Record the genre to predict and return ``self``.

        ``X`` is ignored. If no genre was given to the constructor, the most
        frequent label in ``y`` is used (the first one seen wins a tie).

        Raises
        ------
        ValueError
            If no genre was given to the constructor and ``y`` is empty.
        """
        if self.most_common_genre is not None:
            self.most_common_genre_ = self.most_common_genre
            return self

        from collections import Counter

        label_counts = Counter(y)
        if not label_counts:
            raise ValueError("Cannot determine the most common genre from empty y.")
        self.most_common_genre_ = label_counts.most_common(1)[0][0]
        return self

    def predict(self, X):
        """Return a list holding the fixed genre once per sample in ``X``.

        Raises
        ------
        ValueError
            If no genre was given to the constructor and ``fit`` was not called.
        """
        genre = self.most_common_genre
        if genre is None:
            genre = getattr(self, "most_common_genre_", None)
        if genre is None:
            raise ValueError(
                "MostCommonGenre has no genre to predict: pass most_common_genre "
                "or call fit first."
            )
        # Count rows, not iteration items: iterating a DataFrame yields its
        # column labels, which would give one prediction per column.
        n_samples = X.shape[0] if hasattr(X, "shape") else sum(1 for _ in X)
        return [genre] * n_samples

## Split the data into train-val-test subsets

In [45]:
lyrics = df.lyrics.tolist()
genres = df.category.tolist()

# Fixed seed so that re-running the notebook reproduces the same split
# (and therefore the same accuracies below).
RANDOM_STATE = 42

# First carve out 15% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    lyrics,
    genres,
    test_size=0.15,
    random_state=RANDOM_STATE,
)
# Then take 15% of the remaining rows as the validation set
# (i.e. roughly 0.85 * 0.15 = 12.75% of all rows).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=RANDOM_STATE,
)

# Sanity check: the three subsets partition the data.
assert len(X_train) + len(X_val) + len(X_test) == len(lyrics)

print(f"Train size: {len(X_train)/len(lyrics)}, Val size: {len(X_val)/len(lyrics)}, Test size: {len(X_test)/len(lyrics)}")

Train size: 0.722491868049524, Val size: 0.12750557741105945, Test size: 0.15000255453941655


## Compute the validation and test accuracy

In [49]:
# Build the constant-prediction baseline once and reuse it for both
# held-out splits.
mcg = MostCommonGenre(most_common_genre)

# Validation split: compare the constant predictions with the true genres.
val_acc = accuracy_score(y_true=y_val, y_pred=mcg.predict(X_val))
print(f"Validation accuracy: {val_acc}")

# Test split.
y_pred = mcg.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test accuracy: {test_acc}")

Validation accuracy: 0.12621877921731
Test accuracy: 0.13521798365122617
