In [1]:
# The goal of this notebook is to lay the foundation
# for two types of supervised machine learning classifiers.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np



In [2]:
## This pipeline starts at the point where we already have a pandas DataFrame containing song attributes and genre
## step 1: break the data into training and test data
## step 2: generate a classifier using the training data
## step 3: test the performance using the test data

In [3]:
## Step 1: Break the data into a training and a test set
def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    Shuffle a dataframe of tracks and split it into training and test sets.

    inputs:
        tracks_df: a dataframe containing song attributes and genre.
        random_val: seed passed to DataFrame.sample for repeatability.
        split_ratio: decimal pct of samples to use for training;
            must lie in the range [0, 1].
    returns:
        a list of two dataframes [train_df, test_df]. The first
        int(split_ratio * len(tracks_df)) shuffled rows are the training
        set, the remaining rows are the test set. Rows keep their
        original index labels.
    raises:
        ValueError if split_ratio is outside [0, 1].
    """
    # Reject ratios that cannot describe a fraction of the data (also catches NaN).
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(
            "split_ratio must be between 0 and 1, got {!r}".format(split_ratio)
        )

    # step 1 shuffle the df (frac=1.0 returns every row, in random order)
    shuffled_df = tracks_df.sample(frac=1.0, random_state=random_val)

    # step 2 establish the row position to split the frame at
    num_train_samples = int(split_ratio * len(shuffled_df))

    # step 3 split positionally; .iloc is used because the shuffled index
    # labels are no longer in order.
    train_df = shuffled_df.iloc[:num_train_samples]
    test_df = shuffled_df.iloc[num_train_samples:]
    return [train_df, test_df]


In [4]:
# Step 2: train a classifier using the training set

def train_logistic_regression(train_df, random_val=42):
    """
    Fit a logistic regression genre classifier on the training data.

    inputs:
        train_df: a dataframe of training data; the 'genre' column is the
            label and every other column is used as a feature.
        random_val: a random value for repeatability, passed to the
            classifier as random_state.

    returns:
        the result of fitting the classifier on the training data
        (the trained classifier).
    """
    # Every column except the label is a feature.
    feature_names = [name for name in train_df.columns if name != 'genre']
    features = train_df[feature_names]
    labels = train_df['genre']

    # Build the (unfitted) model with a fixed seed.
    model = LogisticRegression(
        random_state=random_val,
        solver='lbfgs',
        multi_class='auto',
    )

    # Fit and hand back whatever fit() returns.
    return model.fit(features, labels)

In [5]:
# Step 3: test the classifier

def test_logistic_regression(clf, test_df):
    """
    inputs:
        testing set
        clf
    returns:
        f1 score
    """
    # step 1 create an X_test and y_test
    X_test = test_df[[col for col in train_df.columns if col != 'genre']]
    y_test = test_df['genre']

    # step 2 predict the genre for the test set
    y_pred = clf.predict(X_test)

    # step 3 calculate the average F1 score for all classes
    return f1_score(y_test, y_pred, average='macro')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=659c715d-e2b5-478e-9116-4d32a5174810' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>