In [7]:
# The goal of this notebook is to lay the foundation
# for two types of supervised machine learning classifiers.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np



In [None]:
## This pipeline starts from the point where we already have a pandas dataframe containing song attributes and genre.
## step 1: break the data into a training set and a test set
## step 2: train a classifier on the training data
## step 3: evaluate the classifier's performance on the test data

In [13]:
## Step 1: Break the data into a training and a test set
def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    Shuffle a dataframe and split it into a training set and a test set.

    inputs:
        tracks_df: a dataframe containing song attributes and genre.
        random_val: seed passed to DataFrame.sample for repeatability.
        split_ratio: decimal pct of samples to use for training (0.0 to 1.0).
    returns:
        a list of two dataframes, [train_df, test_df]. The first
        int(split_ratio * len(tracks_df)) shuffled rows form train_df and
        the remaining rows form test_df.
    raises:
        ValueError if split_ratio is not between 0 and 1.
    """
    # A ratio outside [0, 1] would otherwise produce an empty or oddly
    # sliced split without any error, so reject it up front.
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f'split_ratio must be between 0 and 1, got {split_ratio}')

    # step 1 shuffle the df (frac=1.0 keeps every row, just reordered)
    shuffled_df = tracks_df.sample(random_state=random_val, frac=1.0)
    # establish a number to split the frame at.
    num_train_samples = int(split_ratio * len(shuffled_df))
    # split the DF into two sets train and test with positional slicing;
    # np.split on a DataFrame relies on DataFrame.swapaxes, which pandas
    # has deprecated, so plain .iloc slices are used instead.
    return [
        shuffled_df.iloc[:num_train_samples],
        shuffled_df.iloc[num_train_samples:],
    ]


In [30]:
# Step 2: train a classifier using the training set

def train_logistic_regression(train_df, random_val=42, max_iter=5500):
    """
    Fit a logistic regression classifier that predicts 'genre'.

    inputs:
        train_df: A dataframe of training data. The 'genre' column is the
            target; every other column is used as a feature.
        random_val: A random value for repeatability
        max_iter: maximum number of solver iterations (defaults to the
            value this notebook has always used).

    returns:
        a trained classifier.
    """
    # Step 1 create X and y
    feature_cols = [col for col in train_df.columns if col != 'genre']
    X = train_df[feature_cols]
    y = train_df['genre']

    # Step 2 create the classifier
    # multi_class is intentionally not passed: 'auto' is already the default
    # and scikit-learn has deprecated the parameter.
    clf = LogisticRegression(random_state=random_val, solver='lbfgs', max_iter=max_iter)

    # fit the classifier
    return clf.fit(X, y)

In [5]:
# Step 3: train a random forest classifier
def train_random_forest(train_df, random_val=42, n_estimators=50, max_depth=15):
    """
    Fit a random forest classifier that predicts 'genre'.

    inputs:
        train_df: A dataframe of training data. The 'genre' column is the
            target; every other column is used as a feature.
        random_val: A random value for repeatability
        n_estimators: number of trees in the forest (default keeps the
            value this notebook has always used).
        max_depth: maximum depth of each tree (default keeps the value
            this notebook has always used).

    returns:
        a trained classifier.
    """
    # Step 1 create X and y
    feature_cols = [col for col in train_df.columns if col != 'genre']
    X = train_df[feature_cols]
    y = train_df['genre']

    # Step 2 create the classifier
    clf = RandomForestClassifier(
        random_state=random_val,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )

    # fit the classifier
    return clf.fit(X, y)

In [18]:
# Step 4: test the classifiers

def test_regression_model(clf, test_df):
    """
    inputs:
        testing set
        clf
    returns:
        f1 score
    """
    # step 1 create an X_test and y_test
    X_test = test_df[[col for col in test_df.columns if col != 'genre']]
    y_test = test_df['genre']

    # step 2 predict the genre for the test set
    y_pred = clf.predict(X_test)

    # step 3 calculate the average F1 score for all classes
    return f1_score(y_test, y_pred, average='macro')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=659c715d-e2b5-478e-9116-4d32a5174810' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [34]:
# Create a dummy classifier for comparison

def create_dummy(train_df, random_val=42):
    """
    Train a uniform dummy classifier for performance evaluation

    inputs:
        train_df: A dataframe of training data; 'genre' is the target and
            every other column is passed in as a feature
        random_val: A random value for repeatability

    returns:
        the fitted dummy classifier.
    """
    feature_cols = [name for name in train_df.columns if name != 'genre']
    features = train_df[feature_cols]
    labels = train_df['genre']

    # baseline built with strategy='uniform', used as a comparison point
    baseline = DummyClassifier(strategy='uniform', random_state=random_val)
    return baseline.fit(features, labels)

In [33]:
# development cell for integrating all functions together:
# load -> split -> train each classifier -> score on the test split -> report.
raw_df = pd.read_csv('development_data.csv')
train, test = generate_train_test(raw_df, random_val=42, split_ratio=0.8)

# fit the two real models and the uniform dummy baseline on the same training split
lr_clf = train_logistic_regression(train, random_val=42)
rf_clf = train_random_forest(train)
dum_clf = create_dummy(train)

# score every fitted model on the test split, in the order they were trained
lr_score, rf_score, dum_score = [
    test_regression_model(fitted, test) for fitted in (lr_clf, rf_clf, dum_clf)
]

print(f'random forest: {rf_score}')
print(f'logistic regression: {lr_score}')
print(f'dummy classifier: {dum_score}')

NameError: name 'create_dummy' is not defined