In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

In [16]:
# Rescale column between range -1 & 1 using Maximum Absolute Scaling
def max_absolute_scaling(df):
    for col in df.columns:
        try:
            df[col] = df[col] / df[col].abs().max()
        except TypeError:
            pass
    return df

def cleaning(df):
    # Drop columns
    drop_columns = ['Unnamed: 0','track_id','artists','album_name','track_name','explicit','key','mode','time_signature','popularity']
    df.drop(columns=drop_columns,axis=1,inplace=True)
    # Scale df
    df = max_absolute_scaling(df)
    return df

In [17]:
df = pd.read_csv(Path('../data/dataset.csv'))
df = cleaning(df)
df

Unnamed: 0,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
0,0.044043,0.686294,0.4610,-0.136198,0.148187,0.032329,0.000001,0.3580,0.718593,0.361245,acoustic
1,0.028566,0.426396,0.1660,-0.347964,0.079067,0.927711,0.000006,0.1010,0.268342,0.318397,acoustic
2,0.040255,0.444670,0.3590,-0.196523,0.057720,0.210843,0.000000,0.1170,0.120603,0.313643,acoustic
3,0.038557,0.270051,0.0596,-0.373806,0.037617,0.908635,0.000071,0.1320,0.143719,0.746758,acoustic
4,0.037969,0.627411,0.4430,-0.195453,0.054508,0.470884,0.000000,0.0829,0.167839,0.492863,acoustic
...,...,...,...,...,...,...,...,...,...,...,...
113995,0.073511,0.174619,0.2350,-0.330964,0.043731,0.642570,0.928000,0.0863,0.034070,0.517705,world-music
113996,0.073511,0.176650,0.1170,-0.369829,0.041554,0.997992,0.976000,0.1050,0.035176,0.350242,world-music
113997,0.051833,0.638579,0.3290,-0.219963,0.043523,0.870482,0.000000,0.0839,0.746734,0.543933,world-music
113998,0.054206,0.595939,0.5060,-0.219842,0.030777,0.382530,0.000000,0.2700,0.415075,0.558651,world-music


In [5]:
X = df.drop(columns=['track_genre'],axis=1)
y = df['track_genre']
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

model_pipeline = []
model_pipeline.append(LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200))
model_pipeline.append(KNeighborsClassifier())
model_pipeline.append(DecisionTreeClassifier())
model_pipeline.append(GaussianNB())
model_pipeline.append(RandomForestClassifier())

for model in model_pipeline:
    kfold = StratifiedKFold(n_splits=3, random_state=100, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    print(f'{model}\t\t\tScore: {cv_results.mean()}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200, multi_class='multinomial')			Score: 0.17985745614035087
KNeighborsClassifier()			Score: 0.19555921052631578
DecisionTreeClassifier()			Score: 0.21222587719298247
GaussianNB()			Score: 0.16483552631578946
RandomForestClassifier()			Score: 0.32528508771929826


Randomforest performed the best. Going to use Randomized Search to search for optimal 'best' parameters

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Number of trees in random forest
n_estimators = [100,200,300,400]
# Maximum number of levels in tree
max_depth = [10,20,30,40]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, n_iter=20 ,param_distributions = random_grid, cv = 3, verbose=2, random_state=42, n_jobs = 1)
rf_random.fit(X_train, Y_train)
print(rf_random.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  51.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  51.6s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  51.9s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  36.3s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  35.9s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  36.5s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  45.1s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  4

In [8]:
def evaluate(model):
    y_pred = model.predict(X_test)
    
    accuracy = metrics.accuracy_score(Y_test,y_pred)

    print('Model Performance: ',model)
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('Precision = ',metrics.precision_score(Y_test,y_pred,average='micro'))
    print('Recall = ',metrics.recall_score(Y_test,y_pred,average='micro'))
    print('F1 Score = ',metrics.f1_score(Y_test,y_pred,average='micro'))

    return accuracy

base_model = RandomForestClassifier()
base_model.fit(X_train,Y_train)
base_accuracy = evaluate(base_model)

rf_random_accuracy = evaluate(rf_random.best_estimator_)

print('Improvement of {:0.2f}%.'.format( 100 * (rf_random_accuracy - base_accuracy) / base_accuracy))

Model Performance:  RandomForestClassifier()
Accuracy = 0.32%.
Precision =  0.31877192982456143
Recall =  0.31877192982456143
F1 Score =  0.31877192982456143
Model Performance:  RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_split=5,
                       n_estimators=300)
Accuracy = 0.33%.
Precision =  0.3279385964912281
Recall =  0.3279385964912281
F1 Score =  0.3279385964912281
Improvement of 2.88%.


In [11]:
from sklearn.metrics import accuracy_score

def evaluate(model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test,y_pred)

    print('Model Performance: ',model)
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [18]:
base_model = RandomForestClassifier(n_estimators= 300, min_samples_split= 5, min_samples_leaf= 1, max_depth= 30, bootstrap= False)
base_model.fit(X_train,Y_train)
base_accuracy = evaluate(base_model)

Model Performance:  RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_split=5,
                       n_estimators=300)
Accuracy = 0.33%.
Precision =  0.3275438596491228
Recall =  0.3275438596491228
F1 Score =  0.3275438596491228
