In [1]:
import pandas as pd
from utils import (
    music_genre_clean_up,
    divide_dataframe,
    get_preprocessor
)

df = pd.read_csv('../data/music_genre.csv')
music_genre_clean_up(df)

df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


In [2]:
TARGET = 'music_genre'

X, y, num, cat = divide_dataframe(df, TARGET)

print(num)
print(cat)

['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['key', 'mode']


In [3]:
preprocessor = get_preprocessor(num, cat)

## Cross Validation with StratifiedKFold
I will use my Logistic regression, Random Forest, SVC and K nearest neighbours

In [4]:
from sklearn.pipeline import Pipeline
from MyImplementations.SoftmaxRegression import SoftmaxRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

skf = StratifiedKFold(n_splits=3, shuffle=True)

In [5]:
rand_forest = Pipeline([
    ('preprocessing', preprocessor),
    ('model', RandomForestClassifier())
])

In [6]:
svc = Pipeline([
    ('preprocessing', preprocessor),
    ('model', SVC())
])

In [7]:
knn = Pipeline([
    ('preprocessing', preprocessor),
    ('model', KNeighborsClassifier(n_neighbors=100))
])

In [8]:
my_log_reg = Pipeline([
    ('preprocessing', preprocessor),
    ('model', SoftmaxRegression())
])

In [9]:
log_reg = Pipeline([
    ('preprocessing', preprocessor),
    ('model', SGDClassifier())
])

In [10]:
print(f'RandomForest: {cross_val_score(rand_forest, X, y, cv=skf, scoring='accuracy')}')
print(f'SVC: {cross_val_score(svc, X, y, cv=skf, scoring='accuracy')}')
print(f'100 Nearest neighbours: {cross_val_score(knn, X, y, cv=skf, scoring='accuracy')}')
print(f'My logistic regression: {cross_val_score(my_log_reg, X, y, cv=skf, scoring='accuracy')}')
print(f'Build in logistic regression: {cross_val_score(log_reg, X, y, cv=skf, scoring='accuracy')}')

RandomForest: [0.54652907 0.54688906 0.55160206]
SVC: [0.57550849 0.57592848 0.57614305]
100 Nearest neighbours: [0.52342953 0.5302094  0.51650066]
My logistic regression: [0.48947021 0.48929021 0.48973959]
Build in logistic regression: [0.44915102 0.42857143 0.4374175 ]


### Key takeaways:
- accuracy is similar for every subset => the data set is well-balanced