In [6]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from MyImplementations.SoftmaxRegression import SoftmaxRegression
from utils import (
    music_genre_clean_up,
    divide_dataframe,
    get_preprocessor
)

# Relative path of the raw music-genre dataset.
DATA_PATH = '../data/music_genre.csv'

# Read the CSV, then hand the frame to the project clean-up helper. The
# helper's return value is not used, so it presumably cleans `df` in place
# -- TODO confirm against utils.music_genre_clean_up.
df = pd.read_csv(DATA_PATH)
music_genre_clean_up(df)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


In [7]:
# Name of the label column that the models will predict.
TARGET = 'music_genre'

# Project helper: returns four values, unpacked here as the feature frame
# `X`, the label column `y`, and two lists of column names (`num`, `cat`).
# Judging by the printed lists these are the numeric and the categorical
# feature columns -- confirm against utils.divide_dataframe.
X, y, num, cat = divide_dataframe(df, TARGET)

# Show both column-name lists, one per line.
print(num, cat, sep='\n')

['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['key', 'mode']


In [8]:
# Build the preprocessing step from the numeric and categorical column-name
# lists. The result is used below as the 'preprocessor' step of each Pipeline.
# NOTE(review): presumably a scikit-learn transformer (e.g. a
# ColumnTransformer) -- confirm against utils.get_preprocessor.
preprocessor = get_preprocessor(num, cat)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the remaining 80% is the training set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Halve the hold-out set into validation and test parts, so the overall
# split is 80% train / 10% validation / 10% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=42,
)

In [10]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score

# Seed for the stochastic SGD baseline; same value as the one used for the
# train/validation/test split so the whole notebook is reproducible.
RANDOM_STATE = 42

# Penalties to compare; None means "no regularization".
regularization_methods = [None, 'l1', 'l2']


def _fit_and_report(title, model):
    """Fit `model` behind a fresh preprocessor and print its accuracies.

    Parameters
    ----------
    title : str
        Heading printed above the two accuracy lines.
    model : estimator
        Unfitted classifier used as the final pipeline step.

    Returns
    -------
    Pipeline
        The pipeline fitted on `X_train` / `y_train`.

    Notes
    -----
    Reads the notebook globals `preprocessor`, `X_train`, `y_train`,
    `X_val` and `y_val`.
    """
    # Pipeline does not copy its steps, so each pipeline gets its own clone
    # of the preprocessor instead of all of them refitting one shared
    # instance in place.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model),
    ])
    pipeline.fit(X_train, y_train)

    train_pred = pipeline.predict(X_train)
    val_pred = pipeline.predict(X_val)

    print(f'{title}:')
    print(f'Train accuracy: {accuracy_score(y_train, train_pred)}')
    print(f'Validation accuracy: {accuracy_score(y_val, val_pred)}')

    return pipeline


for penalty in regularization_methods:
    print(f'{penalty} regularization')
    print('='*15)

    # NOTE(review): SGDClassifier defaults to hinge loss (a linear SVM), and
    # `alpha` is only passed to this baseline -- confirm that this is the
    # intended reference for a softmax-regression implementation.
    # random_state pins the SGD shuffling so the reported numbers are
    # repeatable across runs.
    build_in = _fit_and_report(
        'Build in',
        SGDClassifier(
            penalty=penalty,
            alpha=0.01,
            max_iter=100,
            random_state=RANDOM_STATE,
        ),
    )

    print('='*15)

    my = _fit_and_report(
        'My implementation',
        SoftmaxRegression(penalty=penalty),
    )

    print('='*30)



None regularization
Build in:
Train accuracy: 0.474075
Validation accuracy: 0.4782
My implementation:
Train accuracy: 0.532625
Validation accuracy: 0.5358
l1 regularization
Build in:
Train accuracy: 0.387575
Validation accuracy: 0.381
My implementation:
Train accuracy: 0.5036
Validation accuracy: 0.506
l2 regularization
Build in:
Train accuracy: 0.435225
Validation accuracy: 0.4356
My implementation:
Train accuracy: 0.51525
Validation accuracy: 0.5152
