In [1]:
import pandas as pd

from SoftmaxRegression import SoftmaxRegression
from utils import (
    music_genre_clean_up,
    divide_dataframe,
    get_preprocessor
)

# Load the raw music-genre table from the repository's data folder.
DATA_PATH = 'data/music_genre.csv'
df = pd.read_csv(DATA_PATH)

# The helper's return value is not captured, so any clean-up must happen on
# `df` itself -- presumably in place; confirm in utils.music_genre_clean_up.
music_genre_clean_up(df)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


In [2]:
# Name of the label column.
TARGET = 'music_genre'

# Presumably returns the feature frame and the TARGET column as a pair;
# confirm in utils.divide_dataframe.
X, y = divide_dataframe(df, TARGET)

# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
num = X.select_dtypes(exclude=object).columns.tolist()
cat = X.select_dtypes(include=object).columns.tolist()

# One column list per line, numeric first.
print(num, cat, sep='\n')

['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['key', 'mode']


In [3]:
# Build the preprocessing step from the numeric and categorical column lists.
# The pipelines defined later in this notebook all use this same object as
# their first stage.
# NOTE(review): what the preprocessor actually does (scaling, encoding,
# imputation, ...) is defined in utils.get_preprocessor and is not visible here.
preprocessor = get_preprocessor(num, cat)
# Bare last expression so the notebook renders the object.
preprocessor

## Cross Validation with StratifiedKFold
I will compare my own softmax (multinomial logistic) regression implementation with Random Forest, SVC and K-nearest neighbours.

In [4]:
from sklearn.pipeline import Pipeline
from SoftmaxRegression import SoftmaxRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Stratified 3-fold splitter passed to every cross_val_score call in this notebook.
# With shuffle=True and no random_state the folds are redrawn on each call, so
# the models would be scored on different splits and the numbers would change
# on every run. Pinning the seed makes the comparison like-for-like and
# reproducible.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [5]:
# Random forest on the preprocessed features.
# The forest is a randomized estimator, so its seed is pinned (42, the same
# value the SGDClassifier pipelines in this notebook use); without it the
# scores differ between runs.
rand_forest = Pipeline([
    ('preprocessing', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])
# Per-fold accuracy; left as the last expression so the notebook displays it.
cross_val_score(rand_forest, X, y, cv=skf, scoring='accuracy')

array([0.54442911, 0.549889  , 0.54710188])

In [6]:
# Support-vector classifier with its default constructor arguments, fed by the
# shared preprocessing step.
svc_steps = [
    ('preprocessing', preprocessor),
    ('model', SVC()),
]
svc = Pipeline(steps=svc_steps)

# Per-fold accuracy using the shared splitter; displayed as the cell output.
svc_scores = cross_val_score(svc, X, y, scoring='accuracy', cv=skf)
svc_scores

array([0.5749085 , 0.57238855, 0.57710308])

In [7]:
# k-nearest-neighbours classifier voting over the 100 closest training points,
# fed by the shared preprocessing step.
knn_steps = [
    ('preprocessing', preprocessor),
    ('model', KNeighborsClassifier(n_neighbors=100)),
]
knn = Pipeline(steps=knn_steps)

# Per-fold accuracy using the shared splitter; displayed as the cell output.
knn_scores = cross_val_score(knn, X, y, scoring='accuracy', cv=skf)
knn_scores

array([0.52828943, 0.51790964, 0.52616105])

In [8]:
# The project's own SoftmaxRegression estimator (default constructor
# arguments), evaluated with the same preprocessing and folds as the
# scikit-learn models.
my_log_reg_steps = [
    ('preprocessing', preprocessor),
    ('model', SoftmaxRegression()),
]
my_log_reg = Pipeline(steps=my_log_reg_steps)

# Per-fold accuracy using the shared splitter; displayed as the cell output.
my_log_reg_scores = cross_val_score(my_log_reg, X, y, scoring='accuracy', cv=skf)
my_log_reg_scores

array([0.49379012, 0.49163017, 0.48709948])

### Key takeaways:
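- accuracy is similar across all folds => the performance estimate is stable (low variance between splits). Note that StratifiedKFold keeps the class proportions the same in every fold by construction, so this alone does not show that the data set is well-balanced; check the class counts of `y` to confirm that.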

In [25]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier

# Degree-2 polynomial expansion applied after preprocessing.
pf = PolynomialFeatures(degree=2)

# Linear model trained with SGD on the expanded features.
# NOTE(review): per the scikit-learn docs SGDClassifier defaults to
# loss='hinge' (a linear SVM), so despite the `log_reg` name this is not a
# logistic regression unless a log-loss is requested explicitly -- confirm intent.
log_reg_steps = [
    ('preprocessing', preprocessor),
    ('polynomial_features', pf),
    ('model', SGDClassifier(random_state=42)),
]
log_reg = Pipeline(steps=log_reg_steps)

# Per-fold accuracy using the shared splitter; displayed as the cell output.
log_reg_scores = cross_val_score(log_reg, X, y, scoring='accuracy', cv=skf)
log_reg_scores

array([0.50632987, 0.49787004, 0.48955958])

- No polynomial features: ~ array([0.45755085, 0.44945101, 0.45073803]) - a little underfitting
- Polynomial features degree 2: ~ array([0.50290994, 0.48539029, 0.48103924]) - spot on
- Polynomial features degree 3: ~ array([0.42245155, 0.46115078, 0.4149766 ])
- Polynomial features degree 4: not evaluated - it took too long to compute

In [26]:
from sklearn.model_selection import train_test_split

# First cut: 80 % of the rows for training, 20 % held out.
# NOTE(review): the split is not stratified on y, unlike the cross-validation above.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second cut: halve the held-out rows into validation and test sets.
# The intermediate gets its own name so X_test / y_test only ever refer to the
# final test split.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# Fit the polynomial SGD pipeline on the training split only; as the last
# expression, the fitted pipeline is also the cell's displayed output.
log_reg.fit(X_train, y_train)

In [27]:
from sklearn.metrics import accuracy_score
# Predict on the split the model was fitted on and on the validation split.
train_pred = log_reg.predict(X_train)
val_pred = log_reg.predict(X_val)

# Compute both accuracies first, then report them.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)

print(f'Train accuracy: {train_acc}')
print(f'Validation accuracy: {val_acc}')

Train accuracy: 0.4878
Validation accuracy: 0.4782


## L1 and L2 regularization

In [28]:
# Same preprocessing and polynomial expansion as `log_reg`, but the SGD model
# is given an L1 penalty.
l1_steps = [
    ('preprocessing', preprocessor),
    ('polynomial_features', pf),
    ('model', SGDClassifier(random_state=42, penalty='l1')),
]
log_reg_l1 = Pipeline(steps=l1_steps)

# Fit on the training split; the fitted pipeline is the cell's displayed output.
log_reg_l1.fit(X_train, y_train)

In [29]:
# Predictions of the L1-penalised pipeline on the training and validation splits.
train_pred_l1 = log_reg_l1.predict(X_train)
val_pred_l1 = log_reg_l1.predict(X_val)

# Compute both accuracies first, then report them.
train_acc_l1 = accuracy_score(y_train, train_pred_l1)
val_acc_l1 = accuracy_score(y_val, val_pred_l1)

print(f'Train accuracy: {train_acc_l1}')
print(f'Validation accuracy: {val_acc_l1}')

Train accuracy: 0.528775
Validation accuracy: 0.5128


In [32]:
# Same preprocessing and polynomial expansion as `log_reg`, with the SGD
# model's penalty set to L2 explicitly.
# NOTE(review): per the scikit-learn docs 'l2' is already SGDClassifier's
# default penalty, so this pipeline is configured exactly like `log_reg`; the
# identical train/validation accuracies printed for both are consistent with
# that. An unregularised baseline would be needed for a real comparison -- confirm intent.
l2_steps = [
    ('preprocessing', preprocessor),
    ('polynomial_features', pf),
    ('model', SGDClassifier(random_state=42, penalty='l2')),
]
log_reg_l2 = Pipeline(steps=l2_steps)

# Fit on the training split; the fitted pipeline is the cell's displayed output.
log_reg_l2.fit(X_train, y_train)

In [33]:
# Predictions of the L2-penalised pipeline on the training and validation splits.
train_pred_l2 = log_reg_l2.predict(X_train)
val_pred_l2 = log_reg_l2.predict(X_val)

# Compute both accuracies first, then report them.
train_acc_l2 = accuracy_score(y_train, train_pred_l2)
val_acc_l2 = accuracy_score(y_val, val_pred_l2)

print(f'Train accuracy: {train_acc_l2}')
print(f'Validation accuracy: {val_acc_l2}')

Train accuracy: 0.4878
Validation accuracy: 0.4782
