In [1]:
import pandas as pd
import numpy as np
from utils import music_genre_clean_up

# Single seed passed as `random_state` to the splits and the imputer further down,
# so a full re-run reproduces the same numbers.
SEED = 42

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/music_genre.csv')
# NOTE(review): the return value is discarded, so this helper presumably cleans
# `df` in place — confirm against utils.music_genre_clean_up.
music_genre_clean_up(df)

# Bare last expression: the frame is rendered as the cell's output.
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


## My implementation of closed-form Linear Regression
![alt text](img.png "Regression")

In [2]:
from sklearn.model_selection import train_test_split

# Regression target; every other column of `df` is used as a feature.
LR_TARGET = 'popularity'

y = df[LR_TARGET]
X = df.drop(columns=LR_TARGET)

# 80/20 hold-out split, seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.2
)

# Partition the feature names by dtype: object columns are treated as
# categorical, everything else as numeric.
cat = X.select_dtypes(include=object).columns.tolist()
num = X.select_dtypes(exclude=object).columns.tolist()

print(num)
print(cat)

['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['key', 'mode', 'music_genre']


In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Numeric branch of the preprocessing: fill missing values with a seeded
# IterativeImputer driven by BayesianRidge, then standardise the columns.
num_pl = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=SEED, estimator=BayesianRidge())),
    ('scaler', StandardScaler()),
])

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature preprocessing for the regression task: numeric columns go through
# `num_pl`, object columns are one-hot encoded (`drop='if_binary'` keeps a
# single indicator column for two-valued features).
linear_preprocessor = ColumnTransformer(transformers=[
    ('num', num_pl, num),
    ('cat', OneHotEncoder(drop='if_binary'), cat),
])

In [5]:
from MyLinearRegression import MyLinearRegression

# Hand-written closed-form regressor behind the shared regression preprocessing.
my_lr = Pipeline(steps=[
    ('preprocessing', linear_preprocessor),
    ('model', MyLinearRegression()),
])

# Fit on the training split; as the last expression, the call's return value
# is what the cell displays.
my_lr.fit(X_train, y_train)

In [6]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict popularity for the held-out split with the hand-written model.
my_y_pred = my_lr.predict(X_test)

# Report test-set error (MSE) and explained variance (R2) for comparison with
# the scikit-learn baseline fitted later in the notebook.
print(f"MSE: {mean_squared_error(y_test, my_y_pred)}")
print(f"R2: {r2_score(y_test, my_y_pred)}")

MSE: 91.66281355527504
R2: 0.6257400266124299


## Let's compare it to the built-in regression

In [7]:
from sklearn.linear_model import LinearRegression

from sklearn.base import clone

# Reference model: scikit-learn's LinearRegression behind the same preprocessing
# recipe as `my_lr`.
#
# The preprocessing is cloned rather than passed directly. A Pipeline without
# caching fits the step objects it is handed, so reusing `linear_preprocessor`
# here would make this fit overwrite the fitted state of the very transformer
# that `my_lr` predicts with. `clone` yields an unfitted copy with identical
# parameters, keeping the two pipelines independent.
build_in_lr = Pipeline([
    ('preprocessing', clone(linear_preprocessor)),
    ('model', LinearRegression())
])

# Fit on the training split; as the last expression, the call's return value
# is what the cell displays.
build_in_lr.fit(X_train, y_train)

In [8]:
# Predict popularity for the held-out split with the scikit-learn baseline.
build_in_y_pred = build_in_lr.predict(X_test)

# Same two metrics as for the hand-written model, so the outputs can be
# compared line by line.
print(f"MSE: {mean_squared_error(y_test, build_in_y_pred)}")
print(f"R2: {r2_score(y_test, build_in_y_pred)}")

MSE: 91.66281355527504
R2: 0.6257400266124299


## Results

| Model          |        MSE        |      R<sup>2</sup> |
|:---------------|:-----------------:|-------------------:|
| My model       | 91.66281355527504 |  0.6257400266124299 |
| Built-in model | 91.66281355527504 | 0.6257400266124299 |

### Cons of closed form regression:
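- It needs the whole design matrix at once, so it is impractical for datasets that do not fit in memory
- **X<sup>T</sup>X** can be non-invertible (e.g. with perfectly collinear features), in which case the formula no longer works
- Not flexible: the closed form exists only for this specific least-squares objective, so a different loss (e.g. the cross-entropy used below) needs an iterative solver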

## Logistic Regression using softmax and gradient descent

In [9]:
# Classification setup: predict `music_genre` from every remaining column.
y_mg = df['music_genre']
X_mg = df.drop(columns='music_genre')

# 80/20 hold-out split, seeded so the partition is reproducible.
X_train_mg, X_test_mg, y_train_mg, y_test_mg = train_test_split(
    X_mg, y_mg, random_state=SEED, test_size=0.2
)

# NOTE: this rebinds the notebook-level `num` / `cat` names that the regression
# section defined earlier; re-running the regression preprocessing cell after
# this one would pick up these lists instead.
cat = X_mg.select_dtypes(include=object).columns.tolist()
num = X_mg.select_dtypes(exclude=object).columns.tolist()

print(f"Numerical: {num}")
print(f"Categorical: {cat}")

Numerical: ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
Categorical: ['key', 'mode']


In [10]:
# Feature preprocessing for the genre classifier: numeric columns go through
# `num_pl`, object columns are one-hot encoded (`drop='if_binary'` keeps a
# single indicator column for two-valued features).
softmax_preprocessor = ColumnTransformer(transformers=[
    ('num', num_pl, num),
    ('cat', OneHotEncoder(drop='if_binary'), cat),
])

### My implementation

In [11]:
from SoftmaxRegression import SoftmaxRegression

# Hand-written softmax classifier behind the classification preprocessing.
my_softmax = Pipeline(steps=[
    ('preprocessing', softmax_preprocessor),
    ('model', SoftmaxRegression()),
])

# Fit on the training split; as the last expression, the call's return value
# is what the cell displays.
my_softmax.fit(X_train_mg, y_train_mg)

In [12]:
from utils import evaluate

# Score the hand-written classifier on the held-out split.
# NOTE(review): `evaluate` lives in utils; judging by the cell output it prints a
# per-class report plus accuracy and presumably returns the predictions — confirm.
my_softmax_pred = evaluate(my_softmax, X_test_mg, y_test_mg)

              precision    recall  f1-score   support

 Alternative       0.38      0.30      0.33      1008
       Anime       0.61      0.62      0.62      1034
       Blues       0.54      0.48      0.50      1021
   Classical       0.77      0.80      0.78       955
     Country       0.44      0.59      0.50       986
  Electronic       0.59      0.59      0.59      1009
     Hip-Hop       0.46      0.48      0.47       995
        Jazz       0.48      0.42      0.45       985
         Rap       0.47      0.40      0.44      1030
        Rock       0.52      0.63      0.57       977

    accuracy                           0.53     10000
   macro avg       0.53      0.53      0.53     10000
weighted avg       0.53      0.53      0.52     10000

Accuracy: 0.5285


### Built-in implementation

In [13]:
from sklearn.linear_model import LogisticRegression

from sklearn.base import clone

# Reference classifier: scikit-learn's LogisticRegression behind the same
# preprocessing recipe as `my_softmax`.
#
# The preprocessing is cloned rather than passed directly. A Pipeline without
# caching fits the step objects it is handed, so reusing `softmax_preprocessor`
# here would make this fit overwrite the fitted state of the very transformer
# that `my_softmax` predicts with. `clone` yields an unfitted copy with
# identical parameters, keeping the two pipelines independent.
build_in_softmax = Pipeline([
    ('preprocessing', clone(softmax_preprocessor)),
    ('model', LogisticRegression())
])

# Fit on the training split; as the last expression, the call's return value
# is what the cell displays.
build_in_softmax.fit(X_train_mg, y_train_mg)

In [14]:
# Score the scikit-learn baseline on the same held-out split, using the same
# `evaluate` helper as for the hand-written classifier so the reports are comparable.
build_in_softmax_pred = evaluate(build_in_softmax, X_test_mg, y_test_mg)

              precision    recall  f1-score   support

 Alternative       0.38      0.29      0.33      1008
       Anime       0.61      0.61      0.61      1034
       Blues       0.53      0.49      0.51      1021
   Classical       0.77      0.80      0.79       955
     Country       0.44      0.58      0.50       986
  Electronic       0.59      0.59      0.59      1009
     Hip-Hop       0.46      0.51      0.48       995
        Jazz       0.49      0.41      0.45       985
         Rap       0.48      0.38      0.42      1030
        Rock       0.51      0.64      0.57       977

    accuracy                           0.53     10000
   macro avg       0.53      0.53      0.52     10000
weighted avg       0.53      0.53      0.52     10000

Accuracy: 0.5283


### Practically the same results; my implementation's accuracy is even marginally higher (by 0.0002)

## Cross entropy loss function

In [15]:
from sklearn.preprocessing import LabelEncoder

# Class-probability predictions on the held-out split from both fitted pipelines.
# NOTE(review): the loss code below assumes both arrays are (n_samples, n_classes)
# with columns in sorted-label order — confirm for SoftmaxRegression.
my_prob = my_softmax.predict_proba(X_test_mg)
build_in_prob = build_in_softmax.predict_proba(X_test_mg)

def encode(y, classes=None):
    """One-hot encode a 1-D collection of class labels.

    Parameters
    ----------
    y : array-like
        Class labels, one per sample.
    classes : array-like, optional
        Full list of class labels defining the column order of the result.
        When omitted, the sorted unique values found in ``y`` are used.
        Pass the model's class list explicitly when ``y`` might not contain
        every class, so the result keeps one column per model class and stays
        aligned with the predicted-probability matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), n_classes)`` with a single 1.0 per row.

    Raises
    ------
    ValueError
        If ``classes`` is given and ``y`` contains a label missing from it.
    """
    labels = np.asarray(y).ravel()

    if classes is None:
        # One pass yields both the sorted class list and each label's index in it.
        classes, y_int = np.unique(labels, return_inverse=True)
    else:
        classes = np.asarray(classes).ravel()
        position = {label: idx for idx, label in enumerate(classes.tolist())}
        try:
            y_int = np.array(
                [position[label] for label in labels.tolist()], dtype=int
            )
        except KeyError as exc:
            raise ValueError(
                f"y contains a label that is not in classes: {exc.args[0]!r}"
            ) from exc

    # Row i of the identity matrix is the one-hot vector for class index i.
    y_one_hot = np.eye(len(classes))[y_int]

    return y_one_hot

def cross_entropy_loss(y_true, y_prob, epsilon=1e-6):
    """Mean categorical cross-entropy between one-hot targets and probabilities.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_classes)
        One-hot encoded true labels.
    y_prob : array-like of shape (n_samples, n_classes)
        Predicted class probabilities, columns in the same order as ``y_true``.
    epsilon : float, default=1e-6
        Probabilities are clipped to ``[epsilon, 1 - epsilon]`` before taking
        the log so that a predicted 0 does not produce ``-inf``. The clip caps
        the per-sample loss at ``-log(epsilon)``; use a smaller value for less
        bias on confidently wrong predictions.

    Returns
    -------
    float
        Negative log-likelihood of the true classes, averaged over samples.

    Raises
    ------
    ValueError
        If the inputs are not 2-D arrays of identical shape, or if ``epsilon``
        is not in the open interval (0, 0.5).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    # Without this check mismatched inputs could broadcast silently and return
    # a meaningless number instead of failing.
    if y_true.ndim != 2 or y_true.shape != y_prob.shape:
        raise ValueError(
            "y_true and y_prob must be 2-D arrays of the same shape, "
            f"got {y_true.shape} and {y_prob.shape}"
        )
    if not 0 < epsilon < 0.5:
        raise ValueError(f"epsilon must be in (0, 0.5), got {epsilon}")

    y_prob = np.clip(y_prob, epsilon, 1 - epsilon)
    # sum over classes, then mean over samples:
    return -np.mean(np.sum(y_true * np.log(y_prob), axis=1))


In [16]:
# Hand-written cross-entropy on the test split for both classifiers; the true
# labels are one-hot encoded on the fly so they line up with the probability columns.
print(f'My Loss of my model: {cross_entropy_loss(encode(y_test_mg), my_prob)}')
print(f'My Loss of Build in: {cross_entropy_loss(encode(y_test_mg), build_in_prob)}')

My Loss of my model: 1.261123220400825
My Loss of Build in: 1.2610388123726235


In [17]:
from sklearn.metrics import log_loss

# Cross-check with scikit-learn's log_loss on the same one-hot targets and probabilities.
# NOTE(review): these values differ from cross_entropy_loss in the 4th decimal,
# presumably because log_loss clips probabilities differently — confirm.
print(f'Build in Loss of my model: {log_loss(encode(y_test_mg), my_prob)}')
print(f'Build in Loss of Build in: {log_loss(encode(y_test_mg), build_in_prob)}')

Build in Loss of my model: 1.2614683474180308
Build in Loss of Build in: 1.2613809321758567


## The results are practically the same