In [43]:
import pandas as pd
import numpy as np

SEED = 42

df = pd.read_csv('data/music_genre.csv')
df.drop('instance_id', inplace=True, axis=1)

df

Unnamed: 0,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


## Clean up a bit

In [44]:
#df['artist_name'] = df['artist_name'].replace('empty_field', np.nan)
df.drop('artist_name', inplace=True, axis=1)
df.drop('track_name', inplace=True, axis=1)
df.drop('obtained_date', inplace=True, axis=1)

df.dropna(inplace=True)

df['tempo'] = df['tempo'].replace('?', np.nan)
df['duration_ms'] = df['duration_ms'].replace(-1, np.nan)
df['tempo'] = df['tempo'].apply(lambda x: float(x))
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


## Divide a dataframe

In [45]:
from sklearn.model_selection import train_test_split

X = df.drop('music_genre', axis=1)
y = df['music_genre']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

## Pipelines and ColumnTransformers

In [4]:
num = list(X.select_dtypes(exclude=object).columns)
cat = list(X.select_dtypes(include=object).columns)

print(f"Numerical: {num}")
print(f"Categorical: {cat}")

Numerical: ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
Categorical: ['key', 'mode']


#### Pipeline for numerical columns

In [46]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

num_pl = Pipeline([
    ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=SEED)),
    ('scaler', StandardScaler())
])

#### ColumnTransformer for both

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pl, num),
    ('cat', OneHotEncoder(drop='if_binary'), cat)
])

NameError: name 'num' is not defined

## ML Models

In [47]:
from sklearn.metrics import accuracy_score, classification_report


def evaluate(model, xTest, yTest):
    model_y_pred = model.predict(X_test)

    print(classification_report(yTest, model_y_pred))
    print(f"Accuracy: {accuracy_score(yTest, model_y_pred)}")

#### Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

lr_pl = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(
        random_state=SEED
    ))
])

lr_pl.fit(X_train, y_train)

NameError: name 'preprocessor' is not defined

In [9]:
evaluate(lr_pl, X_test, y_test)

              precision    recall  f1-score   support

 Alternative       0.38      0.29      0.33      1008
       Anime       0.61      0.61      0.61      1034
       Blues       0.53      0.49      0.51      1021
   Classical       0.77      0.80      0.79       955
     Country       0.44      0.58      0.50       986
  Electronic       0.59      0.59      0.59      1009
     Hip-Hop       0.46      0.51      0.48       995
        Jazz       0.49      0.41      0.45       985
         Rap       0.48      0.38      0.42      1030
        Rock       0.51      0.64      0.57       977

    accuracy                           0.53     10000
   macro avg       0.53      0.53      0.52     10000
weighted avg       0.53      0.53      0.52     10000

Accuracy: 0.5283


#### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

rfc_pl = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(
        random_state=SEED
    ))
])

rfc_pl.fit(X_train, y_train)

In [11]:
evaluate(rfc_pl, X_test, y_test)

              precision    recall  f1-score   support

 Alternative       0.41      0.33      0.37      1008
       Anime       0.78      0.73      0.76      1034
       Blues       0.59      0.55      0.57      1021
   Classical       0.82      0.85      0.84       955
     Country       0.55      0.58      0.57       986
  Electronic       0.66      0.62      0.64      1009
     Hip-Hop       0.33      0.37      0.35       995
        Jazz       0.52      0.51      0.52       985
         Rap       0.31      0.28      0.29      1030
        Rock       0.46      0.59      0.52       977

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.54      0.54      0.54     10000

Accuracy: 0.5402


### SVM

In [12]:
from sklearn.svm import SVC

svc_pl = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(
        random_state=SEED
    ))
])

svc_pl.fit(X_train, y_train)

In [13]:
evaluate(svc_pl, X_test, y_test)

              precision    recall  f1-score   support

 Alternative       0.45      0.37      0.40      1008
       Anime       0.75      0.72      0.73      1034
       Blues       0.62      0.55      0.58      1021
   Classical       0.82      0.85      0.83       955
     Country       0.53      0.57      0.55       986
  Electronic       0.66      0.61      0.63      1009
     Hip-Hop       0.45      0.54      0.49       995
        Jazz       0.55      0.52      0.53       985
         Rap       0.47      0.36      0.41      1030
        Rock       0.51      0.72      0.60       977

    accuracy                           0.58     10000
   macro avg       0.58      0.58      0.58     10000
weighted avg       0.58      0.58      0.58     10000

Accuracy: 0.5781


## Leader board

1. **SVC** - 0.5781
2. **RandomForestClassifier** - 0.5402
3. **Logistic Regression** - 0.5283

### My implementation of Linear Regression closed form
![alt text](img.png "Regression")

In [48]:
from MyLinearRegression import MyLinearRegression

LR_TARGET = 'popularity'

X = df.drop(LR_TARGET, axis=1)
y = df[LR_TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

num = list(X.select_dtypes(exclude=object).columns)
cat = list(X.select_dtypes(include=object).columns)

print(num)
print(cat)

['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['key', 'mode', 'music_genre']


In [49]:
preprocessor_1 = ColumnTransformer([
    ('num', num_pl, num),
    ('cat', OneHotEncoder(drop='if_binary'), cat)
])

X_train_transformed = preprocessor_1.fit_transform(X_train)
X_test_transformed = preprocessor_1.fit_transform(X_test)

mlr = MyLinearRegression()

mlr.fit(X_train_transformed, y_train)

mlr.theta_

array([ 3.73125938e+01, -1.10249642e-01,  2.06988389e-01,  1.51059300e-01,
        2.32960100e-02, -4.94329112e-01, -7.33102892e-01,  7.60173369e-01,
       -5.23952880e-01, -7.90005305e-02,  3.65149042e-03,  2.74622675e+00,
        3.05954270e+00,  3.17849917e+00,  2.94108499e+00,  3.38346731e+00,
        3.15779129e+00,  3.31168210e+00,  3.08325081e+00,  2.87248437e+00,
        3.15301551e+00,  3.04366486e+00,  3.38188394e+00,  2.88241115e-01,
        9.32418786e+00, -1.62757900e+01, -5.68244348e+00, -9.06244973e+00,
        4.86288637e+00, -2.32491605e+00,  1.77203172e+01,  6.76151876e-01,
        1.94805916e+01,  1.85940582e+01])

In [50]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = mlr.predict(X_test_transformed)

print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")

MSE: 91.65652857889278
R2: 0.6257656882193819


### Let's compare it to built in regression

In [51]:
from sklearn.linear_model import LinearRegression

linreg_pl = Pipeline([
    ('preprocessing', preprocessor_1),
    ('model', LinearRegression())
])

linreg_pl.fit(X_train, y_train)

In [52]:
y_pred = linreg_pl.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

MSE: 91.66281355527504
R²: 0.6257400266124299


## No measurable difference

### Cons of closed form regression:
- For huge data that don't fit in memory at once it is useless
- **X<sup>T</sup>X** can be non-invertible (formula no longer works)
- Not flexible