In [16]:
import pandas as pd
import numpy as np

SEED = 42

df = pd.read_csv('../data/music_genre.csv')
df.drop('instance_id', inplace=True, axis=1)

df

Unnamed: 0,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


## Clean up a bit

In [2]:
#df['artist_name'] = df['artist_name'].replace('empty_field', np.nan)
df.drop('artist_name', inplace=True, axis=1)
df.drop('track_name', inplace=True, axis=1)
df.drop('obtained_date', inplace=True, axis=1)

df.dropna(inplace=True)

df['tempo'] = df['tempo'].replace('?', np.nan)
df['duration_ms'] = df['duration_ms'].replace(-1, np.nan)
df['tempo'] = df['tempo'].apply(lambda x: float(x))
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


## Divide a dataframe

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop('music_genre', axis=1)
y = df['music_genre']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=SEED
)

## Pipelines and ColumnTransformers

In [4]:
num = list(X.select_dtypes(exclude=object).columns)
cat = list(X.select_dtypes(include=object).columns)

print(f"Numerical: {num}")
print(f"Categorical: {cat}")

Numerical: ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
Categorical: ['key', 'mode']


#### Pipeline for numerical columns

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

num_pl = Pipeline([
    ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=SEED)),
    ('scaler', StandardScaler())
])

#### ColumnTransformer for both

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pl, num),
    ('cat', OneHotEncoder(drop='if_binary'), cat)
])

## ML Models

#### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from utils import evaluate

lr_pl = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(
        random_state=SEED
    ))
])

lr_pl.fit(X_train, y_train)

In [8]:
print('LogisticRegression validation set\n')
evaluate(lr_pl, X_val, y_val)

LogisticRegression validation set

              precision    recall  f1-score   support

 Alternative       0.37      0.28      0.32       734
       Anime       0.61      0.63      0.62       757
       Blues       0.54      0.48      0.51       787
   Classical       0.78      0.79      0.79       743
     Country       0.46      0.59      0.52       754
  Electronic       0.59      0.57      0.58       770
     Hip-Hop       0.47      0.49      0.48       776
        Jazz       0.48      0.43      0.46       746
         Rap       0.45      0.36      0.40       710
        Rock       0.50      0.65      0.57       723

    accuracy                           0.53      7500
   macro avg       0.52      0.53      0.52      7500
weighted avg       0.53      0.53      0.52      7500

Accuracy: 0.5289333333333334


array(['Hip-Hop', 'Electronic', 'Anime', ..., 'Rap', 'Rock', 'Country'],
      shape=(7500,), dtype=object)

In [9]:
print('LogisticRegression test set\n')
evaluate(lr_pl, X_test, y_test)

LogisticRegression test set

              precision    recall  f1-score   support

 Alternative       0.41      0.34      0.37       764
       Anime       0.62      0.62      0.62       772
       Blues       0.52      0.46      0.49       758
   Classical       0.78      0.82      0.80       736
     Country       0.44      0.59      0.50       727
  Electronic       0.59      0.59      0.59       765
     Hip-Hop       0.45      0.50      0.47       716
        Jazz       0.49      0.42      0.45       737
         Rap       0.47      0.37      0.41       782
        Rock       0.52      0.63      0.57       743

    accuracy                           0.53      7500
   macro avg       0.53      0.53      0.53      7500
weighted avg       0.53      0.53      0.53      7500

Accuracy: 0.5302666666666667


array(['Rap', 'Alternative', 'Electronic', ..., 'Electronic',
       'Electronic', 'Jazz'], shape=(7500,), dtype=object)

#### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

rfc_pl = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(
        random_state=SEED
    ))
])

rfc_pl.fit(X_train, y_train)

In [11]:
print('RandomForest validation set\n')
evaluate(rfc_pl, X_val, y_val)

RandomForest validation set

              precision    recall  f1-score   support

 Alternative       0.40      0.34      0.37       734
       Anime       0.77      0.75      0.76       757
       Blues       0.60      0.54      0.57       787
   Classical       0.84      0.84      0.84       743
     Country       0.58      0.58      0.58       754
  Electronic       0.68      0.63      0.66       770
     Hip-Hop       0.39      0.39      0.39       776
        Jazz       0.53      0.54      0.54       746
         Rap       0.32      0.33      0.32       710
        Rock       0.47      0.62      0.54       723

    accuracy                           0.56      7500
   macro avg       0.56      0.56      0.56      7500
weighted avg       0.56      0.56      0.56      7500

Accuracy: 0.5572


array(['Hip-Hop', 'Country', 'Anime', ..., 'Rap', 'Rock', 'Alternative'],
      shape=(7500,), dtype=object)

In [12]:
print('RandomForest test set\n')
evaluate(rfc_pl, X_test, y_test)

RandomForest test set

              precision    recall  f1-score   support

 Alternative       0.43      0.36      0.39       764
       Anime       0.78      0.75      0.76       772
       Blues       0.59      0.53      0.56       758
   Classical       0.83      0.87      0.85       736
     Country       0.55      0.57      0.56       727
  Electronic       0.64      0.59      0.62       765
     Hip-Hop       0.34      0.38      0.36       716
        Jazz       0.53      0.52      0.52       737
         Rap       0.34      0.32      0.33       782
        Rock       0.47      0.59      0.52       743

    accuracy                           0.55      7500
   macro avg       0.55      0.55      0.55      7500
weighted avg       0.55      0.55      0.55      7500

Accuracy: 0.5474666666666667


array(['Rap', 'Rock', 'Anime', ..., 'Alternative', 'Electronic',
       'Classical'], shape=(7500,), dtype=object)

### SVM

In [13]:
from sklearn.svm import SVC

svc_pl = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(
        random_state=SEED
    ))
])

svc_pl.fit(X_train, y_train)

In [14]:
print('SVC validation set\n')
evaluate(svc_pl, X_val, y_val)

SVC validation set

              precision    recall  f1-score   support

 Alternative       0.43      0.35      0.39       734
       Anime       0.72      0.73      0.73       757
       Blues       0.63      0.55      0.58       787
   Classical       0.84      0.85      0.85       743
     Country       0.55      0.56      0.55       754
  Electronic       0.65      0.61      0.63       770
     Hip-Hop       0.47      0.53      0.50       776
        Jazz       0.55      0.52      0.54       746
         Rap       0.44      0.36      0.39       710
        Rock       0.50      0.72      0.59       723

    accuracy                           0.58      7500
   macro avg       0.58      0.58      0.57      7500
weighted avg       0.58      0.58      0.58      7500

Accuracy: 0.5789333333333333


array(['Hip-Hop', 'Hip-Hop', 'Anime', ..., 'Hip-Hop', 'Rock',
       'Alternative'], shape=(7500,), dtype=object)

In [15]:
print('SVC test set\n')
evaluate(svc_pl, X_test, y_test)

SVC test set

              precision    recall  f1-score   support

 Alternative       0.45      0.39      0.42       764
       Anime       0.76      0.72      0.74       772
       Blues       0.61      0.53      0.57       758
   Classical       0.82      0.86      0.84       736
     Country       0.52      0.57      0.55       727
  Electronic       0.67      0.60      0.63       765
     Hip-Hop       0.44      0.52      0.47       716
        Jazz       0.55      0.50      0.53       737
         Rap       0.48      0.39      0.43       782
        Rock       0.51      0.69      0.59       743

    accuracy                           0.58      7500
   macro avg       0.58      0.58      0.57      7500
weighted avg       0.58      0.58      0.57      7500

Accuracy: 0.5762666666666667


array(['Rap', 'Rock', 'Anime', ..., 'Electronic', 'Electronic',
       'Classical'], shape=(7500,), dtype=object)

## Leader board

| Model               |   Val Accuracy    |   Test Accuracy    |
|:--------------------|:-----------------:|:------------------:|
| Logistic Regression | 0.5289333333333334 | 0.5302666666666667 |
| Random Forest       | 0.5572 | 0.5474666666666667 |
| SVC                 | 0.5789333333333333 | 0.5762666666666667 |

1. **SVC** - 0.5762
2. **RandomForestClassifier** - 0.5474
3. **Logistic Regression** - 0.5302