[dataset](https://www.kaggle.com/rashikrahmanpritom/177k-english-song-data-from-20082017)

In [70]:
import pandas as pd

data = pd.read_csv('data/songs.csv')
data.head()

Unnamed: 0,track_id,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
0,135,256000,1,,2008-11-26 01:43:26,2008-11-26 00:00:00,837,0,Rock,"[45, 58]",...,,2484,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1832,,0,,[],Father's Day
1,136,256000,1,,2008-11-26 01:43:35,2008-11-26 00:00:00,509,0,Rock,"[45, 58]",...,,1948,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1498,,0,,[],Peel Back The Mountain Sky
2,151,192000,0,,2008-11-26 01:44:55,,192,0,Rock,[25],...,,701,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,148,,4,,[],Untitled 04
3,152,192000,0,,2008-11-26 01:44:58,,193,0,Rock,[25],...,,637,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,98,,11,,[],Untitled 11
4,153,256000,0,Arc and Sender,2008-11-26 01:45:00,2008-11-26 00:00:00,405,5,Rock,[26],...,,354,en,Attribution-NonCommercial-NoDerivatives (aka M...,424,,2,,[],Hundred-Year Flood


task: binary classification between rock and hip-pop songs

In [73]:
data.isna().sum()

track_id             0
bit_rate             0
comments             0
composer         17568
date_created         0
date_recorded    15836
duration             0
favorites            0
genre_top            0
genres               0
genres_all           0
information      17252
interest             0
language_code    13645
license             20
listens              0
lyricist         17681
number               0
publisher        17682
tags                 0
title                0
dtype: int64

feature report:
- numerical:
    * track_id: 0
    * bit_rate: mean
    * comments: 0
    * duration: mean
    * favorites: 0
    * interest: 0
    * listens: 0
- categorical:
    * composer: 'missing'
    * date_created: 'missing'
    * date_recorded: 'missing'
    * genre_top: TO CLASSIFY
    * information: 'missing'
    * language_code: 'missing'
    * license: 'missing'
    * lyricist: 'missing'
    * publisher: 'missing'
    * tags: 'missing'
    * title: 'missing'

In [22]:
data.keys()
https://www.kaggle.com/rashikrahmanpritom/177k-english-song-data-from-20082017

Index(['track_id', 'bit_rate', 'comments', 'composer', 'date_created',
       'date_recorded', 'duration', 'favorites', 'genre_top', 'genres',
       'genres_all', 'information', 'interest', 'language_code', 'license',
       'listens', 'lyricist', 'number', 'publisher', 'tags', 'title'],
      dtype='object')

In [71]:
# clean data importing
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# classification importing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# if this data is missing its value will be zero
zero_value_features = ['track_id', 'comments', 'favorites', 'interest', 'listens']
zero_value_transformer = Pipeline(steps = [
    ('zero_imputer', SimpleImputer(strategy = 'constant', fill_value = 0))
])

# if this data is missing its value will be the mean
mean_value_features = ['bit_rate', 'duration']
mean_value_transformer = Pipeline(steps = [
    ('mean_imputer', SimpleImputer(strategy = 'mean'))
])

# if this data is missing their value will be 'missing'
missing_value_features =  ['composer', 'date_created', 'date_recorded',
                           'information', 'language_code', 'license', 'lyricist', 'publisher', 'tags', 'title']
missing_value_transformer = Pipeline(steps = [
    ('missing_imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])
preprocessor = ColumnTransformer(
    transformers = [
    # ('numerical', numerical_transformer, num_features)
    ('zero_value', zero_value_transformer, zero_value_features),
    ('mean_value', mean_value_transformer, mean_value_features),
    ('missing_value', missing_value_transformer ,missing_value_features)
], remainder = 'passthrough', verbose = 2)

model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

# split data 
# genre_top is dropped because is the column to predict,
# the others because are in a very strange format
X = data.drop(['genre_top', 'number', 'genres_all', 'genres'],axis = 1)
y = data.genre_top
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .15)

model.fit(X_train, y_train);

[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s


In [81]:
model.score(X_test, y_test) * 100

91.99549041713641

In [79]:
from sklearn.model_selection import GridSearchCV

# using the grid with the pipeline
pipe_grid = {
    'preprocessor__mean_value__mean_imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 200],
    'model__min_samples_split': [2, 4],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['auto', 'sqrt']
}

gs_model = GridSearchCV(model, pipe_grid, cv = 5, verbose = 2)
gs_model.fit(X_train, y_train);

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.3s
[CV] END model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  22.2s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  25.1s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  16.4s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  15.8s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocess

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=   1.6s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=   1.7s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocess

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.5s
[CV] END model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=   3.1s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=   2.4s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.3s
[CV] END model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocess

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  22.0s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  26.4s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.3s
[CV] END model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocess

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.4s
[CV] END model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=mean; total time=  29.6s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.3s
[CV] END model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=median; total time=  33.0s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.4s
[CV] END model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preproce

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=median; total time=   2.3s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.3s
[CV] END model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=median; total time=   2.4s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, prepro

[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=median; total time=   1.6s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessor__mean_value__mean_imputer__strategy=median; total time=   1.6s
[ColumnTransformer] .... (1 of 3) Processing zero_value, total=   0.0s
[ColumnTransformer] .... (2 of 3) Processing mean_value, total=   0.0s
[ColumnTransformer] . (3 of 3) Processing missing_value, total=   0.2s
[CV] END model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, prepro

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('zero_value',
                                                                         Pipeline(steps=[('zero_imputer',
                                                                                          SimpleImputer(fill_value=0,
                                                                                                        strategy='constant'))]),
                                                                         ['track_id',
                                                                          'comments',
                                                                          'favorites',
                                                                          'interest',
                                                         

In [82]:
gs_model.score(X_test, y_test) * 100

91.8075911311537