# Training examples

Examples of training several classifiers, using only scikit-learn estimators and the project's custom helper functions.

In [7]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
#from sklearn.model_selection import train_test_split
from joblib import dump
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
# Make the project's helper modules importable from this notebook.
# The path is relative, so it resolves against the notebook's working directory.
import sys
src_path = "../src/"
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)
from training_sklearn import train_rf_grid_cv
from preprocessing import preprocess_train, preprocess_test

Read and preprocess data

In [3]:
# Load the training CSV; the relative path resolves against the notebook's
# working directory. Loading the test CSV is currently disabled.
df_train = pd.read_csv(filepath_or_buffer="../data/music_genre_train.csv")
#df_test = pd.read_csv("../data/music_genre_test.csv")

In [4]:
# Run the project's training-time preprocessing and unpack its result dict.
# NOTE(review): df_train is rebound to the preprocessed frame here, so running
# this cell a second time would feed already-preprocessed data back into
# preprocess_train. Re-run the read cell first if this cell must be repeated.
res_dict = preprocess_train(df_train)
df_train, utils_preprocessing = res_dict['dataframe'], res_dict['utils']

In [5]:
# Separate the target column from the remaining columns, which serve as features.
X = df_train.drop(columns='music_genre')
y = df_train['music_genre']

## Random Forest

Perform a grid search with cross-validation for a random forest classifier.

In [7]:
# Hyper-parameter grid for the random forest search:
# 1 x 1 x 3 x 2 x 2 = 12 candidate combinations.
param_grid_rf = dict(
    n_estimators=[100],
    criterion=['entropy'],
    max_depth=[5, 10, 15],
    min_samples_split=[5, 20],
    min_samples_leaf=[5, 20],
)

In [8]:
# Run the project's random-forest grid search helper over param_grid_rf.
# Expect a long run: the log below reports 48 fits, with the visible ones
# taking roughly 20 s each.
res_gs_cv_rf = train_rf_grid_cv(X, y, param_grid_rf)

Fitting 4 folds for each of 12 candidates, totalling 48 fits
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=  20.5s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=  22.4s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=  20.4s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=  19.3s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=20, n_estimators=100; total time=  19.1s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=20, n_estimators=100; total time=  21.6s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=20, n_estimators=100; total time=  20.8s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=20, n_estimators=100; to

In [9]:
# Display the result returned by train_rf_grid_cv.
res_gs_cv_rf

{'best_estimator': RandomForestClassifier(criterion='entropy', max_depth=10, min_samples_leaf=5,
                        min_samples_split=20),
 'best_accuracy': 0.5717767017023209,
 'best_params': {'criterion': 'entropy',
  'max_depth': 10,
  'min_samples_leaf': 5,
  'min_samples_split': 20,
  'n_estimators': 100}}

Get best model

In [10]:
# Pick the estimator stored under the 'best_estimator' key of the search result.
model_best = res_gs_cv_rf['best_estimator']

Save model

In [13]:
# Persist the best random forest so it can be reloaded without repeating the search.
rf_model_path = '../models/rf_0_572/random_forest.joblib'
# dump() writes straight to the given path and fails if the folder is missing,
# so create the target directory first (no-op when it already exists).
Path(rf_model_path).parent.mkdir(parents=True, exist_ok=True)
dump(model_best, rf_model_path)

['../models/rf_0_572/random_forest.joblib']

## SVM

Now let's try an SVM classifier.

In [8]:
# Base estimator for the search: a support vector classifier with an RBF kernel.
# No other arguments are passed, so every other setting stays at its default.
model_svc = SVC(kernel='rbf')

In [10]:
# Candidate values for the SVC parameter C.
# NOTE(review): these three values are nearly identical, so the search compares
# almost the same model three times. Consider a log-spaced range such as
# [0.1, 1, 10, 100] if a real sweep over C is intended.
param_grid_svm = dict(C=[1, 0.999, 0.99])

In [11]:
# 4-fold cross-validated grid search over param_grid_svm, scored by accuracy.
# verbose=2 makes the search print a "[CV] END ..." line for every fit.
svm_gs_cv = GridSearchCV(
    model_svc,
    param_grid_svm,
    scoring='accuracy',
    cv=4,
    verbose=2,
)

In [12]:
# Run the grid search on X and y. Per the log below, each fit takes about a
# minute, so the 12 fits make this a slow cell.
svm_gs_cv.fit(X, y)

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV] END ................................................C=1; total time= 1.3min
[CV] END ................................................C=1; total time= 1.2min
[CV] END ................................................C=1; total time= 1.1min
[CV] END ................................................C=1; total time= 1.1min
[CV] END ............................................C=0.999; total time= 1.1min
[CV] END ............................................C=0.999; total time= 1.1min
[CV] END ............................................C=0.999; total time= 1.3min
[CV] END ............................................C=0.999; total time= 1.2min
[CV] END .............................................C=0.99; total time= 1.1min
[CV] END .............................................C=0.99; total time= 1.1min
[CV] END .............................................C=0.99; total time= 1.1min
[CV] END ........................................

GridSearchCV(cv=4, estimator=SVC(), param_grid={'C': [1, 0.999, 0.99]},
             scoring='accuracy', verbose=2)

In [13]:
# Pull the winning estimator, its cross-validated score and its parameters
# out of the fitted search object.
model_svm, acc_svm, params_svm = (
    svm_gs_cv.best_estimator_,
    svm_gs_cv.best_score_,
    svm_gs_cv.best_params_,
)

In [18]:
# Report the best score found by the search and the parameters that achieved it.
for label, value in (("acc:", acc_svm), ("params:", params_svm)):
    print(label, value)

acc: 0.5751072824059867
params: {'C': 0.999}


In [19]:
# Persist the best SVM so it can be reloaded without repeating the search.
svm_model_path = '../models/svm_0_575/svm.joblib'
# dump() writes straight to the given path and fails if the folder is missing,
# so create the target directory first (no-op when it already exists).
Path(svm_model_path).parent.mkdir(parents=True, exist_ok=True)
dump(model_svm, svm_model_path)

['../models/svm_0_575/svm.joblib']