In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from pyod.models.knn import KNN 

from sklearn.feature_selection import RFECV, SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import(
    RandomForestClassifier,
    HistGradientBoostingClassifier
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from scripts.wrapped import Wrapped

from scripts.train import(
    cross_validate_balancead,
    permutation,
    combination
)

In [2]:
wp = Wrapped(
    '../data/row/',
    '../data/processed/',
    '../data/files/'
)

# Importando dados para treinamento

In [4]:
df = wp.load_data('df_instrumentos_features_selecionadas')
df

Unnamed: 0,tonnetz0,tonnetz1,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,mfcc0,...,spectral_band4,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name
0,0.010380,0.038218,0.513887,0.581963,0.434641,0.293949,0.317826,0.271148,0.548258,-24.170067,...,3409.640722,2600.055158,1.098243,5260.821321,-44.640682,-30.202150,0.149672,33095,accordion,000139_119040.ogg
1,0.084643,-0.035337,0.175587,0.429345,0.840043,0.384524,0.282545,0.346341,0.482365,-90.556145,...,3516.595715,2385.271420,1.360742,5225.124166,-46.575737,-59.323132,0.112181,24785,accordion,000145_172800.ogg
2,-0.030127,0.082404,0.453814,0.219808,0.228484,0.446050,0.178916,0.250981,0.327460,-106.124260,...,3256.491175,2069.850731,1.343197,4287.155824,-56.276707,-71.512909,0.096007,21230,accordion,000201_168960.ogg
3,0.157274,-0.066099,0.209729,0.338755,0.660264,0.256625,0.275209,0.180417,0.399448,-66.319801,...,3855.791675,2903.979876,1.218963,6751.133809,-42.723160,-44.649315,0.144010,31836,accordion,000212_211200.ogg
4,-0.049096,-0.029067,0.484400,0.334572,0.441629,0.679485,0.382958,0.326033,0.205775,-226.263824,...,2883.225952,1251.374287,1.339527,2263.034730,-62.227947,-78.196365,0.059804,13214,accordion,000640_49920.ogg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8471,0.070358,0.003048,0.332754,0.359639,0.473210,0.512169,0.607755,0.500391,0.378782,-87.742729,...,3664.812127,3194.922838,1.253166,6714.437434,-53.587788,-76.614616,0.134979,29855,voice,153341_184320.ogg
8472,0.006115,-0.030219,0.161793,0.190661,0.249882,0.378161,0.461520,0.332309,0.294171,-179.792145,...,2879.597305,1351.326334,1.450944,2412.643029,-52.071827,-76.029976,0.061211,13533,voice,153412_134400.ogg
8473,-0.008339,0.005653,0.589552,0.309548,0.187583,0.169132,0.240637,0.332375,0.333517,-153.614334,...,3274.875402,2685.831528,1.274003,4614.225695,-66.120743,-80.000000,0.157649,34869,voice,153478_30720.ogg
8474,0.108185,0.025366,0.615495,0.541314,0.467334,0.611199,0.791296,0.678383,0.508162,-80.250626,...,3376.494191,1428.726581,1.081204,3358.205447,-30.260586,-13.068514,0.032283,7136,voice,153986_203520.ogg


- Definindo as variaveis de treino e o target

In [5]:
le = LabelEncoder()
df['labels'] = le.fit_transform(df.instrumento)

In [6]:
X = df.drop(columns=['labels', 'instrumento','file_name'])
y = df['labels'].to_frame()

# TESTE 1: Balanceamento

- 1° Opção: Balanceamento das classes com SMOTH

In [8]:
t1_models = np.array([
    GaussianNB(),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    MLPClassifier(),
    SVC(),
])

t1_acuracy_models = [cross_validate_balancead(k=5, model=model, X=X, y=y, oversampling=True) for model in t1_models]

dict_results_t1 = {
    'Naive Bayes': t1_acuracy_models[0],
    'KNN': t1_acuracy_models[1],
    'Arvore de Decisão': t1_acuracy_models[2],
    'Floresta Aleatoria': t1_acuracy_models[3],
    'HistGradientBoosting': t1_acuracy_models[4],
    'LIGHTGBM': t1_acuracy_models[5],
    'MLP': t1_acuracy_models[6],
    'SVC': t1_acuracy_models[7],
}

dict_results_t1

{'Naive Bayes': 0.2533052262481216,
 'KNN': 0.12093031669171257,
 'Arvore de Decisão': 0.16328672009795736,
 'Floresta Aleatoria': 0.3128842461178828,
 'HistGradientBoosting': 0.33294094729225804,
 'LIGHTGBM': 0.3363619413368954,
 'MLP': 0.14724056603773586,
 'SVC': 0.1593918016363333}

- 2° Opção: Class Weight

In [43]:
models_class_weight = np.array([
    DecisionTreeClassifier(class_weight='balanced'), 
    RandomForestClassifier(class_weight='balanced', random_state = 0, n_jobs = -1), 
    LGBMClassifier(class_weight='balanced'),
])

t1_acuracy_models_class_weight = [cross_validate_balancead(k=5, model=model, X=X, y=y, weight=True) for model in models_class_weight]

dict_results_t1_class_weight = {
    'Arvore de Decisão': t1_acuracy_models_class_weight[0],
    'Floresta Aleatoria': t1_acuracy_models_class_weight[1],
    'LIGHTGBM': t1_acuracy_models_class_weight[2],
}

dict_results_t1_class_weight

Acuracia do modelo DecisionTreeClassifier(class_weight='balanced') do Fold 0: 0.16037735849056603
Acuracia do modelo DecisionTreeClassifier(class_weight='balanced') do Fold 1: 0.16342182890855458
Acuracia do modelo DecisionTreeClassifier(class_weight='balanced') do Fold 2: 0.15457227138643068
Acuracia do modelo DecisionTreeClassifier(class_weight='balanced') do Fold 3: 0.18289085545722714
Acuracia do modelo DecisionTreeClassifier(class_weight='balanced') do Fold 4: 0.1752212389380531
Acuracia do modelo RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0) do Fold 0: 0.3107311320754717
Acuracia do modelo RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0) do Fold 1: 0.31799410029498526
Acuracia do modelo RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0) do Fold 2: 0.328023598820059
Acuracia do modelo RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0) do Fold 3: 0.31976401179941005
Acuracia do mo

{'Arvore de Decisão': 0.1672967106361663,
 'Floresta Aleatoria': 0.32291318806701175,
 'LIGHTGBM': 0.3402549117827127}

# TESTE 2: Remover Outlier

- Treinar modelo de classificação de outilier

In [10]:
detector_outilier = KNN()

new_df = df.drop(columns=['instrumento','file_name'])

detector_outilier.fit(new_df)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

- Checar numeros de outilers e não outilers<br/>
**0 (False) Não é outiler**<br/>
**1 (True) É outiler**

In [11]:
previsions = detector_outilier.labels_
np.unique(previsions, return_counts=True)

(array([0, 1]), array([7628,  848]))

- Distancia euclidiana dos registros

In [12]:
confiance_previsions = detector_outilier.decision_scores_
confiance_previsions

array([469.2569029 , 263.78344097, 234.86926869, ..., 547.08483006,
       501.47903332, 247.95356846])

- Checar os outilers

In [13]:
# pegando indice dos registros que são outilers
outilers_id =[previsions[i] == 1 for i in range(previsions.shape[0])]
outilers = df.iloc[outilers_id,:]
outilers

Unnamed: 0,tonnetz0,tonnetz1,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,mfcc0,...,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name,labels
28,0.177880,0.099179,0.446227,0.186154,0.310970,0.289374,0.480033,0.271526,0.262448,-113.129501,...,3178.625677,1.504104,5658.636007,-64.840324,-79.991013,0.225914,49936,accordion,011547_157440.ogg,0
46,0.001389,0.058207,0.307783,0.359920,0.321503,0.340189,0.230577,0.230887,0.267941,-335.472443,...,3317.306522,1.480623,6072.413242,-37.332355,-79.453758,0.249911,55267,accordion,019554_199680.ogg,0
78,0.092169,0.133666,0.193358,0.281352,0.774958,0.293572,0.316613,0.302946,0.372028,-121.544807,...,2105.938025,1.162251,3689.097337,-49.523079,-69.013954,0.146228,32336,accordion,028377_42240.ogg,0
94,0.312066,0.246917,0.430660,0.239950,0.289489,0.156493,0.338560,0.262607,0.266463,-181.171143,...,1835.825453,0.991421,3336.597303,-63.885098,-79.566399,0.125387,27748,accordion,032445_253440.ogg,0
101,-0.083369,0.002492,0.365614,0.188457,0.210712,0.340728,0.218881,0.389993,0.138017,-118.916946,...,2495.492458,1.521361,4334.344015,-64.275818,-79.999847,0.203175,44935,accordion,035590_49920.ogg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8408,0.011938,0.006710,0.223368,0.229950,0.303748,0.293760,0.263125,0.406126,0.506930,-5.747187,...,3037.057890,1.260896,7336.652079,-53.212646,-69.924911,0.133118,29465,voice,128693_729600.ogg,19
8435,0.074432,-0.104148,0.265711,0.153765,0.160796,0.181671,0.145868,0.225688,0.512277,-100.646721,...,2272.639669,1.200361,3904.104621,-45.581455,-76.195732,0.212792,47053,voice,142108_149760.ogg,19
8453,-0.198840,-0.026428,0.484748,0.558405,0.458368,0.481970,0.493589,0.498251,0.470169,-93.289604,...,1810.905441,1.761557,4142.593589,-45.505581,-29.093159,0.037906,8383,voice,147088_72960.ogg,19
8455,-0.040879,0.077836,0.445147,0.216153,0.162184,0.219060,0.224821,0.265225,0.208129,-224.087448,...,2759.989165,1.015649,6755.730224,-27.749546,-59.993065,0.164289,36301,voice,147642_53760.ogg,19


In [14]:
outilers.instrumento.value_counts()

flute                128
mallet_percussion    127
cymbals               85
synthesizer           84
violin                59
drums                 52
organ                 47
bass                  40
accordion             36
guitar                34
voice                 29
ukulele               21
banjo                 19
piano                 18
mandolin              16
clarinet              15
trombone              12
saxophone             11
trumpet                8
cello                  7
Name: instrumento, dtype: int64

- Agora vamos remover os outilers da nossa base

In [15]:
# pegando indice dos registros que não são outilers
outilers_id =[previsions[i] == 0 for i in range(previsions.shape[0])]
df_train = df.iloc[outilers_id,:]
df_train

Unnamed: 0,tonnetz0,tonnetz1,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,mfcc0,...,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name,labels
0,0.010380,0.038218,0.513887,0.581963,0.434641,0.293949,0.317826,0.271148,0.548258,-24.170067,...,2600.055158,1.098243,5260.821321,-44.640682,-30.202150,0.149672,33095,accordion,000139_119040.ogg,0
1,0.084643,-0.035337,0.175587,0.429345,0.840043,0.384524,0.282545,0.346341,0.482365,-90.556145,...,2385.271420,1.360742,5225.124166,-46.575737,-59.323132,0.112181,24785,accordion,000145_172800.ogg,0
2,-0.030127,0.082404,0.453814,0.219808,0.228484,0.446050,0.178916,0.250981,0.327460,-106.124260,...,2069.850731,1.343197,4287.155824,-56.276707,-71.512909,0.096007,21230,accordion,000201_168960.ogg,0
3,0.157274,-0.066099,0.209729,0.338755,0.660264,0.256625,0.275209,0.180417,0.399448,-66.319801,...,2903.979876,1.218963,6751.133809,-42.723160,-44.649315,0.144010,31836,accordion,000212_211200.ogg,0
4,-0.049096,-0.029067,0.484400,0.334572,0.441629,0.679485,0.382958,0.326033,0.205775,-226.263824,...,1251.374287,1.339527,2263.034730,-62.227947,-78.196365,0.059804,13214,accordion,000640_49920.ogg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8470,0.049317,0.016959,0.431032,0.433562,0.439329,0.472623,0.436766,0.458280,0.480054,2.422542,...,2542.887346,1.234712,5289.648834,-53.747391,-70.919098,0.141731,31319,voice,151680_126720.ogg,19
8472,0.006115,-0.030219,0.161793,0.190661,0.249882,0.378161,0.461520,0.332309,0.294171,-179.792145,...,1351.326334,1.450944,2412.643029,-52.071827,-76.029976,0.061211,13533,voice,153412_134400.ogg,19
8473,-0.008339,0.005653,0.589552,0.309548,0.187583,0.169132,0.240637,0.332375,0.333517,-153.614334,...,2685.831528,1.274003,4614.225695,-66.120743,-80.000000,0.157649,34869,voice,153478_30720.ogg,19
8474,0.108185,0.025366,0.615495,0.541314,0.467334,0.611199,0.791296,0.678383,0.508162,-80.250626,...,1428.726581,1.081204,3358.205447,-30.260586,-13.068514,0.032283,7136,voice,153986_203520.ogg,19


- Treinar modelos sem outlier e balanceameto

In [17]:
X_rm_outilers = df_train.drop(columns=['labels', 'instrumento','file_name'])
y_rm_outilers = df_train['labels'].to_frame()

models = np.array([
    GaussianNB(),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    # XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

t2_acuracy_models = [cross_validate_balancead(k=5, model=model, X=X_rm_outilers,  y=y_rm_outilers) for model in models]

dict_results_t2 = {
    'Naive Bayes': t2_acuracy_models[0],
    'KNN': t2_acuracy_models[1],
    'Arvore de Decisão': t2_acuracy_models[2],
    'Floresta Aleatoria': t2_acuracy_models[3],
    'HistGradientBoosting': t2_acuracy_models[4],
    'LIGHTGBM': t2_acuracy_models[5],
    # 'XGB': t2_acuracy_models[6],
    'MLP': t2_acuracy_models[6],
    'SVC': t2_acuracy_models[7],
}

dict_results_t2

Acuracia do modelo GaussianNB() do Fold 0: 0.2516382699868938
Acuracia do modelo GaussianNB() do Fold 1: 0.2706422018348624
Acuracia do modelo GaussianNB() do Fold 2: 0.264089121887287
Acuracia do modelo GaussianNB() do Fold 3: 0.26295081967213113
Acuracia do modelo GaussianNB() do Fold 4: 0.2701639344262295
Acuracia do modelo KNeighborsClassifier() do Fold 0: 0.1251638269986894
Acuracia do modelo KNeighborsClassifier() do Fold 1: 0.13958060288335516
Acuracia do modelo KNeighborsClassifier() do Fold 2: 0.145478374836173
Acuracia do modelo KNeighborsClassifier() do Fold 3: 0.13377049180327868
Acuracia do modelo KNeighborsClassifier() do Fold 4: 0.15737704918032788
Acuracia do modelo DecisionTreeClassifier() do Fold 0: 0.1690694626474443
Acuracia do modelo DecisionTreeClassifier() do Fold 1: 0.18479685452162517
Acuracia do modelo DecisionTreeClassifier() do Fold 2: 0.18086500655307994
Acuracia do modelo DecisionTreeClassifier() do Fold 3: 0.18950819672131147
Acuracia do modelo DecisionTr

{'Naive Bayes': 0.26389686956148073,
 'KNN': 0.14027406914036483,
 'Arvore de Decisão': 0.1792085598263971,
 'Floresta Aleatoria': 0.3227585673463249,
 'HistGradientBoosting': 0.34570328513417703,
 'LIGHTGBM': 0.334687751111875,
 'MLP': 0.1460482564510238,
 'SVC': 0.1887782910426917}

# TESTE 3: Features de Maior Correlação

In [18]:
df.corr()

Unnamed: 0,tonnetz0,tonnetz1,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,mfcc0,...,spectral_band3,spectral_band4,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,labels
tonnetz0,1.0,0.06214,-0.097181,-0.170037,0.169987,-0.225499,0.160613,-0.176057,-0.113787,-0.029714,...,-0.060755,-0.058103,-0.061619,-0.092151,-0.058551,-0.062679,-0.076992,-0.051588,-0.051571,-0.031576
tonnetz1,0.06214,1.0,0.242571,-0.165899,0.098279,-0.093905,-0.187621,0.059018,-0.24273,-0.062356,...,-0.074494,-0.069979,-0.076872,-0.062285,-0.073258,-0.010867,-0.012354,-0.065814,-0.065818,-0.002206
chroma1,-0.097181,0.242571,1.0,0.494096,0.107322,0.11776,0.108515,0.168467,0.075469,0.146132,...,0.119271,0.104452,0.108212,0.097569,0.130602,0.182205,0.227982,0.069222,0.069246,-0.045024
chroma2,-0.170037,-0.165899,0.494096,1.0,0.499016,0.331268,0.141218,0.179724,0.254288,0.186089,...,0.16617,0.148799,0.157707,0.126737,0.179851,0.19846,0.256889,0.099693,0.099708,-0.050132
chroma3,0.169987,0.098279,0.107322,0.499016,1.0,0.532327,0.132038,0.105914,0.144183,0.172337,...,0.145982,0.130206,0.133901,0.083847,0.157737,0.14519,0.19882,0.07473,0.07474,-0.071296
chroma4,-0.225499,-0.093905,0.11776,0.331268,0.532327,1.0,0.499493,0.303442,0.216477,0.18172,...,0.165171,0.1475,0.143578,0.117159,0.170458,0.200901,0.268462,0.071936,0.071957,-0.052828
chroma5,0.160613,-0.187621,0.108515,0.141218,0.132038,0.499493,1.0,0.550665,0.225242,0.209649,...,0.143825,0.127655,0.121618,0.100378,0.150782,0.201781,0.257689,0.060857,0.06087,-0.065184
chroma6,-0.176057,0.059018,0.168467,0.179724,0.105914,0.303442,0.550665,1.0,0.519934,0.18526,...,0.156626,0.143323,0.117864,0.121308,0.150682,0.239641,0.304158,0.044637,0.044646,-0.054509
chroma7,-0.113787,-0.24273,0.075469,0.254288,0.144183,0.216477,0.225242,0.519934,1.0,0.198951,...,0.188767,0.169222,0.163844,0.133441,0.19632,0.20953,0.283768,0.094836,0.094854,-0.054739
mfcc0,-0.029714,-0.062356,0.146132,0.186089,0.172337,0.18172,0.209649,0.18526,0.198951,1.0,...,0.504851,0.459083,0.498269,0.236631,0.546795,0.28835,0.265571,0.318793,0.318687,0.007359


In [19]:
df_corr = df.corr().unstack().reset_index().dropna()
df_corr.rename(columns = {'level_0': 'features_a', 'level_1': 'features_b', 0:'correlacao'}, inplace = True)
df_corr

Unnamed: 0,features_a,features_b,correlacao
0,tonnetz0,tonnetz0,1.000000
1,tonnetz0,tonnetz1,0.062140
2,tonnetz0,chroma1,-0.097181
3,tonnetz0,chroma2,-0.170037
4,tonnetz0,chroma3,0.169987
...,...,...,...
2020,labels,spectogram,-0.001475
2021,labels,mel_spectogram,0.003394
2022,labels,zero_crossing_rate,0.007464
2023,labels,zero_crossing,0.007479


- Correlação Positia

In [20]:
corr_posit = df_corr.query('correlacao > 0.7 and correlacao < 1.0')
corr_posit

Unnamed: 0,features_a,features_b,correlacao
1565,spectral_band2,spectral_band3,0.977866
1566,spectral_band2,spectral_band4,0.925967
1567,spectral_band2,spectral_centroid,0.864482
1569,spectral_band2,spectral_rolloff,0.927909
1609,spectral_band3,spectral_band2,0.977866
1611,spectral_band3,spectral_band4,0.984122
1612,spectral_band3,spectral_centroid,0.77182
1614,spectral_band3,spectral_rolloff,0.837256
1654,spectral_band4,spectral_band2,0.925967
1655,spectral_band4,spectral_band3,0.984122


In [44]:
corr_posit.features_a.unique()

array(['spectral_band2', 'spectral_band3', 'spectral_band4',
       'spectral_centroid', 'spectral_rolloff', 'spectogram',
       'mel_spectogram', 'zero_crossing_rate', 'zero_crossing'],
      dtype=object)

In [22]:
X =  df[corr_posit.features_a.unique()]
y = df['labels'].to_frame()

models = np.array([
    GaussianNB(),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    # XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

t3_acuracy_models_corr_posit = [cross_validate_balancead(k=5, model=model, X=X, y=y) for model in models]

dict_results_t3_corr_posit = {
    'Naive Bayes': t3_acuracy_models_corr_posit[0],
    'KNN': t3_acuracy_models_corr_posit[1],
    'Arvore de Decisão': t3_acuracy_models_corr_posit[2],
    'Floresta Aleatoria': t3_acuracy_models_corr_posit[3],
    'HistGradientBoosting': t3_acuracy_models_corr_posit[4],
    'LIGHTGBM': t3_acuracy_models_corr_posit[5],
    # 'XGB': t3_acuracy_models_corr_posit[6],
    'MLP': t3_acuracy_models_corr_posit[6],
    'SVC': t3_acuracy_models_corr_posit[7],
}

dict_results_t3_corr_posit

Acuracia do modelo GaussianNB() do Fold 0: 0.17275943396226415
Acuracia do modelo GaussianNB() do Fold 1: 0.17345132743362832
Acuracia do modelo GaussianNB() do Fold 2: 0.18525073746312684
Acuracia do modelo GaussianNB() do Fold 3: 0.16519174041297935
Acuracia do modelo GaussianNB() do Fold 4: 0.17994100294985252
Acuracia do modelo KNeighborsClassifier() do Fold 0: 0.13266509433962265
Acuracia do modelo KNeighborsClassifier() do Fold 1: 0.14454277286135694
Acuracia do modelo KNeighborsClassifier() do Fold 2: 0.1480825958702065
Acuracia do modelo KNeighborsClassifier() do Fold 3: 0.12861356932153392
Acuracia do modelo KNeighborsClassifier() do Fold 4: 0.1504424778761062
Acuracia do modelo DecisionTreeClassifier() do Fold 0: 0.1303066037735849
Acuracia do modelo DecisionTreeClassifier() do Fold 1: 0.12743362831858407
Acuracia do modelo DecisionTreeClassifier() do Fold 2: 0.13215339233038348
Acuracia do modelo DecisionTreeClassifier() do Fold 3: 0.11622418879056047
Acuracia do modelo Deci

{'Naive Bayes': 0.17531884844437023,
 'KNN': 0.14086930205376524,
 'Arvore de Decisão': 0.12812621750987924,
 'Floresta Aleatoria': 0.19254529136750712,
 'HistGradientBoosting': 0.19407831023543162,
 'LIGHTGBM': 0.19738179718372573,
 'MLP': 0.10429593142984361,
 'SVC': 0.1813364084154283}

- Correlação Negativa

In [23]:
corr_negat = df_corr.query('correlacao < 0 and correlacao < -0.66')
corr_negat

Unnamed: 0,features_a,features_b,correlacao
484,mfcc1,spectral_band2,-0.843059
485,mfcc1,spectral_band3,-0.796348
486,mfcc1,spectral_band4,-0.725181
487,mfcc1,spectral_centroid,-0.872413
489,mfcc1,spectral_rolloff,-0.856605
492,mfcc1,zero_crossing_rate,-0.663971
493,mfcc1,zero_crossing,-0.664032
1540,spectral_band2,mfcc1,-0.843059
1585,spectral_band3,mfcc1,-0.796348
1630,spectral_band4,mfcc1,-0.725181


In [25]:
X =  df[corr_negat.features_b.unique()]
y = df['labels'].to_frame()

models = np.array([
    GaussianNB(),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

t3_acuracy_models_corr_neg = [cross_validate_balancead(k=5, model=model, X=X, y=y) for model in models]

dict_results_t3_corr_neg = {
    'Naive Bayes': t3_acuracy_models_corr_neg[0],
    'KNN': t3_acuracy_models_corr_neg[1],
    'Arvore de Decisão': t3_acuracy_models_corr_neg[2],
    'Floresta Aleatoria': t3_acuracy_models_corr_neg[3],
    'HistGradientBoosting': t3_acuracy_models_corr_neg[4],
    'LIGHTGBM': t3_acuracy_models_corr_neg[5],
    # 'XGB': t3_acuracy_models_corr_neg[6],;
    'MLP': t3_acuracy_models_corr_neg[6],
    'SVC': t3_acuracy_models_corr_neg[7],
}

dict_results_t3_corr_neg

Acuracia do modelo GaussianNB() do Fold 0: 0.15625
Acuracia do modelo GaussianNB() do Fold 1: 0.15988200589970503
Acuracia do modelo GaussianNB() do Fold 2: 0.17581120943952802
Acuracia do modelo GaussianNB() do Fold 3: 0.15693215339233038
Acuracia do modelo GaussianNB() do Fold 4: 0.15693215339233038
Acuracia do modelo KNeighborsClassifier() do Fold 0: 0.13266509433962265
Acuracia do modelo KNeighborsClassifier() do Fold 1: 0.14454277286135694
Acuracia do modelo KNeighborsClassifier() do Fold 2: 0.14690265486725665
Acuracia do modelo KNeighborsClassifier() do Fold 3: 0.13038348082595871
Acuracia do modelo KNeighborsClassifier() do Fold 4: 0.15221238938053097
Acuracia do modelo DecisionTreeClassifier() do Fold 0: 0.14150943396226415
Acuracia do modelo DecisionTreeClassifier() do Fold 1: 0.13097345132743363
Acuracia do modelo DecisionTreeClassifier() do Fold 2: 0.13215339233038348
Acuracia do modelo DecisionTreeClassifier() do Fold 3: 0.13451327433628318
Acuracia do modelo DecisionTreeC

{'Naive Bayes': 0.16116150442477875,
 'KNN': 0.14134127845494518,
 'Arvore de Decisão': 0.13614849446206936,
 'Floresta Aleatoria': 0.1956129292591974,
 'HistGradientBoosting': 0.19938776646073358,
 'LIGHTGBM': 0.20139415316970002,
 'MLP': 0.10901228641398117,
 'SVC': 0.18192637891690322}

- Se combinarmos as duas abordagens ?

In [27]:
features = np.append(corr_posit.features_b.values, corr_negat.features_b.values)
features = np.unique(features)

X = df[features]
y = df['labels'].to_frame()

models = np.array([
    GaussianNB(),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

t3_combine_corr = [cross_validate_balancead(k=5, model=model, X=X, y=y) for model in models]

dict_results_t3_cb = {
    'Naive Bayes': t3_combine_corr[0],
    'KNN': t3_combine_corr[1],
    'Arvore de Decisão': t3_combine_corr[2],
    'Floresta Aleatoria': t3_combine_corr[3],
    'HistGradientBoosting': t3_combine_corr[4],
    'LIGHTGBM': t3_combine_corr[5],
    'XGB': t3_combine_corr[6],
    'MLP': t3_combine_corr[7],
    'SVC': t3_combine_corr[8],
}

dict_results_t3_cb

Acuracia do modelo GaussianNB() do Fold 0: 0.1733490566037736
Acuracia do modelo GaussianNB() do Fold 1: 0.16932153392330385
Acuracia do modelo GaussianNB() do Fold 2: 0.18761061946902655
Acuracia do modelo GaussianNB() do Fold 3: 0.17168141592920355
Acuracia do modelo GaussianNB() do Fold 4: 0.1775811209439528
Acuracia do modelo KNeighborsClassifier() do Fold 0: 0.13089622641509435
Acuracia do modelo KNeighborsClassifier() do Fold 1: 0.14454277286135694
Acuracia do modelo KNeighborsClassifier() do Fold 2: 0.1480825958702065
Acuracia do modelo KNeighborsClassifier() do Fold 3: 0.12920353982300886
Acuracia do modelo KNeighborsClassifier() do Fold 4: 0.15103244837758112
Acuracia do modelo DecisionTreeClassifier() do Fold 0: 0.1338443396226415
Acuracia do modelo DecisionTreeClassifier() do Fold 1: 0.14513274336283186
Acuracia do modelo DecisionTreeClassifier() do Fold 2: 0.13805309734513274
Acuracia do modelo DecisionTreeClassifier() do Fold 3: 0.1368731563421829
Acuracia do modelo Decisi

{'Naive Bayes': 0.1759087493738521,
 'KNN': 0.14075151666944957,
 'Arvore de Decisão': 0.1396892219068292,
 'Floresta Aleatoria': 0.21885296654978573,
 'HistGradientBoosting': 0.2143702343184728,
 'LIGHTGBM': 0.2113033617187065,
 'XGB': 0.21779234151499974,
 'MLP': 0.10783902432236878,
 'SVC': 0.1819263093449101}

# TESTE4: Recursive Feature Elimination - (RFE)

- RFE

In [28]:
X = df.drop(columns=['instrumento','file_name','labels'])
y = df['labels'].to_frame()

clf = DecisionTreeClassifier(max_depth=17,
                             max_features='auto', 
                             min_samples_leaf=2,
                             min_samples_split=3)
                               
                               
estimators = RFECV(estimator=clf, scoring='accuracy', cv=5)

# fit rfecv
rfecv_data = estimators.fit(X, y)

# gorresponde à posição do ranking do i-ésimo recurso
ranking_features = rfecv_data.ranking_

# A máscara dos recursos selecionados.
support_features = rfecv_data.support_

# filtro
features_selects = X.columns[support_features]

In [47]:
features_selects

Index(['mfcc0', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'chroma9',
       'chroma11', 'spectral_band4', 'spectral_onset', 'spectral_rolloff',
       'mel_spectogram', 'zero_crossing_rate'],
      dtype='object')

In [46]:
ranking_features.shape

(44,)

- Count Ranking Features

In [48]:
# Couts da features import 
unique, frequency = np.unique(ranking_features, return_counts=True)
summary = np.asarray((unique, frequency)).T
summary

array([[ 1, 13],
       [ 2,  1],
       [ 3,  1],
       [ 4,  1],
       [ 5,  1],
       [ 6,  1],
       [ 7,  1],
       [ 8,  1],
       [ 9,  1],
       [10,  1],
       [11,  1],
       [12,  1],
       [13,  1],
       [14,  1],
       [15,  1],
       [16,  1],
       [17,  1],
       [18,  1],
       [19,  1],
       [20,  1],
       [21,  1],
       [22,  1],
       [23,  1],
       [24,  1],
       [25,  1],
       [26,  1],
       [27,  1],
       [28,  1],
       [29,  1],
       [30,  1],
       [31,  1],
       [32,  1]])

- Treinamento de Modelo

In [30]:
models = np.array([
    GaussianNB(),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

t4_acuracy_models = [cross_validate_balancead(k=5, 
                                              model=model,
                                              X=X, 
                                              y=y, 
                                              oversampling=True) for model in models]

dict_results_t4 = {
    'Naive Bayes': t4_acuracy_models[0],
    'KNN': t4_acuracy_models[1],
    'Arvore de Decisão': t4_acuracy_models[2],
    'Floresta Aleatoria': t4_acuracy_models[3],
    'HistGradientBoosting': t4_acuracy_models[4],
    'LIGHTGBM': t4_acuracy_models[5],
    'XGB': t4_acuracy_models[6],
    'MLP': t4_acuracy_models[7],
    'SVC': t4_acuracy_models[8],
}

dict_results_t4

Acuracia do modelo GaussianNB() do Fold 0: 0.23820754716981132
Acuracia do modelo GaussianNB() do Fold 1: 0.25014749262536873
Acuracia do modelo GaussianNB() do Fold 2: 0.2584070796460177
Acuracia do modelo GaussianNB() do Fold 3: 0.25545722713864305
Acuracia do modelo GaussianNB() do Fold 4: 0.26430678466076696
Acuracia do modelo KNeighborsClassifier() do Fold 0: 0.11556603773584906
Acuracia do modelo KNeighborsClassifier() do Fold 1: 0.11976401179941003
Acuracia do modelo KNeighborsClassifier() do Fold 2: 0.12507374631268436
Acuracia do modelo KNeighborsClassifier() do Fold 3: 0.11445427728613569
Acuracia do modelo KNeighborsClassifier() do Fold 4: 0.12979351032448377
Acuracia do modelo DecisionTreeClassifier() do Fold 0: 0.15919811320754718
Acuracia do modelo DecisionTreeClassifier() do Fold 1: 0.17168141592920355
Acuracia do modelo DecisionTreeClassifier() do Fold 2: 0.16047197640117994
Acuracia do modelo DecisionTreeClassifier() do Fold 3: 0.15103244837758112
Acuracia do modelo De

{'Naive Bayes': 0.2533052262481216,
 'KNN': 0.12093031669171257,
 'Arvore de Decisão': 0.16399301497189295,
 'Floresta Aleatoria': 0.3146548533422385,
 'HistGradientBoosting': 0.3371888740468637,
 'LIGHTGBM': 0.3363619413368954,
 'XGB': 0.33530075972616463,
 'MLP': 0.13804008738242335,
 'SVC': 0.1593918016363333}