In [1]:
# `os` and `sys` are imported here because this cell uses them and the
# notebook's import block never brings either of them into scope.
import os
import sys

# '../src/' -> '../src'; its parent ('..') is what must be importable so that
# the `from src...` imports further down resolve.
SCRIPT_DIR = os.path.dirname('../src/')
_project_root = os.path.dirname(SCRIPT_DIR)
if _project_root not in sys.path:  # re-running the cell must not pile up duplicates
    sys.path.append(_project_root)

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from pyod.models.knn import KNN 

from sklearn.feature_selection import RFECV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import (
    RandomForestClassifier,
    HistGradientBoostingClassifier)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from src.wrapped import Wrapped
from src.train import cross_validate_balancead, permutation, combination

In [2]:
# Project wrapper built from the three data directories used by this notebook.
wp = Wrapped('../data/row/', '../data/processed/', '../data/files/')

# Importando dados para treinamento

In [3]:
# Load the dataset registered as 'df_instrumentos_features_selecionadas' through the project wrapper.
df = wp.load_data('df_instrumentos_features_selecionadas')
# Bare expression so the notebook renders the frame.
df

Unnamed: 0,tonnetz0,tonnetz1,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,mfcc0,...,spectral_band4,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name
0,0.010380,0.038218,0.513887,0.581963,0.434641,0.293949,0.317826,0.271148,0.548258,-24.170067,...,3409.640722,2600.055158,1.098243,5260.821321,-44.640682,-30.202150,0.149672,33095,accordion,000139_119040.ogg
1,0.084643,-0.035337,0.175587,0.429345,0.840043,0.384524,0.282545,0.346341,0.482365,-90.556145,...,3516.595715,2385.271420,1.360742,5225.124166,-46.575737,-59.323132,0.112181,24785,accordion,000145_172800.ogg
2,-0.030127,0.082404,0.453814,0.219808,0.228484,0.446050,0.178916,0.250981,0.327460,-106.124260,...,3256.491175,2069.850731,1.343197,4287.155824,-56.276707,-71.512909,0.096007,21230,accordion,000201_168960.ogg
3,0.157274,-0.066099,0.209729,0.338755,0.660264,0.256625,0.275209,0.180417,0.399448,-66.319801,...,3855.791675,2903.979876,1.218963,6751.133809,-42.723160,-44.649315,0.144010,31836,accordion,000212_211200.ogg
4,-0.049096,-0.029067,0.484400,0.334572,0.441629,0.679485,0.382958,0.326033,0.205775,-226.263824,...,2883.225952,1251.374287,1.339527,2263.034730,-62.227947,-78.196365,0.059804,13214,accordion,000640_49920.ogg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8471,0.070358,0.003048,0.332754,0.359639,0.473210,0.512169,0.607755,0.500391,0.378782,-87.742729,...,3664.812127,3194.922838,1.253166,6714.437434,-53.587788,-76.614616,0.134979,29855,voice,153341_184320.ogg
8472,0.006115,-0.030219,0.161793,0.190661,0.249882,0.378161,0.461520,0.332309,0.294171,-179.792145,...,2879.597305,1351.326334,1.450944,2412.643029,-52.071827,-76.029976,0.061211,13533,voice,153412_134400.ogg
8473,-0.008339,0.005653,0.589552,0.309548,0.187583,0.169132,0.240637,0.332375,0.333517,-153.614334,...,3274.875402,2685.831528,1.274003,4614.225695,-66.120743,-80.000000,0.157649,34869,voice,153478_30720.ogg
8474,0.108185,0.025366,0.615495,0.541314,0.467334,0.611199,0.791296,0.678383,0.508162,-80.250626,...,3376.494191,1428.726581,1.081204,3358.205447,-30.260586,-13.068514,0.032283,7136,voice,153986_203520.ogg


- Definindo as variáveis de treino e o target

In [16]:
# Encode the instrument names as integers and keep them in a new `labels` column.
le = LabelEncoder()
df['labels'] = le.fit_transform(df['instrumento'])

In [5]:
# Predictors: everything except the encoded target and the two descriptive columns.
X = df.drop(['labels', 'instrumento', 'file_name'], axis=1)
# Target kept as a one-column frame.
y = df[['labels']]

# TESTE 1: Balanceamento

- 1° Opção: Balanceamento das classes com SMOTE

In [None]:
# Baseline classifiers, each left at its library defaults.
t1_models = np.array([
    GaussianNB(), KNeighborsClassifier(), DecisionTreeClassifier(),
    RandomForestClassifier(), HistGradientBoostingClassifier(),
    LGBMClassifier(), XGBClassifier(), MLPClassifier(), SVC(),
])

# One result per classifier, evaluated with k=5 and oversampling switched on.
t1_acuracy_models = [
    cross_validate_balancead(k=5, model=model, X=X, y=y, oversampling=True)
    for model in t1_models
]

# Attach a display name to each result; the names follow the order of `t1_models`.
dict_results_t1 = dict(zip(
    ['Naive Bayes', 'KNN', 'Arvore de Decisão', 'Floresta Aleatoria',
     'HistGradientBoosting', 'LIGHTGBM', 'XGB', 'MLP', 'SVC'],
    t1_acuracy_models,
))

dict_results_t1

- 2° Opção: Class Weight

In [None]:
# Option 2 balances the classes through `class_weight='balanced'` instead of
# resampling, so only estimators that accept that argument are evaluated here.
models_class_weight = np.array([
    DecisionTreeClassifier(class_weight='balanced'),
    RandomForestClassifier(class_weight='balanced', random_state = 0, n_jobs = -1),
    LGBMClassifier(class_weight='balanced'),
])

# Oversampling is switched off explicitly: with it on, this cell would measure
# oversampling + class weights and could not be compared against option 1.
# NOTE(review): assumes `oversampling=False` leaves the training folds untouched — confirm in src/train.
t1_acuracy_models_class_weight = [
    cross_validate_balancead(k=5, model=model, X=X, y=y, oversampling=False)
    for model in models_class_weight
]

# Results keyed by display name, in the same order as `models_class_weight`.
dict_results_t1_class_weight = {
    'Arvore de Decisão': t1_acuracy_models_class_weight[0],
    'Floresta Aleatoria': t1_acuracy_models_class_weight[1],
    'LIGHTGBM': t1_acuracy_models_class_weight[2],
}

dict_results_t1_class_weight

# TESTE 2: Remover Outlier

- Treinar modelo de classificação de outlier

In [None]:
# pyod k-nearest-neighbours outlier detector with default settings.
detector_outilier = KNN()

# Fit on the audio features only. The two descriptive columns are dropped, and
# so is the encoded target: `labels` is an arbitrary integer code, and leaving
# it in would let the class id distort the neighbour distances.
new_df = (
    df.drop(columns=['instrumento', 'file_name'])
      .drop(columns=['labels'], errors='ignore')  # tolerate the cell order where `labels` is not created yet
)

detector_outilier.fit(new_df)

- Checar número de outliers e não outliers<br/>
**0 (False) Não é outlier**<br/>
**1 (True) É outlier**

In [None]:
# Labels stored on the fitted detector (per the notebook's legend: 0 = not an outlier, 1 = outlier).
previsions = detector_outilier.labels_
# Count how many records received each label.
np.unique(previsions, return_counts=True)

- Distância euclidiana dos registros

In [None]:
# Per-record outlier scores stored on the fitted detector.
confiance_previsions = detector_outilier.decision_scores_
confiance_previsions

- Checar os outliers

In [None]:
# Boolean flag per record: True where the detector labelled it as an outlier (1).
outilers_id = [flag == 1 for flag in previsions]
# Keep only the flagged rows.
outilers = df.iloc[outilers_id]
outilers

In [None]:
# How many flagged records belong to each instrument.
outilers['instrumento'].value_counts()

- Agora vamos remover os outliers da nossa base

In [None]:
# Boolean flag per record: True where the detector labelled it as a regular record (0).
outilers_id = [flag == 0 for flag in previsions]
# Training frame without the flagged outliers.
df_train = df.iloc[outilers_id]
df_train

- Treinar modelos sem outlier e balanceamento

In [None]:
# Predictors and target taken from the outlier-free frame.
X_rm_outilers = df_train.drop(['labels', 'instrumento', 'file_name'], axis=1)
y_rm_outilers = df_train[['labels']]

# Same nine default-configured classifiers used in the other tests.
models = np.array([
    GaussianNB(), KNeighborsClassifier(), DecisionTreeClassifier(),
    RandomForestClassifier(), HistGradientBoostingClassifier(),
    LGBMClassifier(), XGBClassifier(), MLPClassifier(), SVC(),
])

# One result per classifier with k=5; `oversampling` is left at the helper's default.
t2_acuracy_models = [
    cross_validate_balancead(k=5, model=model, X=X_rm_outilers, y=y_rm_outilers)
    for model in models
]

# Attach a display name to each result; the names follow the order of `models`.
dict_results_t2 = dict(zip(
    ['Naive Bayes', 'KNN', 'Arvore de Decisão', 'Floresta Aleatoria',
     'HistGradientBoosting', 'LIGHTGBM', 'XGB', 'MLP', 'SVC'],
    t2_acuracy_models,
))

dict_results_t2

# TESTE 3: Features de Maior Correlação

In [None]:
# `df` still carries the string columns `instrumento` and `file_name`; pandas 2.x
# raises on them inside `corr`, so the numeric columns are selected explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Long-format correlation table: one row per (features_a, features_b) pair.
# Only numeric columns are correlated because `df` also holds string columns,
# which pandas 2.x refuses to correlate. The rename is chained rather than done
# in place so that re-running the cell always rebuilds the same frame.
df_corr = (
    df.select_dtypes(include='number')
      .corr()
      .unstack()
      .reset_index()
      .dropna()
      .rename(columns={'level_0': 'features_a', 'level_1': 'features_b', 0: 'correlacao'})
)
df_corr

- Correlação Positiva

In [None]:
# Strongly positive pairs: above 0.7 but below 1.0, which leaves out the
# self-correlation entries on the diagonal.
strong_positive = (df_corr['correlacao'] > 0.7) & (df_corr['correlacao'] < 1.0)
corr_posit = df_corr[strong_positive]
corr_posit

In [None]:
# Predictors: every column that takes part in a strongly positive pair.
# `labels` is numeric, so it is part of the correlation table; it is filtered
# out here so the target can never end up among the predictors.
positive_features = [col for col in corr_posit.features_a.unique() if col != 'labels']
X = df[positive_features]
y = df['labels'].to_frame()

# Same nine default-configured classifiers used in the other tests.
models = np.array([
    GaussianNB(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

# One result per classifier with k=5; `oversampling` is left at the helper's default.
t3_acuracy_models_corr_posit = [cross_validate_balancead(k=5, model=model, X=X, y=y) for model in models]

# Results keyed by display name, in the same order as `models`.
dict_results_t3_corr_posit = {
    'Naive Bayes': t3_acuracy_models_corr_posit[0],
    'KNN': t3_acuracy_models_corr_posit[1],
    'Arvore de Decisão': t3_acuracy_models_corr_posit[2],
    'Floresta Aleatoria': t3_acuracy_models_corr_posit[3],
    'HistGradientBoosting': t3_acuracy_models_corr_posit[4],
    'LIGHTGBM': t3_acuracy_models_corr_posit[5],
    'XGB': t3_acuracy_models_corr_posit[6],
    'MLP': t3_acuracy_models_corr_posit[7],
    'SVC': t3_acuracy_models_corr_posit[8],
}

dict_results_t3_corr_posit

- Correlação Negativa

In [None]:
# Strongly negative pairs: correlation below -0.66 (which already implies below 0).
corr_negat = df_corr[df_corr['correlacao'] < -0.66]
corr_negat

In [None]:
# Predictors: every column that takes part in a strongly negative pair.
# `labels` is numeric, so it is part of the correlation table; it is filtered
# out here so the target can never end up among the predictors.
negative_features = [col for col in corr_negat.features_b.unique() if col != 'labels']
X = df[negative_features]
y = df['labels'].to_frame()

# Fresh, default-configured classifiers for this test.
models = np.array([
    GaussianNB(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

# Iterate over the `models` built in this cell. The previous version looped over
# `t1_models`, which tied this test to the instances created for test 1 and left
# the array above unused.
t3_acuracy_models_corr_neg = [cross_validate_balancead(k=5, model=model, X=X, y=y) for model in models]

# Results keyed by display name, in the same order as `models`.
dict_results_t3_corr_neg = {
    'Naive Bayes': t3_acuracy_models_corr_neg[0],
    'KNN': t3_acuracy_models_corr_neg[1],
    'Arvore de Decisão': t3_acuracy_models_corr_neg[2],
    'Floresta Aleatoria': t3_acuracy_models_corr_neg[3],
    'HistGradientBoosting': t3_acuracy_models_corr_neg[4],
    'LIGHTGBM': t3_acuracy_models_corr_neg[5],
    'XGB': t3_acuracy_models_corr_neg[6],
    'MLP': t3_acuracy_models_corr_neg[7],
    'SVC': t3_acuracy_models_corr_neg[8],
}

dict_results_t3_corr_neg

- E se combinarmos as duas abordagens?

In [None]:
# Union of the columns found in strongly positive and strongly negative pairs.
features = np.append(corr_posit.features_b.values, corr_negat.features_b.values)
features = np.unique(features)
# `labels` is numeric, so it is part of the correlation table; drop it so the
# target can never end up among the predictors.
features = features[features != 'labels']

X = df[features]
y = df['labels'].to_frame()

# Fresh, default-configured classifiers for this test.
models = np.array([
    GaussianNB(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

# Iterate over the `models` built in this cell. The previous version looped over
# `t1_models`, which tied this test to the instances created for test 1 and left
# the array above unused.
t3_combine_corr = [cross_validate_balancead(k=5, model=model, X=X, y=y) for model in models]

# Results keyed by display name, in the same order as `models`.
dict_results_t3_cb = {
    'Naive Bayes': t3_combine_corr[0],
    'KNN': t3_combine_corr[1],
    'Arvore de Decisão': t3_combine_corr[2],
    'Floresta Aleatoria': t3_combine_corr[3],
    'HistGradientBoosting': t3_combine_corr[4],
    'LIGHTGBM': t3_combine_corr[5],
    'XGB': t3_combine_corr[6],
    'MLP': t3_combine_corr[7],
    'SVC': t3_combine_corr[8],
}

dict_results_t3_cb

# TESTE 4: Recursive Feature Elimination - (RFE)

- RFE

In [22]:
# Full feature matrix and target for recursive feature elimination.
X = df.drop(columns=['instrumento','file_name','labels'])
y = df['labels'].to_frame()

# Base estimator whose feature importances drive the elimination.
# - max_features='sqrt' is what 'auto' meant for classification trees; 'auto'
#   itself was removed in scikit-learn 1.3.
# - random_state is fixed because max_features subsamples columns at every
#   split, so an unseeded tree selects a different feature set on each run.
clf = DecisionTreeClassifier(max_depth=17,
                             max_features='sqrt',
                             min_samples_leaf=2,
                             min_samples_split=3,
                             random_state=0)

# Cross-validated RFE, scored by accuracy over 5 folds.
estimators = RFECV(estimator=clf, scoring='accuracy', cv=5)

# Fit RFECV; the target is passed as a 1-D Series, since a one-column frame
# triggers scikit-learn's column-vector conversion warning.
rfecv_data = estimators.fit(X, y['labels'])

# Ranking position of the i-th feature (selected features are ranked 1).
ranking_features = rfecv_data.ranking_

# Boolean mask of the selected features.
support_features = rfecv_data.support_

# Names of the columns kept by the mask.
features_selects = X.columns[support_features]

In [25]:
# Display the columns kept by the RFECV support mask.
features_selects

Index(['chroma2', 'chroma4', 'chroma6', 'chroma7', 'mfcc0', 'mfcc1', 'mfcc3',
       'mfcc4', 'mfcc5', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc12',
       'mfcc13', 'mfcc15', 'mfcc19', 'chroma8', 'chroma9', 'chroma11',
       'chroma12', 'spectral_band3', 'spectral_band4', 'spectral_onset',
       'spectral_rolloff', 'mel_spectogram', 'zero_crossing_rate',
       'zero_crossing'],
      dtype='object')

- Count Ranking Features

In [None]:
# Count how many features received each RFECV ranking value.
unique, frequency = np.unique(ranking_features, return_counts=True)
# Two-column table: ranking value, number of features with that ranking.
summari = np.column_stack((unique, frequency))
summari

- Treinamento de Modelo

In [None]:
# Fresh, default-configured classifiers for this test.
models = np.array([
    GaussianNB(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    MLPClassifier(),
    SVC(),
])

# Train on the RFE-selected columns only. The previous version passed the full
# `X`, so the feature selection computed for this test was never applied.
X_rfe = X[features_selects]

t4_acuracy_models = [cross_validate_balancead(k=5, model=model, X=X_rfe, y=y, oversampling=True) for model in models]

# Results keyed by display name, in the same order as `models`.
dict_results_t4 = {
    'Naive Bayes': t4_acuracy_models[0],
    'KNN': t4_acuracy_models[1],
    'Arvore de Decisão': t4_acuracy_models[2],
    'Floresta Aleatoria': t4_acuracy_models[3],
    'HistGradientBoosting': t4_acuracy_models[4],
    'LIGHTGBM': t4_acuracy_models[5],
    'XGB': t4_acuracy_models[6],
    'MLP': t4_acuracy_models[7],
    'SVC': t4_acuracy_models[8],
}

dict_results_t4

# TESTE 5: SequentialFeatureSelector

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html#sklearn.feature_selection.SequentialFeatureSelector

# TESTE 6: Permutação

- Permutação de 2 features

In [33]:
# Groupings of size 3 drawn from the RFE-selected columns.
# NOTE(review): the section heading mentions 2 features while this call asks for 3 — confirm the intended size.
comb = combination(features_selects, 3)

In [34]:
# Dimensions of the object returned by `combination`.
comb.shape

(3276, 3)

In [32]:
# NOTE(review): this repeats the previous cell, and its execution count (32) is lower than those of the cells before it — the recorded output looks stale; consider removing the cell.
comb.shape

(378, 2)

In [None]:
# Build groupings of size 2 over every column except the last three
# (presumably instrumento, file_name and labels — confirm the column order).
comb = combination(df.columns[:-3].values, 2)
for i in comb:
    # The per-grouping training code that follows is commented out, so the visible loop body only prints.
    print(f'Permutação {i}')
    
    # models = np.array([
    #     GaussianNB(),
    #     KNeighborsClassifier(), 
    #     DecisionTreeClassifier(), 
    #     RandomForestClassifier(), 
    #     HistGradientBoostingClassifier(),
    #     LGBMClassifier(),
    #     XGBClassifier(),
    #     MLPClassifier(),
    #     SVC(),
    # ])
    
    # t5_acuracy_models = [cross_validate_balancead(k=5, 
    #                                               model=model,
    #                                               X=df[i],
    #                                               y=df['labels'].to_frame(),
    #                                               oversampling=True) for model in models]
    
    # dict_results = {
    #     'Naive Bayes': t5_acuracy_models[0],
    #     'KNN': t5_acuracy_models[1],
    #     'Arvore de Decisão': t5_acuracy_models[2],
    #     'Floresta Aleatoria': t5_acuracy_models[3],
    #     'HistGradientBoosting': t5_acuracy_models[4],
    #     'LIGHTGBM': t5_acuracy_models[5],
    #     'XGB': t5_acuracy_models[6],
    #     'MLP': t5_acuracy_models[7],
    #     'SVC': t5_acuracy_models[8],
    # }