# libs

In [109]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
from collections import defaultdict
from extract_mir import table_missing, play_audio

from sklearn.preprocessing import ( 
    LabelEncoder,
    MinMaxScaler
)

from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score
)

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Import Data

In [2]:
df = pd.read_csv('data/features_sonds_mean_mel50.csv').drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,arquivo,species,tempogram-mean-0,tempogram-mean-1,tempogram-mean-2,tempogram-mean-3,tempogram-mean-4,tempogram-mean-5,tempogram-mean-6,tempogram-mean-7,...,mfcc-mean-49,tonnetz-mean-0,tonnetz-mean-1,tonnetz-mean-2,tonnetz-mean-3,tonnetz-mean-4,tonnetz-mean-5,rms-mean-0,zcr-mean-0,rolloff-mean-0
0,132608,flammea,1.0,0.947106,0.916835,0.917622,0.918789,0.916606,0.914797,0.91519,...,1.593964,-0.006485,0.004259,-0.005751,0.00368,-0.00269,9e-06,0.010133,0.156673,10971.990206
1,132611,flammea,1.0,0.928961,0.86312,0.832358,0.815666,0.80409,0.808824,0.82419,...,1.661841,0.002514,-0.022335,-0.011952,0.058717,0.016824,-0.005542,0.013791,0.144492,11565.475525
2,35068,flammea,1.0,0.871185,0.745585,0.694683,0.664319,0.650915,0.663918,0.691141,...,-0.142084,0.004548,-0.009998,0.048268,-0.01549,-0.011625,-0.01095,0.010008,0.236547,11098.715496
3,82715,palustris,1.0,0.754161,0.538215,0.502754,0.491417,0.504126,0.56082,0.549658,...,-0.630695,0.003534,7.7e-05,0.016498,-0.00728,-0.005929,-0.001681,0.015728,0.235209,9576.714957
4,64685,palustris,1.0,0.821789,0.671557,0.644815,0.664769,0.675007,0.669493,0.679025,...,-0.797602,0.005944,-0.007011,0.028142,-0.019186,0.0015,0.000639,0.011937,0.206531,9820.667665


# Baseline

- Criar labels numericas para coluna species  

In [3]:
le = LabelEncoder()

df['classes'] = le.fit_transform(df['species'])

- Definindo X e y da baseline

In [4]:
X_baseline = df.drop(columns=["arquivo", "species", "classes"])
y_baseline = df["classes"]

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

- Arvore de Decisão

In [5]:
decision_tree = DecisionTreeClassifier()

baseline_score = cross_val_score(decision_tree, X_baseline, y_baseline, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(baseline_score), 2) * 100}%')

score: 12.0%


# Feature Selection

### Por Correlação

In [6]:
matrix_corr = df[df.columns[2:607]].corr()
matrix_corr

Unnamed: 0,tempogram-mean-0,tempogram-mean-1,tempogram-mean-2,tempogram-mean-3,tempogram-mean-4,tempogram-mean-5,tempogram-mean-6,tempogram-mean-7,tempogram-mean-8,tempogram-mean-9,...,mfcc-mean-47,mfcc-mean-48,mfcc-mean-49,tonnetz-mean-0,tonnetz-mean-1,tonnetz-mean-2,tonnetz-mean-3,tonnetz-mean-4,tonnetz-mean-5,rms-mean-0
tempogram-mean-0,,,,,,,,,,,...,,,,,,,,,,
tempogram-mean-1,,1.000000,0.952074,0.783402,0.865017,0.858657,0.728189,0.826525,0.867477,0.771899,...,0.041752,-0.072765,-0.041005,-0.027526,0.075960,-0.255079,0.114612,0.137314,0.067647,-0.143351
tempogram-mean-2,,0.952074,1.000000,0.932159,0.957960,0.931522,0.866526,0.921862,0.943328,0.902420,...,0.008842,0.004089,-0.052422,-0.014004,0.080942,-0.235921,0.140508,0.135522,0.082023,-0.150398
tempogram-mean-3,,0.783402,0.932159,1.000000,0.957687,0.902330,0.929309,0.932136,0.915879,0.948141,...,-0.032445,0.092304,-0.056606,-0.007944,0.082002,-0.195687,0.137238,0.136450,0.083986,-0.137048
tempogram-mean-4,,0.865017,0.957960,0.957687,1.000000,0.972271,0.934538,0.975583,0.966162,0.944666,...,0.004928,0.045084,-0.063145,-0.007183,0.074791,-0.204081,0.126997,0.107263,0.072630,-0.155259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tonnetz-mean-2,,-0.255079,-0.235921,-0.195687,-0.204081,-0.183585,-0.159286,-0.203316,-0.211473,-0.195042,...,-0.026824,0.190417,-0.018650,0.475604,-0.349256,1.000000,-0.278862,-0.653105,-0.490983,-0.253646
tonnetz-mean-3,,0.114612,0.140508,0.137238,0.126997,0.125510,0.118634,0.130747,0.152845,0.147366,...,0.213339,-0.000409,0.062163,0.033636,0.112342,-0.278862,1.000000,0.087158,0.360819,0.153998
tonnetz-mean-4,,0.137314,0.135522,0.136450,0.107263,0.090856,0.119918,0.130391,0.112696,0.126580,...,-0.095632,-0.235287,0.058894,-0.526424,0.508047,-0.653105,0.087158,1.000000,0.111689,0.230621
tonnetz-mean-5,,0.067647,0.082023,0.083986,0.072630,0.069660,0.064428,0.064532,0.090888,0.095759,...,0.172656,0.072305,0.050766,-0.094386,0.333758,-0.490983,0.360819,0.111689,1.000000,0.094825


- Transformando matrix em df de correlação 

In [8]:
df_corr = matrix_corr.unstack().reset_index()
df_corr = df_corr.rename(columns={
    "level_0":"column_A",
    "level_1":"column_B",
    0:"correlation"
})

df_corr

Unnamed: 0,column_A,column_B,correlation
0,tempogram-mean-0,tempogram-mean-0,
1,tempogram-mean-0,tempogram-mean-1,
2,tempogram-mean-0,tempogram-mean-2,
3,tempogram-mean-0,tempogram-mean-3,
4,tempogram-mean-0,tempogram-mean-4,
...,...,...,...
366020,rms-mean-0,tonnetz-mean-2,-0.253646
366021,rms-mean-0,tonnetz-mean-3,0.153998
366022,rms-mean-0,tonnetz-mean-4,0.230621
366023,rms-mean-0,tonnetz-mean-5,0.094825


In [9]:
df_corr.column_A.unique().shape

(605,)

- Pegar correlações maiores que 0.9

In [10]:
df_corr = df_corr.query('correlation >= 0.9')
df_corr

Unnamed: 0,column_A,column_B,correlation
606,tempogram-mean-1,tempogram-mean-1,1.000000
607,tempogram-mean-1,tempogram-mean-2,0.952074
1211,tempogram-mean-2,tempogram-mean-1,0.952074
1212,tempogram-mean-2,tempogram-mean-2,1.000000
1213,tempogram-mean-2,tempogram-mean-3,0.932159
...,...,...,...
363600,tonnetz-mean-2,tonnetz-mean-2,1.000000
364206,tonnetz-mean-3,tonnetz-mean-3,1.000000
364812,tonnetz-mean-4,tonnetz-mean-4,1.000000
365418,tonnetz-mean-5,tonnetz-mean-5,1.000000


In [11]:
df_corr.column_A.unique().shape

(604,)

- Colunas com correlação superior a 0.9

In [12]:
columns = df_corr.column_A.unique()

- Pegar X e y 

In [13]:
X = df[columns]
y = df['classes']

#### Teste 1

- Arvore de decisão

In [15]:
score = cross_val_score(DecisionTreeClassifier(), X, y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 11.0%


- Random Forest

In [17]:
score = cross_val_score(RandomForestClassifier(), X, y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 18.0%


- Regressão Logistica

In [110]:
score = cross_val_score(LogisticRegression(), X, y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 16.0%


- knn

In [18]:
select_k = defaultdict(list)

for i in range(1, 100):
    score = cross_val_score(KNeighborsClassifier(n_neighbors=i), X, y, cv=kfold, scoring="accuracy")
    
    select_k['k'].append(i)
    select_k['acuracia'].append(round(np.mean(score), 2) * 100)

df_select_k = pd.DataFrame(select_k)
df_select_k.sort_values(by='acuracia', ascending=False)

Unnamed: 0,k,acuracia
0,1,16.0
1,2,10.0
2,3,9.0
3,4,8.0
4,5,8.0
...,...,...
75,76,0.0
76,77,0.0
72,73,0.0
74,75,0.0


- XGBoost

In [19]:
score = cross_val_score(XGBClassifier(), X, y,  scoring='accuracy', cv=kfold)



In [20]:
print(f'score: {round(np.mean(score), 2) * 100}%')

score: 20.0%


- LGBM

In [21]:
score = cross_val_score(LGBMClassifier(), X, y, scoring='accuracy', cv=kfold)
print(f'score: {round(np.mean(score), 2) * 100}%')

score: 19.0%


## Normalização de features

### Teste 2

In [22]:
scaler = MinMaxScaler(feature_range=(0, 1))

X_norm = scaler.fit_transform(X)

- Regressão Logitica

In [23]:
score = cross_val_score(LogisticRegression(), X_norm, y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 17.0%


- KNN

In [24]:
select_k = defaultdict(list)

for i in range(1, 100):
    score = cross_val_score(KNeighborsClassifier(n_neighbors=i), X_norm, y, cv=kfold, scoring="accuracy")
    
    select_k['k'].append(i)
    select_k['acuracia'].append(round(np.mean(score), 2) * 100)

df_select_k = pd.DataFrame(select_k)
df_select_k.sort_values(by='acuracia', ascending=False)

Unnamed: 0,k,acuracia
0,1,13.0
6,7,9.0
2,3,9.0
3,4,9.0
7,8,8.0
...,...,...
68,69,0.0
69,70,0.0
70,71,0.0
71,72,0.0


- Arvore de Decisão

In [25]:
score = cross_val_score(DecisionTreeClassifier(random_state=42), X_norm, y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 9.0%


- LGBM

In [26]:
score = cross_val_score(LGBMClassifier(), X_norm, y, scoring='accuracy', cv=kfold)
print(f'score: {round(np.mean(score), 2) * 100}%')

score: 19.0%


- Random Forest

In [27]:
score = cross_val_score(RandomForestClassifier(), X_norm, y, cv=kfold, scoring="accuracy")
print(f'score: {round(np.mean(score), 2) * 100}%')

score: 16.0%


# Tuning Hiperparamentros

- Arvore de Decisão

In [58]:
parametros = {
    "criterion": Categorical(['gini','entropy']),
    "max_depth": Integer(2, 17),
    "max_features": Categorical(['auto', 'sqrt', 'log2']),
    "min_samples_split": Integer(2, 10),
    "min_samples_leaf": Integer(1, 10)
    
}

decision_tree = DecisionTreeClassifier()

decision_tree_search = BayesSearchCV(
    decision_tree,
    parametros,
    n_iter=32,
    n_jobs=-1,
    cv=5,
    scoring='accuracy'
)

decision_tree_search.fit(X, y)

print(f'Acuracy: { decision_tree_search.best_score_ * 100}\n')
print(decision_tree_search.best_estimator_)
print(f'\n{ decision_tree_search.best_params_ }')

Acuracy: 13.236574746008708

DecisionTreeClassifier(max_depth=17, max_features='auto', min_samples_leaf=2,
                       min_samples_split=3)

OrderedDict([('criterion', 'gini'), ('max_depth', 17), ('max_features', 'auto'), ('min_samples_leaf', 2), ('min_samples_split', 3)])


- Random Foreste

In [33]:
parametros = {
    "criterion": Categorical(['gini','entropy']),
    "max_depth": Integer(6, 20),
    "min_samples_split": Integer(2, 10),
    "min_samples_leaf": Integer(2, 10),
    "max_features": Categorical(['auto', 'sqrt','log2']), 
    "bootstrap": Categorical([True, False]),
    "n_estimators": Integer(100, 500)
}

random_forest = RandomForestClassifier()

forest_bayes_search = BayesSearchCV(
    random_forest,
    parametros,
    n_iter=32,
    n_jobs=-1,
    cv=5,
    scoring='accuracy'
)

forest_bayes_search.fit(X, y)

BayesSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=32, n_jobs=-1,
              scoring='accuracy',
              search_spaces={'bootstrap': Categorical(categories=(True, False), prior=None),
                             'criterion': Categorical(categories=('gini', 'entropy'), prior=None),
                             'max_depth': Integer(low=6, high=20, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('auto', 'sqrt', 'log2'), prior=None),
                             'min_samples_leaf': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize')})

In [35]:
print(f'Acuracy: { forest_bayes_search.best_score_ * 100}\n')
print(forest_bayes_search.best_estimator_)
print(f'\n{ forest_bayes_search.best_params_ }')

Acuracy: 24.985486211901303

RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_leaf=2)

OrderedDict([('bootstrap', False), ('criterion', 'gini'), ('max_depth', 20), ('max_features', 'auto'), ('min_samples_leaf', 2), ('min_samples_split', 2), ('n_estimators', 100)])


- MLP

In [None]:
parametros = {
    'activation' : Categorical(['identity','logistic','tanh','relu']),
    'hidden_layer_sizes': Integer(20, 100)
}


mlp_bayes_search = BayesSearchCV(
    MLPClassifier(),
    parametros,
    n_iter=32,
    n_jobs=-1,
    cv=5,
    scoring='accuracy'
)

mlp_bayes_search.fit(X, y)

In [118]:
print(f'Acuracy: { mlp_bayes_search.best_score_ * 100}\n')
print(mlp_bayes_search.best_estimator_)
print(f'\n{ mlp_bayes_search.best_params_ }')

Acuracy: 17.801161103047896

MLPClassifier(hidden_layer_sizes=100)

OrderedDict([('activation', 'relu'), ('hidden_layer_sizes', 100)])


# Recursive Feature Elimination - (RFE)

### Teste 3

In [None]:
clf = DecisionTreeClassifier(max_depth=17,
                             max_features='auto', 
                             min_samples_leaf=2,
                             min_samples_split=3)
                               
estimators = RFECV(estimator=clf, scoring='accuracy')

# fit rfecv to data
rfecv_data = estimators.fit(X, y)

# get the feature ranking
ranking_features = rfecv_data.ranking_
# print(ranking_features)

support_features = rfecv_data.support_
# print(support_features)

In [87]:
features_selects = X.columns[support_features]

- Arvore de decisão

In [100]:
clf = DecisionTreeClassifier(max_depth=17,
                             max_features='auto', 
                             min_samples_leaf=2,
                             min_samples_split=3)

score = cross_val_score(clf, X[features_selects], y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 12.0%


- knn

In [117]:
select_k = defaultdict(list)

for i in range(1, 100):
    score = cross_val_score(KNeighborsClassifier(n_neighbors=i), X[features_selects], y, cv=kfold, scoring="accuracy")
    
    select_k['k'].append(i)
    select_k['acuracia'].append(round(np.mean(score), 2) * 100)

df_select_k = pd.DataFrame(select_k)
df_select_k.sort_values(by='acuracia', ascending=False)

Unnamed: 0,k,acuracia
0,1,16.0
1,2,10.0
2,3,9.0
6,7,8.0
3,4,8.0
...,...,...
53,54,1.0
75,76,0.0
74,75,0.0
72,73,0.0


- LGBM

In [101]:
score = cross_val_score(LGBMClassifier(), X[features_selects], y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 17.0%


- Random Forest

In [102]:
clf = RandomForestClassifier(
    bootstrap=False,
    criterion='gini',
    max_depth=20,
    max_features='auto',
    min_samples_split=2,
    min_samples_leaf=2,
    n_estimators=100
)

score = cross_val_score(clf, X[features_selects], y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 23.0%


- XGBOOST

In [103]:
score = cross_val_score(XGBClassifier(), X[features_selects], y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 19.0%


- MLP

In [119]:
mlp = MLPClassifier(hidden_layer_sizes=100, activation='relu')

score = cross_val_score(mlp, X[features_selects], y, cv=kfold, scoring="accuracy")
print(f'score: { round(np.mean(score), 2) * 100}%')

score: 15.0%
