# libs

In [53]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split
)

from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import( 
    confusion_matrix, accuracy_score, classification_report 
)

# Import Data

In [2]:
df = pd.read_csv('data/features_sonds_mean.csv')
df.head()

Unnamed: 0,arquivo,species,fourier_tempogram-mean-0,fourier_tempogram-mean-1,fourier_tempogram-mean-2,fourier_tempogram-mean-3,fourier_tempogram-mean-4,fourier_tempogram-mean-5,fourier_tempogram-mean-6,fourier_tempogram-mean-7,...,mfcc-mean-19,tonnetz-mean-0,tonnetz-mean-1,tonnetz-mean-2,tonnetz-mean-3,tonnetz-mean-4,tonnetz-mean-5,rms-mean-0,zcr-mean-0,rolloff-mean-0
0,132608,flammea,(251.53309631347656+0j),(-125.76673126220703-1.2681530714035034j),(0.00612641079351306+0.9206206202507019j),(-0.012888919562101364-0.3558160066604614j),(0.008662317879498005+0.23633289337158203j),(-0.002033413853496313-0.20240235328674316j),(0.0008582753362134099+0.158883735537529j),(-0.0027650517877191305-0.11661908775568008j),...,3.678343,-0.006485,0.004259,-0.005751,0.00368,-0.00269,9e-06,0.010133,0.156673,10971.990206
1,132611,flammea,(259.70166015625+0j),(-129.85952758789062-3.4507150650024414j),(0.023019367828965187+2.1413066387176514j),(-0.01967422291636467-0.951308012008667j),(0.01633555442094803+0.7417126893997192j),(-0.022033946588635445-0.4451695382595062j),(0.020223163068294525+0.4447576105594635j),(-0.020403189584612846-0.4313928484916687j),...,6.539614,0.002514,-0.022335,-0.011952,0.058717,0.016824,-0.005542,0.013791,0.144492,11565.475525
2,35068,flammea,(190.72215270996094+0j),(-95.39623260498047-6.514892578125j),(0.07834384590387344+3.796746253967285j),(-0.06624338775873184-1.527251958847046j),(-0.0030891471542418003+1.7203675508499146j),(0.018300199881196022-1.6186463832855225j),(0.04079245775938034+1.5020774602890015j),(-0.039292965084314346-1.3524980545043945j),...,3.841586,0.004548,-0.009998,0.048268,-0.01549,-0.011625,-0.01095,0.010008,0.236547,11098.715496
3,82715,palustris,(323.50115966796875+0j),(-161.74093627929688-7.180373668670654j),(-0.0034139330964535475+2.821110486984253j),(-0.010709281079471111-0.30385562777519226j),(0.01010955311357975+0.538983166217804j),(-0.01608879491686821-0.3566131591796875j),(0.018824022263288498+0.2952796220779419j),(-0.006671360228210688-0.17688553035259247j),...,2.818051,0.003534,7.7e-05,0.016498,-0.00728,-0.005929,-0.001681,0.015728,0.235209,9576.714957
4,64685,palustris,(282.1729736328125+0j),(-141.0712127685547-4.479115962982178j),(-0.017938779667019844+2.4994139671325684j),(-0.0047923000529408455-0.6844111084938049j),(0.022898687049746513+0.4950515031814575j),(-0.03356371447443962-0.2774132788181305j),(0.026475373655557632+0.21031554043293j),(-0.005907224491238594-0.1529945284128189j),...,9.404188,0.005944,-0.007011,0.028142,-0.019186,0.0015,0.000639,0.011937,0.206531,9820.667665


# Baseline

- Pegar colunas numericas

In [3]:
numeric_cols = [cname for cname in df.columns if df[cname].dtype in ['float']]

- Criar labels numericas para coluna species  

In [4]:
le = LabelEncoder()

df['classes'] = le.fit_transform(df['species'])

In [58]:
df.species

0         flammea
1         flammea
2         flammea
3       palustris
4       palustris
          ...    
259    philomelos
260    philomelos
261      vanellus
262      vanellus
263      vanellus
Name: species, Length: 264, dtype: object

In [5]:
df[["classes", "species"]]

Unnamed: 0,classes,species
0,36,flammea
1,36,flammea
2,36,flammea
3,57,palustris
4,57,palustris
...,...,...
259,59,philomelos
260,59,philomelos
261,82,vanellus
262,82,vanellus


- Definindo X e y 

In [59]:
X

Unnamed: 0,tempogram-mean-0,tempogram-mean-1,tempogram-mean-2,tempogram-mean-3,tempogram-mean-4,tempogram-mean-5,tempogram-mean-6,tempogram-mean-7,tempogram-mean-8,tempogram-mean-9,...,mfcc-mean-19,tonnetz-mean-0,tonnetz-mean-1,tonnetz-mean-2,tonnetz-mean-3,tonnetz-mean-4,tonnetz-mean-5,rms-mean-0,zcr-mean-0,rolloff-mean-0
0,1.0,0.947106,0.916835,0.917622,0.918789,0.916606,0.914797,0.915190,0.915148,0.912482,...,3.678343,-0.006485,0.004259,-0.005751,0.003680,-0.002690,0.000009,0.010133,0.156673,10971.990206
1,1.0,0.928961,0.863120,0.832358,0.815666,0.804090,0.808824,0.824190,0.839667,0.855506,...,6.539614,0.002514,-0.022335,-0.011952,0.058717,0.016824,-0.005542,0.013791,0.144492,11565.475525
2,1.0,0.871185,0.745585,0.694683,0.664319,0.650915,0.663918,0.691141,0.713935,0.734078,...,3.841586,0.004548,-0.009998,0.048268,-0.015490,-0.011625,-0.010950,0.010008,0.236547,11098.715496
3,1.0,0.754161,0.538215,0.502754,0.491417,0.504126,0.560820,0.549658,0.502624,0.490642,...,2.818051,0.003534,0.000077,0.016498,-0.007280,-0.005929,-0.001681,0.015728,0.235209,9576.714957
4,1.0,0.821789,0.671557,0.644815,0.664769,0.675007,0.669493,0.679025,0.718727,0.732311,...,9.404188,0.005944,-0.007011,0.028142,-0.019186,0.001500,0.000639,0.011937,0.206531,9820.667665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,1.0,0.890840,0.799685,0.783629,0.780275,0.779956,0.784945,0.788965,0.791128,0.793413,...,2.510516,0.012072,0.019389,0.024667,0.007262,0.010036,-0.001262,0.030607,0.163760,6243.274109
260,1.0,0.909698,0.833969,0.821334,0.817004,0.809187,0.804778,0.805532,0.809391,0.811338,...,0.490833,0.006474,0.005587,0.008275,-0.018651,-0.001811,-0.001125,0.006916,0.143330,6246.110972
261,1.0,0.902936,0.818450,0.804781,0.803276,0.804081,0.798059,0.790607,0.782913,0.780841,...,-3.575015,0.001898,-0.003764,-0.031344,-0.010915,-0.002464,-0.000296,0.007711,0.113148,5100.927231
262,1.0,0.937399,0.893008,0.890007,0.888924,0.884784,0.880242,0.875642,0.871638,0.871033,...,-0.909196,0.004315,0.000150,-0.007994,-0.022980,0.001628,0.000102,0.010486,0.171336,9325.487130


In [60]:
y

0      36
1      36
2      36
3      57
4      57
       ..
259    59
260    59
261    82
262    82
263    82
Name: classes, Length: 264, dtype: int64

In [50]:
X = df[numeric_cols]
y = df["classes"]

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=7)

xgb = XGBClassifier(n_estimators=100)

xgb.fit(X_train, y_train)

predict_xgb = xgb.predict(X_test)

print(classification_report(y_test, predict_xgb))
print(f'Acuracia: {accuracy_score(y_test, predict_xgb).round(3) * 100}%')

- Validação cruzada

In [None]:
cross_val_score(LogisticRegression(random_state=42), X, y, cv=kfold, scoring='accuracy')

In [None]:
metrics = ["precision", "recall", "accuracy"]
cross_validate(LogisticRegression(random_state=42), X, y, cv=kfold, scoring=metrics)

# Feature Selection

### Por Correlação

In [9]:
matrix_corr = df.corr()

- Transformando matrix em df de correlação 

In [135]:
df_corr = matrix_corr.unstack().reset_index()
df_corr = df_corr.rename(columns={
    "level_0":"column_A",
    "level_1":"column_B",
    0:"correlation"
})

- filtrnado colunas por correlação

In [139]:
df_corr = df_corr.query('correlation >= 0.8')

- salvando as colunas com correlação superior a 0.8

In [140]:
columns = np.append(df_corr.column_A.unique(), df_corr.column_B.unique())

columns = np.unique(columns)

columns.shape

(577,)

In [141]:
df[columns]

Unnamed: 0,arquivo,chroma_cens-mean-0,chroma_cens-mean-1,chroma_cens-mean-10,chroma_cens-mean-11,chroma_cens-mean-2,chroma_cens-mean-3,chroma_cens-mean-4,chroma_cens-mean-5,chroma_cens-mean-6,...,tempogram-mean-97,tempogram-mean-98,tempogram-mean-99,tonnetz-mean-0,tonnetz-mean-1,tonnetz-mean-2,tonnetz-mean-3,tonnetz-mean-4,tonnetz-mean-5,zcr-mean-0
0,132608,0.290500,0.302208,0.271866,0.279251,0.295448,0.302315,0.282176,0.303334,0.272935,...,0.588538,0.583231,0.576977,-0.006485,0.004259,-0.005751,0.003680,-0.002690,0.000009,0.156673
1,132611,0.273805,0.272782,0.253636,0.257318,0.219711,0.304515,0.429938,0.333695,0.253430,...,0.522135,0.517262,0.511741,0.002514,-0.022335,-0.011952,0.058717,0.016824,-0.005542,0.144492
2,35068,0.253587,0.154386,0.418276,0.435017,0.152453,0.186813,0.193586,0.230531,0.229312,...,0.420938,0.408788,0.402414,0.004548,-0.009998,0.048268,-0.015490,-0.011625,-0.010950,0.236547
3,82715,0.255200,0.183679,0.358269,0.349176,0.203586,0.243583,0.272173,0.294603,0.287688,...,0.334452,0.337975,0.337200,0.003534,0.000077,0.016498,-0.007280,-0.005929,-0.001681,0.235209
4,64685,0.273586,0.223561,0.385862,0.372384,0.234366,0.246436,0.227304,0.213006,0.243381,...,0.460957,0.442027,0.433649,0.005944,-0.007011,0.028142,-0.019186,0.001500,0.000639,0.206531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,132392,0.282924,0.296130,0.206657,0.226453,0.305719,0.301976,0.281181,0.264343,0.268254,...,0.516000,0.510879,0.507147,0.012072,0.019389,0.024667,0.007262,0.010036,-0.001262,0.163760
260,146756,0.237675,0.254253,0.253787,0.248985,0.308612,0.307744,0.304431,0.307591,0.294878,...,0.527521,0.520667,0.514967,0.006474,0.005587,0.008275,-0.018651,-0.001811,-0.001125,0.143330
261,27060,0.269157,0.272067,0.350235,0.299481,0.255183,0.196367,0.178329,0.185498,0.202449,...,0.519205,0.511951,0.507742,0.001898,-0.003764,-0.031344,-0.010915,-0.002464,-0.000296,0.113148
262,97982,0.230994,0.230331,0.323198,0.285062,0.248523,0.253347,0.266463,0.284847,0.295366,...,0.563089,0.558310,0.553380,0.004315,0.000150,-0.007994,-0.022980,0.001628,0.000102,0.171336
