In [1]:
SCRIPT_DIR = os.path.dirname('../src/')
sys.path.append(os.path.dirname(SCRIPT_DIR))

In [2]:
import warnings
import numpy as np
import pandas as pd 

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from src.wrapped import Wrapped

In [3]:
warnings.filterwarnings("ignore")

wp = Wrapped(
    '../data/row/',
    '../data/processed/',
    '../data/files/'
)

# Importando dados para treinamento

In [4]:
df = wp.load_data('df_instrumentos_features_selecionadas')
df

Unnamed: 0,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,chroma8,chroma9,chroma10,...,spectral_band4,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name
0,0.513887,0.581963,0.434641,0.293949,0.317826,0.271148,0.548258,0.660767,0.349147,0.252198,...,3409.640722,2600.055158,1.098243,5260.821321,-44.640682,-30.202150,0.149672,33095,accordion,000139_119040.ogg
1,0.175587,0.429345,0.840043,0.384524,0.282545,0.346341,0.482365,0.321750,0.299794,0.450477,...,3516.595715,2385.271420,1.360742,5225.124166,-46.575737,-59.323132,0.112181,24785,accordion,000145_172800.ogg
2,0.453814,0.219808,0.228484,0.446050,0.178916,0.250981,0.327460,0.333150,0.089059,0.103378,...,3256.491175,2069.850731,1.343197,4287.155824,-56.276707,-71.512909,0.096007,21230,accordion,000201_168960.ogg
3,0.209729,0.338755,0.660264,0.256625,0.275209,0.180417,0.399448,0.381560,0.275192,0.380790,...,3855.791675,2903.979876,1.218963,6751.133809,-42.723160,-44.649315,0.144010,31836,accordion,000212_211200.ogg
4,0.484400,0.334572,0.441629,0.679485,0.382958,0.326033,0.205775,0.309727,0.388397,0.299227,...,2883.225952,1251.374287,1.339527,2263.034730,-62.227947,-78.196365,0.059804,13214,accordion,000640_49920.ogg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8471,0.332754,0.359639,0.473210,0.512169,0.607755,0.500391,0.378782,0.420053,0.423016,0.570144,...,3664.812127,3194.922838,1.253166,6714.437434,-53.587788,-76.614616,0.134979,29855,voice,153341_184320.ogg
8472,0.161793,0.190661,0.249882,0.378161,0.461520,0.332309,0.294171,0.251455,0.106215,0.089914,...,2879.597305,1351.326334,1.450944,2412.643029,-52.071827,-76.029976,0.061211,13533,voice,153412_134400.ogg
8473,0.589552,0.309548,0.187583,0.169132,0.240637,0.332375,0.333517,0.585136,0.280368,0.236353,...,3274.875402,2685.831528,1.274003,4614.225695,-66.120743,-80.000000,0.157649,34869,voice,153478_30720.ogg
8474,0.615495,0.541314,0.467334,0.611199,0.791296,0.678383,0.508162,0.543337,0.463837,0.352893,...,3376.494191,1428.726581,1.081204,3358.205447,-30.260586,-13.068514,0.032283,7136,voice,153986_203520.ogg


- Algumas informações

In [5]:
print('Dimensões do dataframe', df.shape)
print('Total de instrumentos na base original:', len(df.instrumento.unique()))
df.instrumento.value_counts()

Dimensões do dataframe (8476, 32)
Total de instrumentos na base original: 20


mallet_percussion    719
flute                654
piano                628
organ                611
saxophone            539
accordion            511
bass                 508
cymbals              503
cello                481
violin               469
voice                452
ukulele              389
banjo                375
synthesizer          375
guitar               347
drums                336
trombone             196
mandolin             163
clarinet             152
trumpet               68
Name: instrumento, dtype: int64

# Preprocessamento

In [6]:
le = LabelEncoder()

df['labels'] = le.fit_transform(df.instrumento)

# Treinando modelos

In [7]:
X = df.drop(columns=['labels', 'file_name'])
y = df['labels']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
instrumentos_test = X_test['instrumento']
proporcao_dados_instrumentos_treino = X_train.instrumento.value_counts()

X_train = X_train.drop(columns=['instrumento'])
X_test = X_test.drop(columns=['instrumento'])

- Baseline

In [10]:
models = np.array([KNeighborsClassifier(),DecisionTreeClassifier(), RandomForestClassifier(), LGBMClassifier(),MLPClassifier()])

In [11]:
for model in models:
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print(f'Acuracia {model}: {accuracy_score(y_test, predict).round(2) * 100}%')
    

Acuracia KNeighborsClassifier(): 15.0%
Acuracia DecisionTreeClassifier(): 18.0%
Acuracia RandomForestClassifier(): 34.0%
Acuracia LGBMClassifier(): 34.0%
Acuracia MLPClassifier(): 15.0%


# Vamos analisar esses resultados

In [12]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
predict = model.predict(X_test)
print(f'Acuracia Random Forest: {accuracy_score(y_test, predict).round(2) * 100}%')

Acuracia Random Forest: 33.0%


In [13]:
# criar um dataframe de gabarito para auxiliar nas nossas analises
gabarito = X_test
gabarito['labels'] = y_test
gabarito['instrumentos'] = instrumentos_test

# filtrando os acertos
acertos = gabarito[predict == gabarito.labels]

# filtrando os erros
erros = gabarito[predict != gabarito.labels]

- Acerto total

In [14]:
print('Total de acertos: ', acertos.shape[0])

Total de acertos:  841


- Total de instrumentos indentificados

In [15]:
len(acertos.instrumentos.unique())

18

- Instrumentos não indentificados

mandolin e clarinete

- Proporção de dados por instrumentos usado no treino

In [16]:
proporcao_dados_instrumentos_treino

mallet_percussion    491
piano                449
flute                447
organ                437
saxophone            383
bass                 364
cymbals              360
accordion            335
violin               334
cello                316
voice                309
guitar               268
ukulele              261
banjo                258
synthesizer          256
drums                243
trombone             145
mandolin             117
clarinet             108
trumpet               52
Name: instrumento, dtype: int64

- Proporção de instrumentos no Test

In [17]:
gabarito.instrumentos.value_counts()

mallet_percussion    228
flute                207
piano                179
accordion            176
organ                174
cello                165
saxophone            156
bass                 144
voice                143
cymbals              143
violin               135
ukulele              128
synthesizer          119
banjo                117
drums                 93
guitar                79
trombone              51
mandolin              46
clarinet              44
trumpet               16
Name: instrumentos, dtype: int64

- Proporção de acertos por instrumentos

In [18]:
acertos.instrumentos.value_counts()

piano                117
flute                 91
cymbals               79
bass                  76
cello                 71
saxophone             67
violin                46
mallet_percussion     45
accordion             45
organ                 43
voice                 40
drums                 37
banjo                 23
ukulele               23
guitar                19
synthesizer           17
trumpet                1
trombone               1
Name: instrumentos, dtype: int64

- Proporção de erros por instrumentos

In [19]:
erros.instrumentos.value_counts()

mallet_percussion    183
accordion            131
organ                131
flute                116
ukulele              105
voice                103
synthesizer          102
banjo                 94
cello                 94
saxophone             89
violin                89
bass                  68
cymbals               64
piano                 62
guitar                60
drums                 56
trombone              50
mandolin              46
clarinet              44
trumpet               15
Name: instrumentos, dtype: int64