In [2]:
SCRIPT_DIR = os.path.dirname('../src/')
sys.path.append(os.path.dirname(SCRIPT_DIR))

In [3]:
import warnings
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import  class_weight

from pyod.models.knn import KNN 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from src.wrapped import Wrapped

In [4]:
warnings.filterwarnings("ignore")

wp = Wrapped(
    '../data/row/',
    '../data/processed/',
    '../data/files/'
)

# Importando dados para treinamento

In [5]:
df = wp.load_data('df_instrumentos_features_selecionadas')
df

Unnamed: 0,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,chroma8,chroma9,chroma10,...,spectral_band4,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name
0,0.513887,0.581963,0.434641,0.293949,0.317826,0.271148,0.548258,0.660767,0.349147,0.252198,...,3409.640722,2600.055158,1.098243,5260.821321,-44.640682,-30.202150,0.149672,33095,accordion,000139_119040.ogg
1,0.175587,0.429345,0.840043,0.384524,0.282545,0.346341,0.482365,0.321750,0.299794,0.450477,...,3516.595715,2385.271420,1.360742,5225.124166,-46.575737,-59.323132,0.112181,24785,accordion,000145_172800.ogg
2,0.453814,0.219808,0.228484,0.446050,0.178916,0.250981,0.327460,0.333150,0.089059,0.103378,...,3256.491175,2069.850731,1.343197,4287.155824,-56.276707,-71.512909,0.096007,21230,accordion,000201_168960.ogg
3,0.209729,0.338755,0.660264,0.256625,0.275209,0.180417,0.399448,0.381560,0.275192,0.380790,...,3855.791675,2903.979876,1.218963,6751.133809,-42.723160,-44.649315,0.144010,31836,accordion,000212_211200.ogg
4,0.484400,0.334572,0.441629,0.679485,0.382958,0.326033,0.205775,0.309727,0.388397,0.299227,...,2883.225952,1251.374287,1.339527,2263.034730,-62.227947,-78.196365,0.059804,13214,accordion,000640_49920.ogg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8471,0.332754,0.359639,0.473210,0.512169,0.607755,0.500391,0.378782,0.420053,0.423016,0.570144,...,3664.812127,3194.922838,1.253166,6714.437434,-53.587788,-76.614616,0.134979,29855,voice,153341_184320.ogg
8472,0.161793,0.190661,0.249882,0.378161,0.461520,0.332309,0.294171,0.251455,0.106215,0.089914,...,2879.597305,1351.326334,1.450944,2412.643029,-52.071827,-76.029976,0.061211,13533,voice,153412_134400.ogg
8473,0.589552,0.309548,0.187583,0.169132,0.240637,0.332375,0.333517,0.585136,0.280368,0.236353,...,3274.875402,2685.831528,1.274003,4614.225695,-66.120743,-80.000000,0.157649,34869,voice,153478_30720.ogg
8474,0.615495,0.541314,0.467334,0.611199,0.791296,0.678383,0.508162,0.543337,0.463837,0.352893,...,3376.494191,1428.726581,1.081204,3358.205447,-30.260586,-13.068514,0.032283,7136,voice,153986_203520.ogg


- Algumas informações

In [5]:
print('Dimensões do dataframe', df.shape)
print('Total de instrumentos na base original:', len(df.instrumento.unique()))
df.instrumento.value_counts()

Dimensões do dataframe (8476, 32)
Total de instrumentos na base original: 20


mallet_percussion    719
flute                654
piano                628
organ                611
saxophone            539
accordion            511
bass                 508
cymbals              503
cello                481
violin               469
voice                452
ukulele              389
banjo                375
synthesizer          375
guitar               347
drums                336
trombone             196
mandolin             163
clarinet             152
trumpet               68
Name: instrumento, dtype: int64

# Preprocessamento

In [9]:
le = LabelEncoder()

df['labels'] = le.fit_transform(df.instrumento)

# Split

In [7]:
X = df.drop(columns=['labels', 'instrumento','file_name'])
y = df['labels']

# TESTE 1: Balanceamento

- 1° Opção: Balanceamento das classes com SMOTH

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
sm = SMOTE()
X_train, y_train  = sm.fit_resample(X_train, y_train)

In [11]:
models = np.array([KNeighborsClassifier(),
                   DecisionTreeClassifier(), 
                   RandomForestClassifier(), 
                   LGBMClassifier(),MLPClassifier()])


for model in models:
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print(f'Acuracia {model}: {accuracy_score(y_test, predict).round(2) * 100}%')
    

Acuracia KNeighborsClassifier(): 12.0%
Acuracia DecisionTreeClassifier(): 18.0%
Acuracia RandomForestClassifier(): 30.0%
Acuracia LGBMClassifier(): 33.0%
Acuracia MLPClassifier(): 18.0%


- Class Weight

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [13]:
#  Calculate weights using sklearn
sklearn_weights = class_weight.compute_class_weight(
    class_weight = 'balanced',
    classes = np.unique(y_train),
    y = y_train
)

sklearn_weights

array([0.88552239, 1.1498062 , 0.81497253, 0.93876582, 2.74675926,
       0.82402778, 1.22078189, 0.66364653, 1.10690299, 0.60417515,
       2.53547009, 0.67883295, 0.66069042, 0.77454308, 1.15878906,
       2.04586207, 5.70480769, 1.13659004, 0.88817365, 0.96003236])

In [15]:
models = np.array([
    KNeighborsClassifier(),
    DecisionTreeClassifier(class_weight= 'balanced'), 
    RandomForestClassifier(class_weight = 'balanced', random_state = 0, n_jobs = -1), 
    LogisticRegression(class_weight = 'balanced', random_state = 0, n_jobs = -1), 
    LGBMClassifier(class_weight= 'balanced'),
    MLPClassifier()
])


for model in models:
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print(f'Acuracia {model}: {accuracy_score(y_test, predict).round(2) * 100}%')

Acuracia KNeighborsClassifier(): 15.0%
Acuracia DecisionTreeClassifier(class_weight='balanced'): 18.0%
Acuracia RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0): 33.0%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=0): 17.0%
Acuracia LGBMClassifier(class_weight='balanced'): 34.0%
Acuracia MLPClassifier(): 17.0%


# TESTE 2: Remover Outlier

- Treinar modelo de classificação de outilier

In [12]:
detector_outilier = KNN()

new_df = df.drop(columns=['instrumento','file_name'])

detector_outilier.fit(new_df)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

- Checar numeros de outilers e não outilers<br/>
**0 (False) Não é outiler**<br/>
**1 (True) É outiler**

In [17]:
previsions = detector_outilier.labels_
np.unique(previsions, return_counts=True)

(array([0, 1]), array([7628,  848]))

- Distancia euclidiana dos registros

In [20]:
confiance_previsions = detector_outilier.decision_scores_
confiance_previsions

array([468.44818927, 261.53948376, 233.40525508, ..., 546.12167573,
       501.05261951, 244.9254647 ])

- Checar os outilers

In [21]:
outilers_id = []

# pegando o indice dos registros que são outilers 
for indice in range(len(previsions)):
    if previsions[indice] == 1:
        outilers_id.append(indice)

# outilers 
outilers = df.iloc[outilers_id,:]
outilers

Unnamed: 0,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,chroma8,chroma9,chroma10,...,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name,labels
28,0.446227,0.186154,0.310970,0.289374,0.480033,0.271526,0.262448,0.477942,0.288807,0.696978,...,3178.625677,1.504104,5658.636007,-64.840324,-79.991013,0.225914,49936,accordion,011547_157440.ogg,0
46,0.307783,0.359920,0.321503,0.340189,0.230577,0.230887,0.267941,0.397493,0.260939,0.225169,...,3317.306522,1.480623,6072.413242,-37.332355,-79.453758,0.249911,55267,accordion,019554_199680.ogg,0
78,0.193358,0.281352,0.774958,0.293572,0.316613,0.302946,0.372028,0.618087,0.326541,0.347372,...,2105.938025,1.162251,3689.097337,-49.523079,-69.013954,0.146228,32336,accordion,028377_42240.ogg,0
94,0.430660,0.239950,0.289489,0.156493,0.338560,0.262607,0.266463,0.478878,0.286081,0.352469,...,1835.825453,0.991421,3336.597303,-63.885098,-79.566399,0.125387,27748,accordion,032445_253440.ogg,0
101,0.365614,0.188457,0.210712,0.340728,0.218881,0.389993,0.138017,0.096201,0.430967,0.296048,...,2495.492458,1.521361,4334.344015,-64.275818,-79.999847,0.203175,44935,accordion,035590_49920.ogg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8408,0.223368,0.229950,0.303748,0.293760,0.263125,0.406126,0.506930,0.371646,0.349470,0.511544,...,3037.057890,1.260896,7336.652079,-53.212646,-69.924911,0.133118,29465,voice,128693_729600.ogg,19
8435,0.265711,0.153765,0.160796,0.181671,0.145868,0.225688,0.512277,0.147044,0.171986,0.231603,...,2272.639669,1.200361,3904.104621,-45.581455,-76.195732,0.212792,47053,voice,142108_149760.ogg,19
8453,0.484748,0.558405,0.458368,0.481970,0.493589,0.498251,0.470169,0.505856,0.606745,0.610374,...,1810.905441,1.761557,4142.593589,-45.505581,-29.093159,0.037906,8383,voice,147088_72960.ogg,19
8455,0.445147,0.216153,0.162184,0.219060,0.224821,0.265225,0.208129,0.126858,0.071191,0.232011,...,2759.989165,1.015649,6755.730224,-27.749546,-59.993065,0.164289,36301,voice,147642_53760.ogg,19


In [26]:
outilers.instrumento.value_counts()

flute                128
mallet_percussion    126
cymbals               86
synthesizer           83
violin                60
drums                 52
organ                 47
bass                  40
accordion             36
guitar                34
voice                 29
ukulele               20
banjo                 19
piano                 19
mandolin              16
clarinet              15
trombone              12
saxophone             11
trumpet                8
cello                  7
Name: instrumento, dtype: int64

- Agora vamos remover os outilers da nossa base

In [27]:
filter_id = []

# pegando indice dos registros que não são outilers
for indice in range(len(previsions)):
    if previsions[indice] == 0:
        filter_id.append(indice)

# removendo outilers  
df_train = df.iloc[filter_id,:]
df_train

Unnamed: 0,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,chroma8,chroma9,chroma10,...,spectral_centroid,spectral_onset,spectral_rolloff,spectogram,mel_spectogram,zero_crossing_rate,zero_crossing,instrumento,file_name,labels
0,0.513887,0.581963,0.434641,0.293949,0.317826,0.271148,0.548258,0.660767,0.349147,0.252198,...,2600.055158,1.098243,5260.821321,-44.640682,-30.202150,0.149672,33095,accordion,000139_119040.ogg,0
1,0.175587,0.429345,0.840043,0.384524,0.282545,0.346341,0.482365,0.321750,0.299794,0.450477,...,2385.271420,1.360742,5225.124166,-46.575737,-59.323132,0.112181,24785,accordion,000145_172800.ogg,0
2,0.453814,0.219808,0.228484,0.446050,0.178916,0.250981,0.327460,0.333150,0.089059,0.103378,...,2069.850731,1.343197,4287.155824,-56.276707,-71.512909,0.096007,21230,accordion,000201_168960.ogg,0
3,0.209729,0.338755,0.660264,0.256625,0.275209,0.180417,0.399448,0.381560,0.275192,0.380790,...,2903.979876,1.218963,6751.133809,-42.723160,-44.649315,0.144010,31836,accordion,000212_211200.ogg,0
4,0.484400,0.334572,0.441629,0.679485,0.382958,0.326033,0.205775,0.309727,0.388397,0.299227,...,1251.374287,1.339527,2263.034730,-62.227947,-78.196365,0.059804,13214,accordion,000640_49920.ogg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8470,0.431032,0.433562,0.439329,0.472623,0.436766,0.458280,0.480054,0.507706,0.550173,0.567656,...,2542.887346,1.234712,5289.648834,-53.747391,-70.919098,0.141731,31319,voice,151680_126720.ogg,19
8472,0.161793,0.190661,0.249882,0.378161,0.461520,0.332309,0.294171,0.251455,0.106215,0.089914,...,1351.326334,1.450944,2412.643029,-52.071827,-76.029976,0.061211,13533,voice,153412_134400.ogg,19
8473,0.589552,0.309548,0.187583,0.169132,0.240637,0.332375,0.333517,0.585136,0.280368,0.236353,...,2685.831528,1.274003,4614.225695,-66.120743,-80.000000,0.157649,34869,voice,153478_30720.ogg,19
8474,0.615495,0.541314,0.467334,0.611199,0.791296,0.678383,0.508162,0.543337,0.463837,0.352893,...,1428.726581,1.081204,3358.205447,-30.260586,-13.068514,0.032283,7136,voice,153986_203520.ogg,19


- Treinar modelos sem outlier

In [29]:
# definindo o target
X = df_train.drop(columns=['labels', 'instrumento','file_name'])
y = df_train['labels']

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

#  Calculate weights using sklearn
sklearn_weights = class_weight.compute_class_weight(
    class_weight = 'balanced',
    classes = np.unique(y_train),
    y = y_train
)

# models
models = np.array([
    KNeighborsClassifier(),
    DecisionTreeClassifier(class_weight= 'balanced'), 
    RandomForestClassifier(class_weight = 'balanced', random_state = 0, n_jobs = -1), 
    LogisticRegression(class_weight = 'balanced', random_state = 0, n_jobs = -1), 
    LGBMClassifier(class_weight= 'balanced'),
    MLPClassifier()
])

# treinamento
for model in models:
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print(f'Acuracia {model}: {accuracy_score(y_test, predict).round(2) * 100}%')
    

Acuracia KNeighborsClassifier(): 13.0%
Acuracia DecisionTreeClassifier(class_weight='balanced'): 17.0%
Acuracia RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0): 31.0%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=0): 19.0%
Acuracia LGBMClassifier(class_weight='balanced'): 33.0%
Acuracia MLPClassifier(): 9.0%
