# Book 2 - Selecionamento de Features, Validação, Balanceamento

Este notebook serviu como registro prático e teórico no meu aprendizado de Machine Learning.

`Enriqueci este notebook com anotações adicionais e aplicações práticas tornando-o uma referência valiosa para consultas e implementações em futuros projetos reais.`

Espero que este material inspire outros a explorar ainda mais o fascinante mundo do Machine Learning. 

No notebook presente tem todos os topicos dos notebook anteriores, porém sendo acrescentado e aprofundado com anotações dos seguintes tópicos:

**Técnicas de Balanceamento de Dados**  
- **Oversampling - Upsampling**: Criação de dados sintéticos.
- **Undersampling - Downsampling**: Redução de amostras na classe majoritária.

Compartilhar conhecimento é uma alegria—viva ao aprendizado contínuo, boa pratica e bons estudo a quem estiver lendo, abraços!

# Funções, bibliotecas e Dataframe ficticios

In [29]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import matplotlib.font_manager as fm #para alterar as fontes dos graficos
import seaborn as sns

import pyperclip

# Definindo o estilo de fonte e tema
fonte = fm.FontProperties(family="Calibri", style="italic")

sns.set_style('whitegrid') #cor da grade(fundo)

In [30]:
# Manipulação e Tratamento de dados
import openpyxl
import pandas as pd
import numpy as np
from numpy import NaN

#ignorando Warning inuteis
import warnings 
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [31]:
import requests

# CARREGANDO BASE
arquivo = 'fake_database'
url = "https://raw.githubusercontent.com/GabrielGabes/functions_gsa/main/" + arquivo + ".py"
print(url)
#response = requests.get(url); code = response.text; exec(code)
#df = fake_database2(2250); df_bck = df.copy(); display(df.head())

arquivo = 'funcoes_estatisticas'
url = "https://raw.githubusercontent.com/GabrielGabes/functions_gsa/main/" + arquivo + ".py"
print(url)
response = requests.get(url); code = response.text; exec(code)

# Função para avaliação de modelos exibindo metricas de avaliação
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, RocCurveDisplay
arquivo = 'ML_supervised_learning'
url = "https://raw.githubusercontent.com/GabrielGabes/functions_gsa/main/" + arquivo + ".py"
print(url); response = requests.get(url); code = response.text; exec(code)

print('TUDO OK')

https://raw.githubusercontent.com/GabrielGabes/functions_gsa/main/fake_database.py
https://raw.githubusercontent.com/GabrielGabes/functions_gsa/main/funcoes_estatisticas.py
https://raw.githubusercontent.com/GabrielGabes/functions_gsa/main/ML_supervised_learning.py
TUDO OK


# Criando dataframe sintetico desbalanceado

In [32]:
from sklearn.datasets import make_classification

# Gerar dados sintéticos desbalanceados
x, y = make_classification(n_samples=7346, weights=[0.95, 0.05],  
                           n_features=10, # qtd total de features
                           n_informative=6, # features realmente uteis para o modelo
                           n_redundant=3, # features que diz praticamente oque uma outra já fim
                           n_classes=2, 
                           n_clusters_per_class=3,  # Subgrupos
                           class_sep=0.125,
                           random_state=3141592)

# Criar um DataFrame
df = pd.DataFrame(x, columns=[f'feature_{i}' for i in range(x.shape[1])])
df['target'] = y

# Ver a distribuição de classes
print(df['target'].value_counts(normalize=True))
print(df['target'].value_counts())

display(pd.concat([df.head(5), df.tail(5)]).T)
# Salvar para análise
#df.to_csv('imbalanced_dataset.csv', index=False)

target
0    0.945685
1    0.054315
Name: proportion, dtype: float64
target
0    6947
1     399
Name: count, dtype: int64


Unnamed: 0,0,1,2,3,4,7341,7342,7343,7344,7345
feature_0,-1.04368,0.024369,0.615455,-0.085438,-1.950661,-0.517,-2.692529,-1.076198,-0.173833,0.051514
feature_1,-1.990359,1.052022,-0.135408,-0.087211,-0.684208,0.062871,-2.081917,-0.133125,-0.621593,1.548708
feature_2,-1.616674,0.669025,5.117934,1.50805,-2.998303,0.752382,-0.332134,0.717059,0.298679,-1.236423
feature_3,0.514712,-1.276948,-3.700463,-1.036884,1.793797,0.061186,0.162031,0.102429,-2.520496,3.029865
feature_4,-0.294449,-0.6292,-3.071781,-0.348713,0.13864,0.112934,-4.167281,-3.733083,0.181639,2.231804
feature_5,-0.168076,-0.723463,-0.870718,1.042102,0.215236,1.553327,-1.498722,-2.430186,0.422883,0.434388
feature_6,-0.7372,1.141347,0.541557,1.173786,-0.266201,0.655819,0.044746,-0.410455,2.391613,-4.43613
feature_7,2.205685,-2.237312,0.96147,-0.504714,-1.60726,-1.007727,-2.162316,-2.942299,0.249525,1.747591
feature_8,-0.822867,2.075696,1.130719,-0.408578,-0.450379,-1.329416,0.797892,1.209964,-0.86704,-2.973186
feature_9,-2.207886,0.502049,-2.539167,0.28353,0.112774,1.045324,-3.580595,-2.11753,-0.597481,1.975224


# Tratando o DataFrame

In [33]:
grandom_state = 31415922

# Variavel Dependente
var_dep = 'target'
y = df[var_dep]
x = df.drop(var_dep, axis=1)

############################################################################################
# DUMMYRIZAÇÃO
colunas_categoricas = []
colunas_binarias = []
colunas_mais3_categorias = []
colunas_numericas = []

for coluna in x.columns:
    if df[coluna].dtype == 'O':
        colunas_categoricas.append(coluna)

        categorias = x[coluna].unique()
        if len(categorias) == 2:
            print('2 niveis:', coluna, '=>', categorias)
            colunas_binarias.append(coluna)
        else:
            print('3 niveis:', coluna, '=>', categorias)
            colunas_mais3_categorias.append(coluna)
    else:
        colunas_numericas.append(coluna)

############################################################################################
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder #transformando colunas com 2 categorias em 0 e 1

coluna = x.columns
one_hot = make_column_transformer((
    OneHotEncoder(drop='if_binary'), #caso a coluna tenha apenas 2 categorias 
    colunas_categoricas), #passando quais são essas colunas
    remainder = 'passthrough', sparse_threshold=0) #oque deve ser feito com as outras

#Aplicando transformação
x = one_hot.fit_transform(x)

#Os novos nomes das colunas #'onehotencoder=transformadas; 'remainder'=não transformadas
novos_nomes_colunas = one_hot.get_feature_names_out(coluna)

# Remover prefixo 'remainder__' das colunas que não foram transformadas
#novos_nomes_colunas = [nome.replace('remainder__', '') for nome in novos_nomes_colunas]

x = pd.DataFrame(x, columns = novos_nomes_colunas) #alterando de volta
x_columns = x.columns.tolist() 

############################################################################################
# Normalização (scaling entre 0 e 1) com MinMaxScaler ******************************
from sklearn.preprocessing import MinMaxScaler
normalizacao = MinMaxScaler()
#x = normalizacao.fit_transform(x)
# df['Close_normalizada'] = (df[coluna] - df[coluna].min()) / (df[coluna].max() - df[coluna].min())

# Padronização (média 0 e desvio padrão 1) com StandardScaler **********************
from sklearn.preprocessing import StandardScaler
padronizacao = StandardScaler()
#x = padronizacao.fit_transform(x)
# df['Close_padronizada'] = (df[coluna] - df[coluna].mean()) / df[coluna].std()

############################################################################################
# DEFININDO A VARIAVEL DEPENDENTE
from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(y)

############################################################################################
#backups
x_inteiro = x
y_inteiro = y

# DIVIDINDO BASE EM TREINO E TESTE
from sklearn.model_selection import train_test_split
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, 
                                                    stratify = y, #para manter a proporção da Var Dep nos splits
                                                    random_state = grandom_state) #raiz da aleatoridade
# test_size = 0.25 #porcentagem que ira ser separado para testes

print(x_treino.shape, x_teste.shape)

print(y_treino.shape, y_teste.shape)
print(pd.concat([pd.Series(y_treino).value_counts(), pd.Series(y_treino).value_counts(normalize=True)*100], axis=1))
print(pd.concat([pd.Series(y_teste).value_counts(), pd.Series(y_teste).value_counts(normalize=True)*100], axis=1))

(5509, 10) (1837, 10)
(5509,) (1837,)
   count  proportion
0   5210   94.572518
1    299    5.427482
   count  proportion
0   1737   94.556342
1    100    5.443658


In [34]:
from xgboost import XGBClassifier

def teste_balanceamento(x_treino_balanceado, y_treino_balanceado):
    display(pd.concat([pd.Series(y_treino_balanceado).value_counts(), pd.Series(y_treino_balanceado).value_counts(normalize=True)*100], axis=1))

    modelo_smote = XGBClassifier(random_state=grandom_state)
    modelo_smote.fit(x_treino_balanceado, y_treino_balanceado)

    y_probs_positivas = modelo_smote.predict_proba(x_teste)[:,1]*100
    display(aval_modelo_corte_tabela(y_teste, y_probs_positivas))

teste_balanceamento(x_treino, y_treino)

Unnamed: 0,count,proportion
0,5210,94.572518
1,299,5.427482


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1699,38,68,32,0.942,0.457,0.32,0.978,0.376,0.353,0.457,0.962,0.022,0.68,0.543,0.038,0.298,0.353
1,20,1723,14,71,29,0.954,0.674,0.29,0.992,0.406,0.352,0.674,0.96,0.008,0.71,0.326,0.04,0.282,0.423
2,30,1725,12,74,26,0.953,0.684,0.26,0.993,0.377,0.321,0.684,0.959,0.007,0.74,0.316,0.041,0.253,0.403
3,40,1728,9,76,24,0.954,0.727,0.24,0.995,0.361,0.302,0.727,0.958,0.005,0.76,0.273,0.042,0.235,0.401
4,50,1732,5,79,21,0.954,0.808,0.21,0.997,0.333,0.272,0.808,0.956,0.003,0.79,0.192,0.044,0.207,0.398
5,60,1733,4,82,18,0.953,0.818,0.18,0.998,0.295,0.237,0.818,0.955,0.002,0.82,0.182,0.045,0.178,0.371
6,70,1734,3,86,14,0.952,0.824,0.14,0.998,0.239,0.188,0.824,0.953,0.002,0.86,0.176,0.047,0.138,0.328
7,80,1735,2,87,13,0.952,0.867,0.13,0.999,0.226,0.176,0.867,0.952,0.001,0.87,0.133,0.048,0.129,0.325
8,90,1736,1,89,11,0.951,0.917,0.11,0.999,0.196,0.151,0.917,0.951,0.001,0.89,0.083,0.049,0.109,0.308


# ====================================

# Balanceamento de Dados

# ====================================

# Oversampling - Upsampling

Abordagem avançada de oversampling usada para balancear conjuntos de dados desbalanceados, aumentando a representatividade das classes minoritárias por meio da `criação de dados sintéticos`, ao invés de simplesmente replicar os exemplos existentes.

(Exemplo SMOTE)

- **Diversificação:** Ao gerar novos exemplos, SMOTE introduz uma variedade maior no conjunto de dados, o que pode ajudar a evitar o overfitting que poderia ocorrer se simplesmente duplicássemos as amostras existentes.
- **Melhoria de Modelagem:** Com um balanceamento mais efetivo entre as classes, os modelos são capazes de aprender padrões mais generalizáveis, melhorando assim a precisão das previsões em dados não vistos.

SMOTE é amplamente utilizado em problemas de classificação onde o desequilíbrio de classes é significativo, como em detecção de fraude, diagnóstico médico e predição de falhas em equipamentos.

**Funcionamento**

1. **Identificação das Amostras:**
   SMOTE analisa as características das amostras minoritárias (classe sub-representada) e identifica seus vizinhos mais próximos.

2. **Síntese de Novos Exemplos:**
   Para cada amostra na classe minoritária, são criados novos exemplos sintéticos. Isso é feito selecionando um dos \( k \) vizinhos mais próximos (geralmente \( k=5 \)) e interpolando um novo ponto entre a amostra original e o vizinho selecionado.

3. **Adição ao Conjunto de Dados:**
   Os exemplos sintéticos gerados são então adicionados ao conjunto de dados, aumentando a proporção da classe minoritária.

**Considerações**

- **Espaço de Características:** SMOTE funciona bem quando as características são contínuas. Em dados categóricos, outras técnicas de oversampling, como o ADASYN (Adaptive Synthetic Sampling Approach), podem ser mais apropriadas.
- **Risco de Overfitting:** Apesar de introduzir diversidade, a criação de muitos exemplos sintéticos pode levar a um modelo excessivamente otimista em relação aos dados de treinamento. Deve-se ter cautela com o número de exemplos sintéticos gerados.
- **Não Adiciona Novas Informações:** Como as amostras são apenas replicadas, nenhuma informação nova é introduzida ao modelo, o que pode limitar a capacidade do modelo de aprender nuances mais complexas das classes.
- **Combinação com Downsampling:** Frequentemente, o upsampling é combinado com o downsampling da classe majoritária para criar um equilíbrio ainda mais efetivo e evitar o aumento excessivo do conjunto de dados.


In [35]:
from sklearn.utils import resample

# Separando por classe
x_inteiro_df = pd.DataFrame(x_inteiro)
y_inteiro_series = pd.Series(y_inteiro)
classe_maior = x_inteiro_df.iloc[sorted(y_inteiro_series[y_inteiro_series == 1].index)]
classe_menor = x_inteiro_df.iloc[sorted(y_inteiro_series[y_inteiro_series == 0].index)]

# Upsampling da classe minoritária
train_df_menor_upsampled = resample(classe_menor,
                                    replace=True,                # sample with replacement
                                    n_samples=len(classe_menor), # to match majority class
                                    random_state=123)            # reproducible results

# Combinando a classe majoritária com a classe minoritária upsampled
train_df_upsampled = pd.concat([classe_menor, train_df_menor_upsampled])

## SMOTE (Synthetic Minority Over-sampling Technique)
Gera novos exemplos sintéticos da classe minoritária ao invés de duplicar exemplos existentes. Funciona interpolando entre os exemplos minoritários e criando novos pontos ao longo das linhas que ligam os vizinhos mais próximos.

In [36]:
# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_treino_balanceado, y_treino_balanceado = smote.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5210,50.0
1,5210,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1426,311,39,61,0.809,0.164,0.61,0.821,0.258,0.332,0.164,0.973,0.179,0.39,0.836,0.027,0.431,0.243
1,20,1558,179,52,48,0.874,0.211,0.48,0.897,0.294,0.345,0.211,0.968,0.103,0.52,0.789,0.032,0.377,0.26
2,30,1606,131,58,42,0.897,0.243,0.42,0.925,0.308,0.343,0.243,0.965,0.075,0.58,0.757,0.035,0.345,0.268
3,40,1648,89,63,37,0.917,0.294,0.37,0.949,0.327,0.343,0.294,0.963,0.051,0.63,0.706,0.037,0.319,0.286
4,50,1672,65,66,34,0.929,0.343,0.34,0.963,0.342,0.341,0.343,0.962,0.037,0.66,0.657,0.038,0.303,0.304
5,60,1684,53,66,34,0.935,0.391,0.34,0.969,0.364,0.354,0.391,0.962,0.031,0.66,0.609,0.038,0.309,0.331
6,70,1699,38,70,30,0.941,0.441,0.3,0.978,0.357,0.333,0.441,0.96,0.022,0.7,0.559,0.04,0.278,0.334
7,80,1710,27,75,25,0.944,0.481,0.25,0.984,0.329,0.293,0.481,0.958,0.016,0.75,0.519,0.042,0.234,0.321
8,90,1724,13,82,18,0.948,0.581,0.18,0.993,0.275,0.229,0.581,0.955,0.007,0.82,0.419,0.045,0.173,0.304


## Variantes do SMOTE:

### Borderline-SMOTE
Aplica oversampling apenas nos exemplos da classe minoritária que estão perto da fronteira com a classe majoritária.

In [37]:
# Borderline-SMOTE
from imblearn.over_sampling import BorderlineSMOTE
borderline_smote = BorderlineSMOTE()
x_treino_balanceado, y_treino_balanceado = borderline_smote.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5210,50.0
1,5210,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1549,188,45,55,0.873,0.226,0.55,0.892,0.321,0.382,0.226,0.972,0.108,0.45,0.774,0.028,0.442,0.296
1,20,1623,114,54,46,0.909,0.288,0.46,0.934,0.354,0.388,0.288,0.968,0.066,0.54,0.712,0.032,0.394,0.317
2,30,1664,73,61,39,0.927,0.348,0.39,0.958,0.368,0.376,0.348,0.965,0.042,0.61,0.652,0.035,0.348,0.33
3,40,1681,56,65,35,0.934,0.385,0.35,0.968,0.366,0.36,0.385,0.963,0.032,0.65,0.615,0.037,0.318,0.332
4,50,1693,44,67,33,0.94,0.429,0.33,0.975,0.373,0.355,0.429,0.962,0.025,0.67,0.571,0.038,0.305,0.345
5,60,1702,35,67,33,0.944,0.485,0.33,0.98,0.393,0.366,0.485,0.962,0.02,0.67,0.515,0.038,0.31,0.372
6,70,1714,23,68,32,0.95,0.582,0.32,0.987,0.413,0.371,0.582,0.962,0.013,0.68,0.418,0.038,0.307,0.408
7,80,1719,18,73,27,0.95,0.6,0.27,0.99,0.372,0.325,0.6,0.959,0.01,0.73,0.4,0.041,0.26,0.381
8,90,1728,9,77,23,0.953,0.719,0.23,0.995,0.348,0.291,0.719,0.957,0.005,0.77,0.281,0.043,0.225,0.39


### SMOTE-ENN (Edited Nearest Neighbours)
Combina SMOTE com um método de limpeza dos dados chamado ENN, que remove exemplos ruidosos após o oversampling.

In [38]:
# SMOTE-ENN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN()
x_treino_balanceado, y_treino_balanceado = smote_enn.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
1,5100,54.53379
0,4252,45.46621


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1324,413,39,61,0.754,0.129,0.61,0.762,0.213,0.284,0.129,0.971,0.238,0.39,0.871,0.029,0.372,0.193
1,20,1466,271,47,53,0.827,0.164,0.53,0.844,0.25,0.314,0.164,0.969,0.156,0.47,0.836,0.031,0.374,0.223
2,30,1531,206,48,52,0.862,0.202,0.52,0.881,0.291,0.35,0.202,0.97,0.119,0.48,0.798,0.03,0.401,0.262
3,40,1585,152,51,49,0.889,0.244,0.49,0.912,0.326,0.374,0.244,0.969,0.088,0.51,0.756,0.031,0.402,0.293
4,50,1611,126,56,44,0.901,0.259,0.44,0.927,0.326,0.362,0.259,0.966,0.073,0.56,0.741,0.034,0.367,0.288
5,60,1634,103,57,43,0.913,0.295,0.43,0.941,0.35,0.377,0.295,0.966,0.059,0.57,0.705,0.034,0.371,0.311
6,70,1660,77,59,41,0.926,0.347,0.41,0.956,0.376,0.388,0.347,0.966,0.044,0.59,0.653,0.034,0.366,0.338
7,80,1675,62,62,38,0.932,0.38,0.38,0.964,0.38,0.38,0.38,0.964,0.036,0.62,0.62,0.036,0.344,0.344
8,90,1696,41,63,37,0.943,0.474,0.37,0.976,0.416,0.397,0.474,0.964,0.024,0.63,0.526,0.036,0.346,0.39


### ADASYN (Adaptive Synthetic Sampling)
Variante do SMOTE que ajusta o número de exemplos sintéticos gerados com base na densidade local, gerando mais exemplos para minorias que estão cercadas por muitas instâncias da classe majoritária.

In [39]:
# ADASYN
from imblearn.over_sampling import ADASYN
adasyn = ADASYN()
x_treino_balanceado, y_treino_balanceado = adasyn.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5210,50.168512
1,5175,49.831488


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1397,340,42,58,0.792,0.146,0.58,0.804,0.233,0.303,0.146,0.971,0.196,0.42,0.854,0.029,0.384,0.212
1,20,1519,218,47,53,0.856,0.196,0.53,0.874,0.286,0.347,0.196,0.97,0.126,0.47,0.804,0.03,0.404,0.259
2,30,1588,149,58,42,0.887,0.22,0.42,0.914,0.289,0.328,0.22,0.965,0.086,0.58,0.78,0.035,0.334,0.248
3,40,1631,106,62,38,0.909,0.264,0.38,0.939,0.311,0.335,0.264,0.963,0.061,0.62,0.736,0.037,0.319,0.269
4,50,1659,78,64,36,0.923,0.316,0.36,0.955,0.336,0.345,0.316,0.963,0.045,0.64,0.684,0.037,0.315,0.296
5,60,1682,55,68,32,0.933,0.368,0.32,0.968,0.342,0.333,0.368,0.961,0.032,0.68,0.632,0.039,0.288,0.308
6,70,1696,41,68,32,0.941,0.438,0.32,0.976,0.37,0.349,0.438,0.961,0.024,0.68,0.562,0.039,0.296,0.344
7,80,1711,26,74,26,0.946,0.5,0.26,0.985,0.342,0.305,0.5,0.959,0.015,0.74,0.5,0.041,0.245,0.335
8,90,1727,10,80,20,0.951,0.667,0.2,0.994,0.308,0.255,0.667,0.956,0.006,0.8,0.333,0.044,0.194,0.348


### K-means SMOTE
Uma variante do SMOTE que usa agrupamento (k-means clustering) antes de aplicar o SMOTE, gerando exemplos sintéticos baseados nos clusters de dados minoritários.

In [40]:
# K-means SMOTE
from imblearn.over_sampling import KMeansSMOTE
# kmeans_smote = KMeansSMOTE(cluster_balance_threshold=0.1)
# x_treino_balanceado, y_treino_balanceado = kmeans_smote.fit_resample(x_treino, y_treino)
# teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

### Random Oversampling
Duplica aleatoriamente exemplos da classe minoritária até atingir um balanço desejado. Isso pode levar à overfitting, já que os mesmos exemplos são repetidos.

In [41]:
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(x_treino, y_treino)
teste_balanceamento(X_train_resampled, y_train_resampled)

Unnamed: 0,count,proportion
0,5210,50.0
1,5210,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1639,98,61,39,0.913,0.285,0.39,0.944,0.329,0.35,0.285,0.964,0.056,0.61,0.715,0.036,0.334,0.288
1,20,1691,46,67,33,0.938,0.418,0.33,0.974,0.369,0.353,0.418,0.962,0.026,0.67,0.582,0.038,0.304,0.339
2,30,1711,26,70,30,0.948,0.536,0.3,0.985,0.385,0.347,0.536,0.961,0.015,0.7,0.464,0.039,0.285,0.376
3,40,1720,17,73,27,0.951,0.614,0.27,0.99,0.375,0.326,0.614,0.959,0.01,0.73,0.386,0.041,0.26,0.386
4,50,1723,14,74,26,0.952,0.65,0.26,0.992,0.371,0.319,0.65,0.959,0.008,0.74,0.35,0.041,0.252,0.392
5,60,1726,11,77,23,0.952,0.676,0.23,0.994,0.343,0.289,0.676,0.957,0.006,0.77,0.324,0.043,0.224,0.376
6,70,1727,10,78,22,0.952,0.688,0.22,0.994,0.333,0.278,0.688,0.957,0.006,0.78,0.312,0.043,0.214,0.372
7,80,1732,5,80,20,0.954,0.8,0.2,0.997,0.32,0.26,0.8,0.956,0.003,0.8,0.2,0.044,0.197,0.386
8,90,1734,3,84,16,0.953,0.842,0.16,0.998,0.269,0.213,0.842,0.954,0.002,0.84,0.158,0.046,0.158,0.355


# Undersampling - Downsampling

Em resumo: Reduzem o número de exemplos da classe majoritária, removendo instâncias de forma controlada para equilibrar o conjunto.

Este método específico foca na `redução da classe majoritária`, mas com uma abordagem mais refinada que simplesmente remover amostras aleatoriamente. O NearMiss seleciona amostras da classe majoritária baseado em certos critérios de proximidade, com o objetivo de manter apenas aquelas que são mais representativas e/ou mais próximas das amostras da classe minoritária.

**Funcionamento (NearMiss)**

1. **Critérios de Seleção:**
   NearMiss implementa diferentes versões de seleção:
   - **NearMiss-1:** Seleciona amostras da classe majoritária com a menor distância média às três amostras mais próximas da classe minoritária.
   - **NearMiss-2:** Seleciona amostras da classe majoritária com a menor distância média às três amostras mais distantes da classe minoritária.
   - **NearMiss-3:** Um subconjunto da classe minoritária é selecionado primeiro, e então, para cada exemplo na classe minoritária, são retidas as \( n \) amostras mais próximas da classe majoritária.

2. **Redução da Classe Majoritária:**
   Amostras são selecionadas de acordo com o critério estabelecido até que o número de instâncias na classe majoritária seja reduzido suficientemente para igualar o da classe minoritária.

3. **Combinação de Dados:**
   As amostras da classe majoritária que atendem aos critérios são combinadas com as da classe minoritária para formar um novo conjunto de dados balanceado.

**Considerações**

- **Perda de Informação Crítica:** Apesar da intenção de manter amostras importantes, a remoção de grandes quantidades de dados pode resultar em perda de informações cruciais.
- **Escolha do Método:** A escolha entre NearMiss-1, NearMiss-2, e NearMiss-3 pode ter um impacto significativo nos resultados, exigindo testes para determinar qual método se adapta melhor ao problema específico.
- **Escolha de Amostras:** A seleção aleatória de amostras para remoção pode não ser a abordagem ideal; métodos mais sofisticados podem ser necessários para preservar a integridade da informação.
- **Combinação com Upsampling:** Muitas vezes, o downsampling é usado em conjunto com o upsampling para não apenas reduzir a classe majoritária, mas também aumentar a minoritária, alcançando um equilíbrio ideal.
- **Técnicas Avançadas:** Métodos como clustering ou análises de importância de instâncias podem ser utilizados para escolher quais amostras remover, assegurando que as mais representativas e informativas sejam mantidas.

## Random Undersampling
Remove exemplos da classe majoritária aleatoriamente. Embora simples, pode resultar na perda de informações importantes se não for usado com cuidado.

In [42]:
# Random Undersampling
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
x_treino_balanceado, y_treino_balanceado = rus.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,299,50.0
1,299,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,753,984,14,86,0.457,0.08,0.86,0.434,0.147,0.216,0.08,0.982,0.566,0.14,0.92,0.018,0.294,0.135
1,20,975,762,19,81,0.575,0.096,0.81,0.561,0.172,0.246,0.096,0.981,0.439,0.19,0.904,0.019,0.371,0.169
2,30,1096,641,23,77,0.639,0.107,0.77,0.631,0.188,0.265,0.107,0.979,0.369,0.23,0.893,0.021,0.401,0.186
3,40,1194,543,30,70,0.688,0.114,0.7,0.687,0.196,0.271,0.114,0.975,0.313,0.3,0.886,0.025,0.387,0.186
4,50,1278,459,37,63,0.73,0.121,0.63,0.736,0.203,0.274,0.121,0.972,0.264,0.37,0.879,0.028,0.366,0.184
5,60,1358,379,41,59,0.771,0.135,0.59,0.782,0.219,0.289,0.135,0.971,0.218,0.41,0.865,0.029,0.372,0.198
6,70,1442,295,46,54,0.814,0.155,0.54,0.83,0.241,0.306,0.155,0.969,0.17,0.46,0.845,0.031,0.37,0.214
7,80,1521,216,52,48,0.854,0.182,0.48,0.876,0.264,0.319,0.182,0.967,0.124,0.52,0.818,0.033,0.356,0.23
8,90,1620,117,59,41,0.904,0.259,0.41,0.933,0.318,0.348,0.259,0.965,0.067,0.59,0.741,0.035,0.343,0.277


## NearMiss
Seleciona exemplos da classe majoritária que estão mais próximos dos exemplos da classe minoritária, tentando manter exemplos representativos da classe majoritária.

> NearMiss-1: 
    Escolhe exemplos da classe majoritária com a menor distância média para os três exemplos mais próximos da classe minoritária.

    
> NearMiss-2:
    Escolhe exemplos da classe majoritária com a menor distância média para todos os exemplos da classe minoritária.

In [43]:
# NearMiss
from imblearn.under_sampling import NearMiss
near_miss = NearMiss(version=1)
x_treino_balanceado, y_treino_balanceado = near_miss.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,299,50.0
1,299,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,273,1464,10,90,0.198,0.058,0.9,0.157,0.109,0.164,0.058,0.965,0.843,0.1,0.942,0.035,0.057,0.036
1,20,366,1371,11,89,0.248,0.061,0.89,0.211,0.114,0.172,0.061,0.971,0.789,0.11,0.939,0.029,0.101,0.057
2,30,420,1317,12,88,0.277,0.063,0.88,0.242,0.117,0.175,0.063,0.972,0.758,0.12,0.937,0.028,0.122,0.065
3,40,464,1273,14,86,0.299,0.063,0.86,0.267,0.118,0.176,0.063,0.971,0.733,0.14,0.937,0.029,0.127,0.066
4,50,527,1210,17,83,0.332,0.064,0.83,0.303,0.119,0.178,0.064,0.969,0.697,0.17,0.936,0.031,0.133,0.066
5,60,577,1160,18,82,0.359,0.066,0.82,0.332,0.122,0.182,0.066,0.97,0.668,0.18,0.934,0.03,0.152,0.074
6,70,648,1089,23,77,0.395,0.066,0.77,0.373,0.122,0.18,0.066,0.966,0.627,0.23,0.934,0.034,0.143,0.067
7,80,726,1011,31,69,0.433,0.064,0.69,0.418,0.117,0.172,0.064,0.959,0.582,0.31,0.936,0.041,0.108,0.05
8,90,849,888,39,61,0.495,0.064,0.61,0.489,0.116,0.169,0.064,0.956,0.511,0.39,0.936,0.044,0.099,0.045


In [44]:
# NearMiss
from imblearn.under_sampling import NearMiss
near_miss = NearMiss(version=2)
x_treino_balanceado, y_treino_balanceado = near_miss.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,299,50.0
1,299,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,193,1544,10,90,0.154,0.055,0.9,0.111,0.104,0.157,0.055,0.951,0.889,0.1,0.945,0.049,0.011,0.008
1,20,228,1509,15,85,0.17,0.053,0.85,0.131,0.1,0.152,0.053,0.938,0.869,0.15,0.947,0.062,-0.019,-0.013
2,30,251,1486,15,85,0.183,0.054,0.85,0.145,0.102,0.154,0.054,0.944,0.855,0.15,0.946,0.056,-0.005,-0.004
3,40,279,1458,15,85,0.198,0.055,0.85,0.161,0.103,0.156,0.055,0.949,0.839,0.15,0.945,0.051,0.011,0.007
4,50,298,1439,18,82,0.207,0.054,0.82,0.172,0.101,0.153,0.054,0.943,0.828,0.18,0.946,0.057,-0.008,-0.005
5,60,318,1419,18,82,0.218,0.055,0.82,0.183,0.102,0.154,0.055,0.946,0.817,0.18,0.945,0.054,0.003,0.002
6,70,342,1395,20,80,0.23,0.054,0.8,0.197,0.102,0.153,0.054,0.945,0.803,0.2,0.946,0.055,-0.003,-0.002
7,80,369,1368,20,80,0.244,0.055,0.8,0.212,0.103,0.155,0.055,0.949,0.788,0.2,0.945,0.051,0.012,0.007
8,90,434,1303,24,76,0.278,0.055,0.76,0.25,0.103,0.154,0.055,0.948,0.75,0.24,0.945,0.052,0.01,0.005


In [45]:
# NearMiss
from imblearn.under_sampling import NearMiss
near_miss = NearMiss(version=3)
x_treino_balanceado, y_treino_balanceado = near_miss.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,299,50.0
1,299,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,561,1176,13,87,0.353,0.069,0.87,0.323,0.128,0.19,0.069,0.977,0.677,0.13,0.931,0.023,0.193,0.094
1,20,730,1007,25,75,0.438,0.069,0.75,0.42,0.127,0.186,0.069,0.967,0.58,0.25,0.931,0.033,0.17,0.079
2,30,854,883,32,68,0.502,0.072,0.68,0.492,0.129,0.188,0.072,0.964,0.508,0.32,0.928,0.036,0.172,0.078
3,40,944,793,36,64,0.549,0.075,0.64,0.543,0.134,0.192,0.075,0.963,0.457,0.36,0.925,0.037,0.183,0.083
4,50,1036,701,38,62,0.598,0.081,0.62,0.596,0.144,0.204,0.081,0.965,0.404,0.38,0.919,0.035,0.216,0.1
5,60,1126,611,44,56,0.643,0.084,0.56,0.648,0.146,0.204,0.084,0.962,0.352,0.44,0.916,0.038,0.208,0.098
6,70,1210,527,50,50,0.686,0.087,0.5,0.697,0.148,0.203,0.087,0.96,0.303,0.5,0.913,0.04,0.197,0.096
7,80,1306,431,59,41,0.733,0.087,0.41,0.752,0.143,0.191,0.087,0.957,0.248,0.59,0.913,0.043,0.162,0.084
8,90,1425,312,64,36,0.795,0.103,0.36,0.82,0.161,0.204,0.103,0.957,0.18,0.64,0.897,0.043,0.18,0.104


## Tomek Links
Identifica pares de exemplos (um da classe majoritária e um da minoritária) que são vizinhos mais próximos e pertencem a classes diferentes. Se esses pares forem encontrados, o exemplo da classe majoritária é removido, limpando a fronteira entre as classes.

In [46]:
# Tomek Links
from imblearn.under_sampling import TomekLinks
tomek_links = TomekLinks()
x_treino_balanceado, y_treino_balanceado = tomek_links.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5146,94.508724
1,299,5.491276


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1700,37,63,37,0.946,0.5,0.37,0.979,0.425,0.402,0.5,0.964,0.021,0.63,0.5,0.036,0.349,0.402
1,20,1720,17,69,31,0.953,0.646,0.31,0.99,0.419,0.369,0.646,0.961,0.01,0.69,0.354,0.039,0.3,0.427
2,30,1727,10,73,27,0.955,0.73,0.27,0.994,0.394,0.335,0.73,0.959,0.006,0.73,0.27,0.041,0.264,0.427
3,40,1730,7,75,25,0.955,0.781,0.25,0.996,0.379,0.316,0.781,0.958,0.004,0.75,0.219,0.042,0.246,0.427
4,50,1731,6,79,21,0.954,0.778,0.21,0.997,0.331,0.271,0.778,0.956,0.003,0.79,0.222,0.044,0.207,0.389
5,60,1734,3,80,20,0.955,0.87,0.2,0.998,0.325,0.262,0.87,0.956,0.002,0.8,0.13,0.044,0.198,0.405
6,70,1734,3,83,17,0.953,0.85,0.17,0.998,0.283,0.226,0.85,0.954,0.002,0.83,0.15,0.046,0.168,0.368
7,80,1735,2,86,14,0.952,0.875,0.14,0.999,0.241,0.189,0.875,0.953,0.001,0.86,0.125,0.047,0.139,0.339
8,90,1737,0,89,11,0.952,1.0,0.11,1.0,0.198,0.151,1.0,0.951,0.0,0.89,0.0,0.049,0.11,0.323


## Cluster Centroids
Uma técnica de undersampling baseada em clusterização, onde os dados da classe majoritária são agrupados e os centróides desses clusters substituem os exemplos originais. Isso reduz o número de exemplos da classe majoritária sem perder muita representatividade

In [47]:
# Cluster Centroids
from imblearn.under_sampling import ClusterCentroids
cluster_centroids = ClusterCentroids()
x_treino_balanceado, y_treino_balanceado = cluster_centroids.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,299,50.0
1,299,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,730,1007,8,92,0.447,0.084,0.92,0.42,0.153,0.226,0.084,0.989,0.58,0.08,0.916,0.011,0.34,0.157
1,20,953,784,18,82,0.563,0.095,0.82,0.549,0.17,0.244,0.095,0.981,0.451,0.18,0.905,0.019,0.369,0.168
2,30,1115,622,25,75,0.648,0.108,0.75,0.642,0.188,0.264,0.108,0.978,0.358,0.25,0.892,0.022,0.392,0.183
3,40,1214,523,29,71,0.7,0.12,0.71,0.699,0.205,0.282,0.12,0.977,0.301,0.29,0.88,0.023,0.409,0.198
4,50,1307,430,35,65,0.747,0.131,0.65,0.752,0.218,0.293,0.131,0.974,0.248,0.35,0.869,0.026,0.402,0.206
5,60,1374,363,40,60,0.781,0.142,0.6,0.791,0.229,0.301,0.142,0.972,0.209,0.4,0.858,0.028,0.391,0.211
6,70,1450,287,45,55,0.819,0.161,0.55,0.835,0.249,0.315,0.161,0.97,0.165,0.45,0.839,0.03,0.385,0.224
7,80,1529,208,56,44,0.856,0.175,0.44,0.88,0.25,0.3,0.175,0.965,0.12,0.56,0.825,0.035,0.32,0.211
8,90,1610,127,61,39,0.898,0.235,0.39,0.927,0.293,0.324,0.235,0.963,0.073,0.61,0.765,0.037,0.317,0.251


# Técnicas Combinadas (Over/Under Sampling)

## SMOTE + Tomek Links
Primeiro aplica SMOTE para gerar exemplos sintéticos da classe minoritária e depois aplica Tomek Links para remover exemplos da classe majoritária que estão muito próximos da classe minoritária.

In [48]:
# SMOTE + Tomek Links
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek()
x_treino_balanceado, y_treino_balanceado = smote_tomek.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5200,50.0
1,5200,50.0


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1438,299,41,59,0.815,0.165,0.59,0.828,0.258,0.329,0.165,0.972,0.172,0.41,0.835,0.028,0.418,0.239
1,20,1561,176,50,50,0.877,0.221,0.5,0.899,0.307,0.36,0.221,0.969,0.101,0.5,0.779,0.031,0.399,0.275
2,30,1612,125,58,42,0.9,0.251,0.42,0.928,0.315,0.348,0.251,0.965,0.072,0.58,0.749,0.035,0.348,0.275
3,40,1648,89,63,37,0.917,0.294,0.37,0.949,0.327,0.343,0.294,0.963,0.051,0.63,0.706,0.037,0.319,0.286
4,50,1675,62,66,34,0.93,0.354,0.34,0.964,0.347,0.344,0.354,0.962,0.036,0.66,0.646,0.038,0.304,0.31
5,60,1696,41,67,33,0.941,0.446,0.33,0.976,0.379,0.359,0.446,0.962,0.024,0.67,0.554,0.038,0.306,0.354
6,70,1706,31,69,31,0.946,0.5,0.31,0.982,0.383,0.351,0.5,0.961,0.018,0.69,0.5,0.039,0.292,0.367
7,80,1711,26,73,27,0.946,0.509,0.27,0.985,0.353,0.316,0.509,0.959,0.015,0.73,0.491,0.041,0.255,0.346
8,90,1724,13,79,21,0.95,0.618,0.21,0.993,0.313,0.264,0.618,0.956,0.007,0.79,0.382,0.044,0.203,0.341


## SMOTE + NearMiss
Combina SMOTE para a classe minoritária com NearMiss para a classe majoritária, criando um equilíbrio mais controlado entre as classes.

In [49]:
# SMOTE + NearMiss
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# Aplicando SMOTE para oversampling
smote = SMOTE()
x_treino_oversampled, y_treino_oversampled = smote.fit_resample(x_treino, y_treino)

# Aplicando NearMiss para undersampling
near_miss = NearMiss(version=3)
x_treino_balanceado, y_treino_balanceado = near_miss.fit_resample(x_treino_oversampled, y_treino_oversampled)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5210,63.808941
1,2955,36.191059


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1423,314,47,53,0.803,0.144,0.53,0.819,0.227,0.291,0.144,0.968,0.181,0.47,0.856,0.032,0.349,0.198
1,20,1560,177,55,45,0.874,0.203,0.45,0.898,0.28,0.327,0.203,0.966,0.102,0.55,0.797,0.034,0.348,0.242
2,30,1621,116,61,39,0.904,0.252,0.39,0.933,0.306,0.334,0.252,0.964,0.067,0.61,0.748,0.036,0.323,0.264
3,40,1658,79,63,37,0.923,0.319,0.37,0.955,0.343,0.353,0.319,0.963,0.045,0.63,0.681,0.037,0.325,0.303
4,50,1676,61,70,30,0.929,0.33,0.3,0.965,0.314,0.309,0.33,0.96,0.035,0.7,0.67,0.04,0.265,0.277
5,60,1685,52,74,26,0.931,0.333,0.26,0.97,0.292,0.279,0.333,0.958,0.03,0.74,0.667,0.042,0.23,0.259
6,70,1699,38,79,21,0.936,0.356,0.21,0.978,0.264,0.24,0.356,0.956,0.022,0.79,0.644,0.044,0.188,0.242
7,80,1720,17,82,18,0.946,0.514,0.18,0.99,0.267,0.225,0.514,0.954,0.01,0.82,0.486,0.046,0.17,0.282
8,90,1732,5,85,15,0.951,0.75,0.15,0.997,0.25,0.199,0.75,0.953,0.003,0.85,0.25,0.047,0.147,0.322


# Técnicas Baseadas em Algoritmos
Alguns algoritmos de machine learning possuem abordagens internas para lidar com dados desbalanceados.

In [50]:
from sklearn.linear_model import LogisticRegression

# Aplicando pesos de classe para balanceamento automático
clf = LogisticRegression(class_weight='balanced')
clf.fit(x_treino, y_treino)


## BalancedRandomForest
Uma técnica baseada em árvores de decisão que cria várias árvores com conjuntos de dados balanceados. Em cada árvore, realiza undersampling da classe majoritária de forma aleatória.

In [51]:
# Balanced Random Forest
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier()
brf.fit(x_treino, y_treino)


In [52]:
# Balanced Bagging Classifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10)
# bbc.fit(x_treino, y_treino)


# Técnicas de Amostragem Informatizada

## Edited Nearest Neighbors (ENN)
Remove exemplos da classe majoritária e da minoritária que são mal classificados pelos seus vizinhos mais próximos, ajudando a melhorar a separação entre as classes.

In [53]:
# Edited Nearest Neighbors (ENN)
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours()
x_treino_balanceado, y_treino_balanceado = enn.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,4838,94.179482
1,299,5.820518


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1674,63,61,39,0.932,0.382,0.39,0.964,0.386,0.388,0.382,0.965,0.036,0.61,0.618,0.035,0.354,0.35
1,20,1696,41,65,35,0.942,0.461,0.35,0.976,0.398,0.378,0.461,0.963,0.024,0.65,0.539,0.037,0.326,0.372
2,30,1717,20,66,34,0.953,0.63,0.34,0.988,0.442,0.396,0.63,0.963,0.012,0.66,0.37,0.037,0.328,0.441
3,40,1725,12,70,30,0.955,0.714,0.3,0.993,0.423,0.365,0.714,0.961,0.007,0.7,0.286,0.039,0.293,0.445
4,50,1727,10,72,28,0.955,0.737,0.28,0.994,0.406,0.346,0.737,0.96,0.006,0.72,0.263,0.04,0.274,0.437
5,60,1732,5,73,27,0.958,0.844,0.27,0.997,0.409,0.341,0.844,0.96,0.003,0.73,0.156,0.04,0.267,0.463
6,70,1732,5,75,25,0.956,0.833,0.25,0.997,0.385,0.319,0.833,0.958,0.003,0.75,0.167,0.042,0.247,0.442
7,80,1733,4,83,17,0.953,0.81,0.17,0.998,0.281,0.225,0.81,0.954,0.002,0.83,0.19,0.046,0.168,0.358
8,90,1735,2,85,15,0.953,0.882,0.15,0.999,0.256,0.201,0.882,0.953,0.001,0.85,0.118,0.047,0.149,0.353


## Repeated Edited Nearest Neighbors (RENN)
Aplica o processo de ENN repetidamente até que nenhum exemplo seja removido, limpando ainda mais as fronteiras entre as classes.

In [54]:
# Repeated Edited Nearest Neighbours (RENN)
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
renn = RepeatedEditedNearestNeighbours()
x_treino_balanceado, y_treino_balanceado = renn.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,4735,94.060389
1,299,5.939611


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1669,68,62,38,0.929,0.358,0.38,0.961,0.369,0.373,0.358,0.964,0.039,0.62,0.642,0.036,0.341,0.332
1,20,1698,39,64,36,0.944,0.48,0.36,0.978,0.411,0.39,0.48,0.964,0.022,0.64,0.52,0.036,0.338,0.387
2,30,1713,24,65,35,0.952,0.593,0.35,0.986,0.44,0.401,0.593,0.963,0.014,0.65,0.407,0.037,0.336,0.433
3,40,1723,14,69,31,0.955,0.689,0.31,0.992,0.428,0.373,0.689,0.961,0.008,0.69,0.311,0.039,0.302,0.443
4,50,1726,11,73,27,0.954,0.711,0.27,0.994,0.391,0.334,0.711,0.959,0.006,0.73,0.289,0.041,0.264,0.42
5,60,1728,9,77,23,0.953,0.719,0.23,0.995,0.348,0.291,0.719,0.957,0.005,0.77,0.281,0.043,0.225,0.39
6,70,1731,6,78,22,0.954,0.786,0.22,0.997,0.344,0.283,0.786,0.957,0.003,0.78,0.214,0.043,0.217,0.401
7,80,1732,5,80,20,0.954,0.8,0.2,0.997,0.32,0.26,0.8,0.956,0.003,0.8,0.2,0.044,0.197,0.386
8,90,1735,2,84,16,0.953,0.889,0.16,0.999,0.271,0.214,0.889,0.954,0.001,0.84,0.111,0.046,0.159,0.366


## One-Sided Selection (OSS)
Combina Tomek Links com ENN para limpar a classe majoritária, removendo exemplos ruidosos e fronteiriços.

In [55]:
# One-Sided Selection (OSS)
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection()
x_treino_balanceado, y_treino_balanceado = oss.fit_resample(x_treino, y_treino)
teste_balanceamento(x_treino_balanceado, y_treino_balanceado)

Unnamed: 0,count,proportion
0,5077,94.438244
1,299,5.561756


Unnamed: 0,threshold,tn,fp,fn,tp,acuracia,precisao,sensibilidade,especificidade,f1,fb_score,valor_pre_posi,valor_pre_neg,taxa_falsos_positivos,taxa_falsos_negativos,fdr,fo_r,indice_youden,coef_matthews
0,10,1688,49,63,37,0.939,0.43,0.37,0.972,0.398,0.387,0.43,0.964,0.028,0.63,0.57,0.036,0.342,0.367
1,20,1713,24,69,31,0.949,0.564,0.31,0.986,0.4,0.36,0.564,0.961,0.014,0.69,0.436,0.039,0.296,0.394
2,30,1722,15,73,27,0.952,0.643,0.27,0.991,0.38,0.329,0.643,0.959,0.009,0.73,0.357,0.041,0.261,0.397
3,40,1729,8,75,25,0.955,0.758,0.25,0.995,0.376,0.315,0.758,0.958,0.005,0.75,0.242,0.042,0.245,0.419
4,50,1732,5,77,23,0.955,0.821,0.23,0.997,0.359,0.295,0.821,0.957,0.003,0.77,0.179,0.043,0.227,0.421
5,60,1732,5,80,20,0.954,0.8,0.2,0.997,0.32,0.26,0.8,0.956,0.003,0.8,0.2,0.044,0.197,0.386
6,70,1735,2,81,19,0.955,0.905,0.19,0.999,0.314,0.251,0.905,0.955,0.001,0.81,0.095,0.045,0.189,0.403
7,80,1735,2,84,16,0.953,0.889,0.16,0.999,0.271,0.214,0.889,0.954,0.001,0.84,0.111,0.046,0.159,0.366
8,90,1736,1,87,13,0.952,0.929,0.13,0.999,0.228,0.177,0.929,0.952,0.001,0.87,0.071,0.048,0.129,0.338


In [56]:
# Tomek Links
# Edited Nearest Neighbors (ENN)
# Repeated Edited Nearest Neighbours (RENN)
# One-Sided Selection (OSS)

# FIM