# 01 - Classificação com Validação Cruzada Estratificada

Este notebook realiza o pré-processamento, separação de atributos/rótulos e avaliação de três algoritmos supervisionados:
- Árvore de Decisão (Random Forest)
- K-NN (K-Nearest Neighbors)
- MLP (Multi-Layer Perceptron)

A avaliação utiliza F1-score e matriz de confusão via validação cruzada estratificada k-fold, com `random_state` fixo para reprodutibilidade.

Observação: Ajuste o caminho do arquivo da base de dados na seção de carregamento.


In [1]:
import pandas as pd

# Configurações gerais
RANDOM_STATE = 42
N_JOBS = -1
N_SPLITS = 5  # k-fold
TEST_SIZE = 0.2

# Exibição de opções do pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)


In [15]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    StratifiedKFold, 
    cross_validate, 
    cross_val_predict, 
    cross_val_score,
    train_test_split
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, make_scorer

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.utils import to_categorical


## Carregamento de dados

Substitua `data/arquivo.csv` pelo caminho real. O rótulo (target) deve estar em uma coluna, por exemplo `target`. Se a base tiver valores ausentes, faremos tratamento na etapa de pré-processamento.


## Pré-processamento

- Identificação de colunas numéricas e categóricas
- Tratamento de valores ausentes
- Padronização de numéricas e One-Hot em categóricas
- Separação de `X` e `y`


In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Carregando o arquivo JSON
sinal = pd.read_json('../data/Sinais/Sinais/Adição_AP_1.json')

# Extraindo todos os frames
frames = sinal['frames']

# Criando uma lista para armazenar os dados de todos os keypoints
dados_keypoints = []

# Iterando por todos os frames
for frame_idx, frame_data in enumerate(frames):
    # Iterando por todos os keypoints no frame
    for keypoint in frame_data['keypoints']:
        # Adicionando os dados do keypoint à lista
        dados_keypoints.append({
            'frame': frame_idx,
            'id': keypoint['id'],
            'x': keypoint['x'],
            'y': keypoint['y'],
            'z': keypoint['z'],
            'visibility': keypoint['visibility']
        })

# Convertendo para DataFrame
df_keypoints = pd.DataFrame(dados_keypoints)

# Calculando métricas para cada ID e cada atributo (x, y, z, visibility)
metricas = df_keypoints.groupby('id').agg({
    'x': ['mean', 'std', 'min', 'max'],
    'y': ['mean', 'std', 'min', 'max'],
    'z': ['mean', 'std', 'min', 'max'],
    'visibility': ['mean', 'std', 'min', 'max']
})

print("Métricas calculadas para cada ID:")
display(metricas)

Métricas calculadas para cada ID:


Unnamed: 0_level_0,x,x,x,x,y,y,y,y,z,z,z,z,visibility,visibility,visibility,visibility
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max,mean,std,min,max,mean,std,min,max
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,442.692308,6.140641,436,454,187.951049,11.869792,178,212,-1.271958,0.187734,-1.52,-0.89,1.0,0.0,1.0,1.0
1,467.713287,7.276842,461,481,156.468531,9.763978,147,176,-1.19979,0.178003,-1.43,-0.84,1.0,0.0,1.0,1.0
2,481.048951,6.061107,474,492,157.384615,10.097359,148,177,-1.20014,0.178278,-1.43,-0.84,1.0,0.0,1.0,1.0
3,491.58042,5.785378,484,502,158.566434,10.311049,149,179,-1.19986,0.178278,-1.43,-0.84,1.0,0.0,1.0,1.0
4,425.160839,7.259428,418,439,155.608392,8.369154,147,171,-1.197622,0.186966,-1.44,-0.82,1.0,0.0,1.0,1.0
5,411.412587,6.756665,405,424,156.055944,7.828061,147,170,-1.197972,0.187172,-1.44,-0.82,1.0,0.0,1.0,1.0
6,398.300699,7.045161,391,411,156.93007,7.223446,148,170,-1.198951,0.187432,-1.44,-0.82,1.0,0.0,1.0,1.0
7,509.797203,8.184113,502,524,176.216783,9.457198,166,194,-0.701678,0.153053,-0.89,-0.4,1.0,0.0,1.0,1.0
8,384.797203,4.798121,380,395,175.104895,6.60315,167,188,-0.681049,0.19014,-0.88,-0.31,1.0,0.0,1.0,1.0
9,471.244755,4.837088,465,479,226.846154,10.973276,219,249,-1.082098,0.177406,-1.31,-0.73,1.0,0.0,1.0,1.0


### Montando o dataframe

As partes com menor visibilidade e que no geral não são tão importantes para libras serão retiradas do df.

A visibilidade a partir do keypoint 25 (dos Joelhos para baixo) é muito baixa e também não é tão importante na língua de sinais.

In [16]:
sinais = pd.read_csv('../data/Sinais/sinais.csv')
print(sinais.shape)
sinais.head()

def extrair_metricas_do_arquivo(caminho_arquivo):
    # Carregando o arquivo JSON
    sinal = pd.read_json(caminho_arquivo)
    
    # Extraindo todos os frames
    frames = sinal['frames']
    
    # Criando uma lista para armazenar os dados de todos os keypoints
    dados_keypoints = []
    
    # Iterando por todos os frames
    for frame_idx, frame_data in enumerate(frames):
        # Iterando por todos os keypoints no frame
        for keypoint in frame_data['keypoints']:
            # Adicionando apenas os keypoints de 0 a 25
            if keypoint['id'] <= 24:
                dados_keypoints.append({
                    'frame': frame_idx,
                    'id': keypoint['id'],
                    'x': keypoint['x'],
                    'y': keypoint['y'],
                    'z': keypoint['z'],
                    'visibility': keypoint['visibility']
                })
    
    # Convertendo para DataFrame
    df_keypoints = pd.DataFrame(dados_keypoints)
    
    # Calculando métricas para cada ID e cada atributo (x, y, z, visibility)
    metricas = df_keypoints.groupby('id').agg({
        'x': ['mean', 'std', 'min', 'max'],
        'y': ['mean', 'std', 'min', 'max'],
        'z': ['mean', 'std', 'min', 'max'],
        'visibility': ['mean', 'std', 'min', 'max']
    })
    
    # Achatando o índice de colunas e renomeando
    metricas.columns = ['_'.join(col).strip() for col in metricas.columns.values]
    
    # Resetando o índice para transformar o ID em coluna
    metricas = metricas.reset_index()
    
    # Criando um dicionário com todas as métricas
    metricas_dict = {}
    for _, row in metricas.iterrows():
        id_keypoint = int(row['id'])  # Convertendo para inteiro
        for col in metricas.columns:
            if col != 'id':
                metricas_dict[f'id_{id_keypoint}_{col}'] = row[col]
    
    return metricas_dict

# Criando um DataFrame para armazenar os resultados
resultados = pd.DataFrame()

# Processando cada arquivo
for idx, row in sinais.iterrows():
    # Construindo o caminho completo para o arquivo JSON
    caminho_arquivo = f'../data/Sinais/Sinais/{row["file_name"]}'
    
    try:
        # Extraindo métricas do arquivo
        metricas_arquivo = extrair_metricas_do_arquivo(caminho_arquivo)
        
        # Criando um DataFrame temporário com os dados originais e as métricas
        temp_df = pd.DataFrame([{**row, **metricas_arquivo}])
        
        # Concatenando com o DataFrame de resultados
        resultados = pd.concat([resultados, temp_df], ignore_index=True)
        
        # Feedback para acompanhar o progresso
        if idx % 10 == 0:
            print(f"Processados {idx+1} arquivos de {len(sinais)}")
            
    except Exception as e:
        print(f"Erro ao processar o arquivo {row['file_name']}: {e}")


(2502, 7)
Processados 1 arquivos de 2502
Processados 11 arquivos de 2502
Processados 21 arquivos de 2502
Processados 31 arquivos de 2502
Processados 41 arquivos de 2502
Processados 51 arquivos de 2502
Processados 61 arquivos de 2502
Processados 71 arquivos de 2502
Processados 81 arquivos de 2502
Processados 91 arquivos de 2502
Processados 101 arquivos de 2502
Processados 111 arquivos de 2502
Processados 121 arquivos de 2502
Processados 131 arquivos de 2502
Processados 141 arquivos de 2502
Processados 151 arquivos de 2502
Processados 161 arquivos de 2502
Processados 171 arquivos de 2502
Processados 181 arquivos de 2502
Processados 191 arquivos de 2502
Processados 201 arquivos de 2502
Processados 211 arquivos de 2502
Processados 221 arquivos de 2502
Processados 231 arquivos de 2502
Processados 241 arquivos de 2502
Processados 251 arquivos de 2502
Processados 261 arquivos de 2502
Processados 271 arquivos de 2502
Processados 281 arquivos de 2502
Processados 291 arquivos de 2502
Processados

  sinal = pd.read_json(caminho_arquivo)


Processados 571 arquivos de 2502
Processados 581 arquivos de 2502
Processados 591 arquivos de 2502
Processados 601 arquivos de 2502
Processados 611 arquivos de 2502
Processados 621 arquivos de 2502
Processados 631 arquivos de 2502
Processados 641 arquivos de 2502
Processados 651 arquivos de 2502
Processados 661 arquivos de 2502
Processados 671 arquivos de 2502
Processados 681 arquivos de 2502
Processados 691 arquivos de 2502
Processados 701 arquivos de 2502
Processados 711 arquivos de 2502
Processados 721 arquivos de 2502
Processados 731 arquivos de 2502
Processados 741 arquivos de 2502
Processados 751 arquivos de 2502
Processados 761 arquivos de 2502
Processados 771 arquivos de 2502
Processados 781 arquivos de 2502
Processados 791 arquivos de 2502
Processados 801 arquivos de 2502
Processados 811 arquivos de 2502
Processados 821 arquivos de 2502
Processados 831 arquivos de 2502
Processados 841 arquivos de 2502
Processados 851 arquivos de 2502
Processados 861 arquivos de 2502
Processado

In [17]:
resultados.head()


Unnamed: 0,file_name,width,height,duration_sec,num_frames,sinal,interprete,id_0_x_mean,id_0_x_std,id_0_x_min,id_0_x_max,id_0_y_mean,id_0_y_std,id_0_y_min,id_0_y_max,id_0_z_mean,id_0_z_std,id_0_z_min,id_0_z_max,id_0_visibility_mean,id_0_visibility_std,id_0_visibility_min,id_0_visibility_max,id_1_x_mean,id_1_x_std,id_1_x_min,id_1_x_max,id_1_y_mean,id_1_y_std,id_1_y_min,id_1_y_max,id_1_z_mean,id_1_z_std,id_1_z_min,id_1_z_max,id_1_visibility_mean,id_1_visibility_std,id_1_visibility_min,id_1_visibility_max,id_2_x_mean,id_2_x_std,id_2_x_min,id_2_x_max,id_2_y_mean,id_2_y_std,id_2_y_min,id_2_y_max,id_2_z_mean,id_2_z_std,id_2_z_min,id_2_z_max,id_2_visibility_mean,id_2_visibility_std,id_2_visibility_min,id_2_visibility_max,id_3_x_mean,id_3_x_std,id_3_x_min,id_3_x_max,id_3_y_mean,id_3_y_std,id_3_y_min,id_3_y_max,id_3_z_mean,id_3_z_std,id_3_z_min,id_3_z_max,id_3_visibility_mean,id_3_visibility_std,id_3_visibility_min,id_3_visibility_max,id_4_x_mean,id_4_x_std,id_4_x_min,id_4_x_max,id_4_y_mean,id_4_y_std,id_4_y_min,id_4_y_max,id_4_z_mean,id_4_z_std,id_4_z_min,id_4_z_max,id_4_visibility_mean,id_4_visibility_std,id_4_visibility_min,id_4_visibility_max,id_5_x_mean,id_5_x_std,id_5_x_min,id_5_x_max,id_5_y_mean,id_5_y_std,id_5_y_min,id_5_y_max,id_5_z_mean,id_5_z_std,id_5_z_min,id_5_z_max,id_5_visibility_mean,id_5_visibility_std,id_5_visibility_min,id_5_visibility_max,id_6_x_mean,id_6_x_std,id_6_x_min,id_6_x_max,id_6_y_mean,id_6_y_std,id_6_y_min,id_6_y_max,id_6_z_mean,id_6_z_std,id_6_z_min,id_6_z_max,id_6_visibility_mean,id_6_visibility_std,id_6_visibility_min,id_6_visibility_max,id_7_x_mean,id_7_x_std,id_7_x_min,id_7_x_max,id_7_y_mean,id_7_y_std,id_7_y_min,id_7_y_max,id_7_z_mean,id_7_z_std,id_7_z_min,id_7_z_max,id_7_visibility_mean,id_7_visibility_std,id_7_visibility_min,id_7_visibility_max,id_8_x_mean,id_8_x_std,id_8_x_min,id_8_x_max,id_8_y_mean,id_8_y_std,id_8_y_min,id_8_y_max,id_8_z_mean,id_8_z_std,id_8_z_min,id_8_z_max,id_8_visibility_mean,id_8_visibility_std,id_8_visibility_min,id_8_visibility_max,id_9_x_mean,id_9_x_std,id_9_x_min,id_9_x_max,id_9_y_mean,id_9_y_std,id_9_y_min,id_9_y_max,id_9_z_mean,id_9_z_std,id_9_z_min,id_9_z_max,id_9_visibility_mean,id_9_visibility_std,id_9_visibility_min,id_9_visibility_max,id_10_x_mean,id_10_x_std,id_10_x_min,id_10_x_max,id_10_y_mean,id_10_y_std,id_10_y_min,id_10_y_max,id_10_z_mean,id_10_z_std,id_10_z_min,id_10_z_max,id_10_visibility_mean,id_10_visibility_std,id_10_visibility_min,id_10_visibility_max,id_11_x_mean,id_11_x_std,id_11_x_min,id_11_x_max,id_11_y_mean,id_11_y_std,id_11_y_min,id_11_y_max,id_11_z_mean,id_11_z_std,id_11_z_min,id_11_z_max,id_11_visibility_mean,id_11_visibility_std,id_11_visibility_min,id_11_visibility_max,id_12_x_mean,id_12_x_std,id_12_x_min,id_12_x_max,id_12_y_mean,id_12_y_std,id_12_y_min,id_12_y_max,id_12_z_mean,id_12_z_std,id_12_z_min,id_12_z_max,id_12_visibility_mean,id_12_visibility_std,id_12_visibility_min,id_12_visibility_max,id_13_x_mean,id_13_x_std,id_13_x_min,id_13_x_max,id_13_y_mean,id_13_y_std,id_13_y_min,id_13_y_max,id_13_z_mean,id_13_z_std,id_13_z_min,id_13_z_max,id_13_visibility_mean,id_13_visibility_std,id_13_visibility_min,id_13_visibility_max,id_14_x_mean,id_14_x_std,id_14_x_min,id_14_x_max,id_14_y_mean,id_14_y_std,id_14_y_min,id_14_y_max,id_14_z_mean,id_14_z_std,id_14_z_min,id_14_z_max,id_14_visibility_mean,id_14_visibility_std,id_14_visibility_min,id_14_visibility_max,id_15_x_mean,id_15_x_std,id_15_x_min,id_15_x_max,id_15_y_mean,id_15_y_std,id_15_y_min,id_15_y_max,id_15_z_mean,id_15_z_std,id_15_z_min,id_15_z_max,id_15_visibility_mean,id_15_visibility_std,id_15_visibility_min,id_15_visibility_max,id_16_x_mean,id_16_x_std,id_16_x_min,id_16_x_max,id_16_y_mean,id_16_y_std,id_16_y_min,id_16_y_max,id_16_z_mean,id_16_z_std,id_16_z_min,id_16_z_max,id_16_visibility_mean,id_16_visibility_std,id_16_visibility_min,id_16_visibility_max,id_17_x_mean,id_17_x_std,id_17_x_min,id_17_x_max,id_17_y_mean,id_17_y_std,id_17_y_min,id_17_y_max,id_17_z_mean,id_17_z_std,id_17_z_min,id_17_z_max,id_17_visibility_mean,id_17_visibility_std,id_17_visibility_min,id_17_visibility_max,id_18_x_mean,id_18_x_std,id_18_x_min,id_18_x_max,id_18_y_mean,id_18_y_std,id_18_y_min,id_18_y_max,id_18_z_mean,id_18_z_std,id_18_z_min,id_18_z_max,id_18_visibility_mean,id_18_visibility_std,id_18_visibility_min,id_18_visibility_max,id_19_x_mean,id_19_x_std,id_19_x_min,id_19_x_max,id_19_y_mean,id_19_y_std,id_19_y_min,id_19_y_max,id_19_z_mean,id_19_z_std,id_19_z_min,id_19_z_max,id_19_visibility_mean,id_19_visibility_std,id_19_visibility_min,id_19_visibility_max,id_20_x_mean,id_20_x_std,id_20_x_min,id_20_x_max,id_20_y_mean,id_20_y_std,id_20_y_min,id_20_y_max,id_20_z_mean,id_20_z_std,id_20_z_min,id_20_z_max,id_20_visibility_mean,id_20_visibility_std,id_20_visibility_min,id_20_visibility_max,id_21_x_mean,id_21_x_std,id_21_x_min,id_21_x_max,id_21_y_mean,id_21_y_std,id_21_y_min,id_21_y_max,id_21_z_mean,id_21_z_std,id_21_z_min,id_21_z_max,id_21_visibility_mean,id_21_visibility_std,id_21_visibility_min,id_21_visibility_max,id_22_x_mean,id_22_x_std,id_22_x_min,id_22_x_max,id_22_y_mean,id_22_y_std,id_22_y_min,id_22_y_max,id_22_z_mean,id_22_z_std,id_22_z_min,id_22_z_max,id_22_visibility_mean,id_22_visibility_std,id_22_visibility_min,id_22_visibility_max,id_23_x_mean,id_23_x_std,id_23_x_min,id_23_x_max,id_23_y_mean,id_23_y_std,id_23_y_min,id_23_y_max,id_23_z_mean,id_23_z_std,id_23_z_min,id_23_z_max,id_23_visibility_mean,id_23_visibility_std,id_23_visibility_min,id_23_visibility_max,id_24_x_mean,id_24_x_std,id_24_x_min,id_24_x_max,id_24_y_mean,id_24_y_std,id_24_y_min,id_24_y_max,id_24_z_mean,id_24_z_std,id_24_z_min,id_24_z_max,id_24_visibility_mean,id_24_visibility_std,id_24_visibility_min,id_24_visibility_max
0,Adição_AP_10.json,738,1008,4.533333,136,Adição,Alexson,419.308824,5.288022,410.0,428.0,190.963235,8.858901,181.0,207.0,-1.185,0.12779,-1.41,-0.94,1.0,0.0,1.0,1.0,444.764706,6.697966,435.0,455.0,160.588235,7.11646,153.0,173.0,-1.116324,0.117661,-1.34,-0.89,1.0,0.0,1.0,1.0,458.522059,6.054037,450.0,467.0,161.698529,7.374952,153.0,175.0,-1.116471,0.117867,-1.34,-0.89,1.0,0.0,1.0,1.0,469.551471,4.661616,463.0,477.0,163.029412,7.374572,154.0,177.0,-1.11625,0.117914,-1.34,-0.89,1.0,0.0,1.0,1.0,403.022059,6.082722,394.0,412.0,159.764706,5.945724,152.0,170.0,-1.109779,0.126932,-1.33,-0.87,1.0,0.0,1.0,1.0,389.816176,6.157443,380.0,399.0,160.279412,5.244667,153.0,169.0,-1.109853,0.126959,-1.33,-0.87,1.0,0.0,1.0,1.0,377.257353,5.572414,369.0,387.0,160.941176,4.848466,154.0,170.0,-1.110882,0.127166,-1.33,-0.87,1.0,0.0,1.0,1.0,488.867647,6.710753,481.0,498.0,180.838235,6.442016,170.0,194.0,-0.627721,0.096877,-0.86,-0.46,1.0,0.0,1.0,1.0,365.786765,3.995197,360.0,375.0,177.764706,3.966966,172.0,184.0,-0.587794,0.138659,-0.82,-0.34,1.0,0.0,1.0,1.0,447.154412,4.172003,439.0,455.0,229.095588,8.065361,220.0,243.0,-1.000294,0.118908,-1.23,-0.78,1.0,0.0,1.0,1.0,393.735294,5.613786,384.0,405.0,227.602941,8.322463,218.0,242.0,-0.990221,0.130818,-1.22,-0.75,1.0,0.0,1.0,1.0,600.845588,2.511835,597.0,607.0,381.169118,4.561603,372.0,386.0,-0.383235,0.036654,-0.57,-0.32,1.0,0.0,1.0,1.0,260.779412,5.778237,254.0,279.0,375.198529,5.37459,363.0,382.0,-0.27875,0.13665,-0.51,-0.02,1.0,0.0,1.0,1.0,656.051471,20.950142,616.0,678.0,643.316176,5.452423,637.0,655.0,-0.569191,0.269147,-1.04,-0.3,1.0,0.0,1.0,1.0,145.330882,65.398387,48.0,204.0,589.279412,63.158905,481.0,644.0,-0.554485,0.211509,-0.93,-0.33,1.0,0.0,1.0,1.0,481.242647,37.984109,424.0,515.0,688.735294,137.498134,455.0,802.0,-1.214265,0.342309,-1.93,-0.79,0.986985,0.009453,0.97,1.0,300.536765,64.089163,191.0,360.0,637.566176,188.854547,348.0,807.0,-1.309118,0.415668,-2.12,-0.85,0.982868,0.011149,0.97,1.0,433.102941,41.610234,369.0,470.0,722.911765,187.441664,407.0,872.0,-1.399632,0.340042,-2.12,-0.93,0.964191,0.021101,0.9,0.99,350.485294,58.612347,248.0,406.0,674.823529,227.9561,323.0,875.0,-1.481103,0.42513,-2.32,-1.0,0.951029,0.017139,0.92,0.98,421.889706,32.517727,369.0,454.0,691.985294,186.493252,380.0,841.0,-1.403382,0.277561,-2.02,-0.92,0.966397,0.019039,0.91,0.99,377.522059,58.173937,278.0,433.0,648.352941,215.872168,316.0,839.0,-1.485588,0.348762,-2.26,-1.07,0.953529,0.014065,0.93,0.98,433.073529,31.556799,383.0,463.0,675.933824,168.908838,394.0,814.0,-1.223162,0.318321,-1.9,-0.78,0.965809,0.01872,0.91,0.99,369.75,57.658991,271.0,426.0,639.227941,201.162583,332.0,818.0,-1.322279,0.394197,-2.1,-0.88,0.950882,0.016621,0.92,0.98,533.044118,2.321773,525.0,537.0,886.25,9.420034,871.0,901.0,-0.040221,0.015562,-0.08,-0.01,0.983088,0.008565,0.97,1.0,312.830882,5.048733,304.0,321.0,886.632353,6.753664,873.0,902.0,0.042941,0.016605,0.01,0.08,0.982721,0.008207,0.97,1.0
1,Adição_AP_1.json,774,1006,4.766667,143,Adição,Alexson,442.692308,6.140641,436.0,454.0,187.951049,11.869792,178.0,212.0,-1.271958,0.187734,-1.52,-0.89,1.0,0.0,1.0,1.0,467.713287,7.276842,461.0,481.0,156.468531,9.763978,147.0,176.0,-1.19979,0.178003,-1.43,-0.84,1.0,0.0,1.0,1.0,481.048951,6.061107,474.0,492.0,157.384615,10.097359,148.0,177.0,-1.20014,0.178278,-1.43,-0.84,1.0,0.0,1.0,1.0,491.58042,5.785378,484.0,502.0,158.566434,10.311049,149.0,179.0,-1.19986,0.178278,-1.43,-0.84,1.0,0.0,1.0,1.0,425.160839,7.259428,418.0,439.0,155.608392,8.369154,147.0,171.0,-1.197622,0.186966,-1.44,-0.82,1.0,0.0,1.0,1.0,411.412587,6.756665,405.0,424.0,156.055944,7.828061,147.0,170.0,-1.197972,0.187172,-1.44,-0.82,1.0,0.0,1.0,1.0,398.300699,7.045161,391.0,411.0,156.93007,7.223446,148.0,170.0,-1.198951,0.187432,-1.44,-0.82,1.0,0.0,1.0,1.0,509.797203,8.184113,502.0,524.0,176.216783,9.457198,166.0,194.0,-0.701678,0.153053,-0.89,-0.4,1.0,0.0,1.0,1.0,384.797203,4.798121,380.0,395.0,175.104895,6.60315,167.0,188.0,-0.681049,0.19014,-0.88,-0.31,1.0,0.0,1.0,1.0,471.244755,4.837088,465.0,479.0,226.846154,10.973276,219.0,249.0,-1.082098,0.177406,-1.31,-0.73,1.0,0.0,1.0,1.0,417.741259,5.932381,410.0,427.0,226.34965,10.133099,218.0,247.0,-1.079021,0.187069,-1.31,-0.71,1.0,0.0,1.0,1.0,625.783217,2.368038,622.0,630.0,382.146853,3.339811,375.0,390.0,-0.42972,0.069919,-0.58,-0.3,1.0,0.0,1.0,1.0,278.272727,5.061845,271.0,290.0,384.13986,10.303562,360.0,397.0,-0.354685,0.160925,-0.54,-0.04,1.0,0.0,1.0,1.0,697.811189,15.83451,666.0,724.0,641.321678,5.070769,632.0,653.0,-0.563846,0.244736,-1.07,-0.31,0.998951,0.003075,0.99,1.0,164.559441,71.486099,46.0,225.0,606.454545,78.719034,462.0,664.0,-0.558322,0.205299,-1.01,-0.31,1.0,0.0,1.0,1.0,530.076923,39.007778,460.0,563.0,712.734266,134.443818,462.0,807.0,-1.189371,0.304604,-1.97,-0.79,0.986783,0.010319,0.96,1.0,335.258741,60.358986,224.0,386.0,672.517483,197.26492,332.0,826.0,-1.239161,0.374545,-2.19,-0.85,0.98028,0.008303,0.97,1.0,489.965035,43.443084,417.0,528.0,750.202797,179.724616,419.0,878.0,-1.37007,0.299113,-2.17,-0.93,0.96007,0.024394,0.89,0.99,392.335664,60.26572,280.0,447.0,713.671329,230.283193,313.0,892.0,-1.405594,0.382353,-2.41,-1.01,0.943357,0.016399,0.91,0.97,476.937063,35.930531,417.0,512.0,719.734266,177.897856,393.0,844.0,-1.381469,0.238005,-2.07,-0.91,0.962168,0.021629,0.9,0.99,420.111888,57.379648,315.0,469.0,684.237762,215.689101,309.0,855.0,-1.412797,0.303965,-2.34,-1.08,0.945245,0.013206,0.92,0.96,483.916084,33.788175,428.0,517.0,703.769231,163.030313,404.0,821.0,-1.201049,0.280635,-1.95,-0.78,0.961469,0.021162,0.9,0.99,409.328671,56.236645,305.0,459.0,673.846154,202.098125,325.0,835.0,-1.251888,0.35246,-2.18,-0.88,0.943147,0.0147,0.92,0.97,565.104895,2.827713,554.0,570.0,901.468531,6.165333,889.0,912.0,-0.045944,0.017532,-0.08,-0.02,0.978042,0.008329,0.97,0.99,340.678322,6.666715,328.0,351.0,906.251748,4.317453,889.0,915.0,0.048112,0.017237,0.02,0.08,0.977483,0.008262,0.97,0.99
2,Adição_AP_2.json,760,1002,4.433333,133,Adição,Alexson,439.075188,7.183339,431.0,451.0,188.413534,9.602026,179.0,210.0,-1.230677,0.195018,-1.45,-0.88,1.0,0.0,1.0,1.0,463.93985,8.59242,455.0,477.0,157.406015,8.264599,150.0,175.0,-1.162707,0.18665,-1.37,-0.83,1.0,0.0,1.0,1.0,476.992481,7.214775,469.0,488.0,158.661654,8.669773,151.0,176.0,-1.162782,0.187052,-1.37,-0.83,1.0,0.0,1.0,1.0,487.511278,6.568539,480.0,497.0,159.834586,8.88238,152.0,178.0,-1.162632,0.186937,-1.37,-0.83,1.0,0.0,1.0,1.0,422.24812,8.311553,414.0,435.0,155.503759,6.203157,149.0,170.0,-1.159173,0.194071,-1.38,-0.82,1.0,0.0,1.0,1.0,408.909774,7.632128,401.0,421.0,155.533835,5.378899,150.0,169.0,-1.159248,0.194113,-1.38,-0.82,1.0,0.0,1.0,1.0,396.270677,7.870281,389.0,408.0,156.045113,4.77345,151.0,169.0,-1.159474,0.194299,-1.38,-0.82,1.0,0.0,1.0,1.0,506.18797,8.710104,496.0,519.0,176.766917,8.169499,169.0,191.0,-0.675038,0.16425,-0.87,-0.4,1.0,0.0,1.0,1.0,382.969925,5.191689,376.0,394.0,173.285714,4.147967,168.0,184.0,-0.649925,0.194498,-0.87,-0.33,1.0,0.0,1.0,1.0,466.676692,5.356317,460.0,476.0,226.556391,9.216597,218.0,246.0,-1.04594,0.185799,-1.25,-0.72,1.0,0.0,1.0,1.0,414.548872,5.775426,405.0,427.0,225.398496,8.536854,216.0,245.0,-1.039925,0.194662,-1.26,-0.7,1.0,0.0,1.0,1.0,619.421053,2.477812,616.0,626.0,376.571429,5.871459,366.0,385.0,-0.418045,0.072294,-0.52,-0.27,1.0,0.0,1.0,1.0,277.488722,5.367716,270.0,294.0,376.353383,13.437958,349.0,391.0,-0.329023,0.160326,-0.51,-0.06,1.0,0.0,1.0,1.0,685.233083,22.262173,646.0,724.0,632.721805,3.477968,628.0,640.0,-0.586165,0.244282,-1.13,-0.36,1.0,0.0,1.0,1.0,152.827068,73.501032,49.0,218.0,584.488722,80.854491,458.0,656.0,-0.581203,0.21705,-1.0,-0.34,1.0,0.0,1.0,1.0,511.533835,41.671978,449.0,548.0,677.24812,143.655702,449.0,790.0,-1.208496,0.320448,-2.04,-0.81,0.987895,0.009379,0.97,1.0,313.631579,66.019565,213.0,374.0,628.864662,202.703724,321.0,802.0,-1.291579,0.400002,-2.04,-0.86,0.982932,0.010995,0.97,1.0,467.706767,47.29738,398.0,511.0,706.827068,189.612025,410.0,857.0,-1.388421,0.317115,-2.24,-0.95,0.96406,0.021604,0.92,0.99,369.541353,65.518967,268.0,434.0,663.518797,236.924985,305.0,862.0,-1.459023,0.408296,-2.24,-1.01,0.94812,0.018593,0.91,0.97,456.082707,39.59818,398.0,497.0,676.992481,187.907879,383.0,827.0,-1.386992,0.253894,-2.14,-0.94,0.966391,0.019359,0.92,0.99,396.954887,62.289508,302.0,458.0,638.097744,222.554279,301.0,821.0,-1.45609,0.32767,-2.17,-1.07,0.948647,0.014501,0.92,0.97,464.556391,36.725108,413.0,501.0,663.864662,172.504228,393.0,799.0,-1.216015,0.29715,-2.01,-0.8,0.96594,0.018991,0.92,0.99,387.443609,60.4574,294.0,447.0,629.744361,208.692526,314.0,803.0,-1.301429,0.377076,-2.03,-0.88,0.947519,0.018439,0.92,0.97,557.924812,3.763107,545.0,564.0,888.458647,5.138257,881.0,901.0,-0.039925,0.010335,-0.06,-0.01,0.978872,0.009821,0.96,0.99,337.93985,6.027411,327.0,345.0,892.383459,5.1709,885.0,902.0,0.04406,0.010375,0.02,0.07,0.978195,0.010063,0.96,0.99
3,Adição_AP_3.json,762,1000,4.933333,148,Adição,Alexson,440.195946,6.152991,430.0,452.0,188.554054,9.421559,180.0,205.0,-1.263378,0.194847,-1.48,-0.93,1.0,0.0,1.0,1.0,465.5,7.175639,454.0,478.0,156.648649,8.288887,149.0,170.0,-1.193446,0.187648,-1.4,-0.88,1.0,0.0,1.0,1.0,479.668919,6.452747,469.0,490.0,157.385135,8.494515,149.0,172.0,-1.193716,0.188016,-1.4,-0.88,1.0,0.0,1.0,1.0,490.621622,5.031067,482.0,498.0,158.567568,8.283384,150.0,172.0,-1.193446,0.187884,-1.4,-0.88,1.0,0.0,1.0,1.0,423.533784,6.892188,412.0,435.0,156.094595,7.638372,148.0,169.0,-1.188581,0.196483,-1.41,-0.86,1.0,0.0,1.0,1.0,410.216216,6.818882,398.0,422.0,156.493243,7.216286,149.0,168.0,-1.188919,0.196943,-1.41,-0.86,1.0,0.0,1.0,1.0,397.587838,6.619831,387.0,410.0,157.439189,6.698793,151.0,168.0,-1.189865,0.196901,-1.41,-0.86,1.0,0.0,1.0,1.0,508.871622,5.811413,500.0,517.0,175.716216,7.037355,169.0,188.0,-0.698243,0.164841,-0.87,-0.43,1.0,0.0,1.0,1.0,385.945946,4.978571,378.0,396.0,174.527027,5.36231,168.0,183.0,-0.667568,0.202899,-0.88,-0.34,1.0,0.0,1.0,1.0,468.594595,4.847101,460.0,478.0,226.797297,9.167348,218.0,243.0,-1.075405,0.184945,-1.28,-0.77,1.0,0.0,1.0,1.0,415.804054,6.483531,404.0,429.0,225.925676,9.029122,217.0,242.0,-1.067838,0.195739,-1.28,-0.74,1.0,0.0,1.0,1.0,624.716216,2.013696,623.0,632.0,376.912162,3.524316,372.0,383.0,-0.436689,0.067395,-0.53,-0.27,1.0,0.0,1.0,1.0,281.452703,6.57281,273.0,300.0,380.682432,6.556958,368.0,389.0,-0.331284,0.179012,-0.52,-0.06,1.0,0.0,1.0,1.0,685.195946,20.924113,646.0,708.0,636.432432,3.164777,632.0,645.0,-0.595946,0.246813,-1.02,-0.36,0.998649,0.00343,0.99,1.0,163.0,76.298198,46.0,229.0,596.844595,74.202216,470.0,656.0,-0.555203,0.187535,-0.86,-0.35,0.999392,0.002398,0.99,1.0,513.054054,44.122862,445.0,552.0,689.831081,132.287534,470.0,789.0,-1.231892,0.328943,-1.94,-0.79,0.98527,0.011806,0.96,1.0,323.148649,66.046403,215.0,381.0,646.810811,192.45119,354.0,804.0,-1.275541,0.389249,-2.05,-0.86,0.980878,0.013347,0.95,1.0,469.405405,51.233471,394.0,516.0,721.439189,176.693371,431.0,856.0,-1.414054,0.326252,-2.14,-0.93,0.95973,0.026469,0.88,0.99,381.364865,64.583875,278.0,436.0,681.439189,227.192304,336.0,864.0,-1.443243,0.398665,-2.25,-1.01,0.943514,0.025445,0.87,0.98,458.398649,44.275689,390.0,502.0,690.236486,175.657535,404.0,828.0,-1.419392,0.264542,-2.07,-0.91,0.962365,0.023829,0.89,0.99,407.162162,62.315093,308.0,460.0,653.337838,212.304446,333.0,824.0,-1.445811,0.315733,-2.16,-1.08,0.946081,0.020392,0.88,0.97,466.932432,41.080254,405.0,507.0,676.486486,160.659376,415.0,799.0,-1.241892,0.305891,-1.92,-0.78,0.960405,0.024823,0.89,0.99,396.952703,60.35556,297.0,450.0,644.959459,198.138304,351.0,805.0,-1.286419,0.366062,-2.03,-0.89,0.942635,0.024726,0.87,0.98,564.060811,3.216087,557.0,570.0,892.662162,6.196942,883.0,902.0,-0.047635,0.008834,-0.07,-0.02,0.975608,0.012357,0.94,0.99,342.858108,6.059804,331.0,351.0,897.256757,3.840567,888.0,902.0,0.051419,0.009477,0.02,0.07,0.975068,0.013067,0.93,0.99
4,Adição_AP_4.json,764,1004,4.6,138,Adição,Alexson,439.0,6.014581,431.0,449.0,189.536232,9.858976,179.0,208.0,-1.267754,0.200474,-1.53,-0.91,1.0,0.0,1.0,1.0,464.231884,7.350743,456.0,475.0,157.557971,8.47669,149.0,173.0,-1.197754,0.193176,-1.45,-0.86,1.0,0.0,1.0,1.0,477.797101,6.008703,470.0,486.0,158.652174,8.921926,150.0,175.0,-1.198043,0.193639,-1.45,-0.86,1.0,0.0,1.0,1.0,488.471014,5.317911,482.0,496.0,159.869565,8.978341,151.0,176.0,-1.197681,0.193599,-1.45,-0.85,1.0,0.0,1.0,1.0,421.985507,6.856172,414.0,432.0,156.702899,6.888485,149.0,169.0,-1.194058,0.199355,-1.45,-0.84,1.0,0.0,1.0,1.0,408.485507,6.150468,401.0,418.0,157.26087,6.089724,149.0,168.0,-1.194638,0.199971,-1.46,-0.84,1.0,0.0,1.0,1.0,395.637681,6.178538,388.0,406.0,158.166667,5.358101,151.0,167.0,-1.19529,0.199884,-1.46,-0.85,1.0,0.0,1.0,1.0,506.884058,7.005808,499.0,518.0,177.630435,7.914761,170.0,192.0,-0.698986,0.174035,-0.92,-0.4,1.0,0.0,1.0,1.0,382.681159,3.59437,377.0,390.0,176.014493,4.491656,169.0,183.0,-0.673623,0.199835,-0.91,-0.35,1.0,0.0,1.0,1.0,467.57971,4.710311,461.0,476.0,227.536232,9.51001,217.0,245.0,-1.078261,0.192528,-1.32,-0.74,1.0,0.0,1.0,1.0,414.362319,5.391182,405.0,426.0,226.615942,9.064507,216.0,243.0,-1.072609,0.199515,-1.33,-0.73,1.0,0.0,1.0,1.0,622.326087,2.776762,618.0,630.0,378.862319,5.673851,370.0,388.0,-0.430797,0.089411,-0.56,-0.28,1.0,0.0,1.0,1.0,277.0,6.113286,269.0,293.0,382.282609,11.716224,359.0,393.0,-0.343188,0.165366,-0.52,-0.05,1.0,0.0,1.0,1.0,688.028986,19.474461,651.0,718.0,639.224638,4.27011,632.0,647.0,-0.605507,0.237506,-1.06,-0.38,1.0,0.0,1.0,1.0,161.673913,73.682705,48.0,225.0,598.652174,80.738078,460.0,665.0,-0.57942,0.219707,-0.95,-0.32,1.0,0.0,1.0,1.0,516.963768,37.598806,458.0,554.0,688.913043,143.450669,455.0,801.0,-1.255725,0.328975,-1.96,-0.88,0.987536,0.009107,0.97,1.0,326.601449,62.287131,220.0,378.0,644.26087,201.146397,333.0,807.0,-1.292464,0.408349,-2.14,-0.83,0.980725,0.009714,0.97,1.0,474.26087,41.673269,407.0,515.0,719.391304,189.648751,410.0,869.0,-1.441449,0.326413,-2.16,-1.02,0.962319,0.019864,0.94,0.99,382.26087,61.547715,273.0,435.0,680.927536,235.779516,315.0,868.0,-1.462319,0.417284,-2.36,-0.99,0.946087,0.014369,0.92,0.97,462.471014,34.717671,400.0,496.0,690.166667,188.299869,385.0,838.0,-1.446957,0.262424,-2.07,-1.01,0.964783,0.017889,0.94,0.99,409.42029,59.245969,305.0,460.0,653.862319,222.946728,310.0,833.0,-1.467971,0.334492,-2.29,-1.07,0.948913,0.010786,0.93,0.97,471.123188,32.360527,416.0,502.0,675.949275,172.899757,396.0,812.0,-1.265797,0.30525,-1.94,-0.87,0.963116,0.018356,0.94,0.99,399.637681,58.123948,296.0,450.0,645.282609,208.841084,326.0,813.0,-1.30413,0.383763,-2.13,-0.87,0.945725,0.013823,0.92,0.97,562.673913,2.696749,554.0,568.0,894.818841,3.930483,889.0,902.0,-0.04942,0.011947,-0.07,-0.02,0.977319,0.009244,0.96,0.99,340.23913,6.380347,330.0,350.0,900.137681,5.523925,891.0,910.0,0.052029,0.011018,0.02,0.08,0.977391,0.009537,0.96,0.99


### EDA

Procurando por valores nulos, outliers e outros problemas.

In [18]:
# Análise Exploratória dos Dados

# Verificando as dimensões do dataset
print(f"Dimensões do dataset: {resultados.shape}")

# Verificando os tipos de dados
print("\nTipos de dados:")
print(resultados.dtypes)

# Verificando valores nulos
print("\nValores nulos por coluna:")
print(resultados.isnull().sum())

# Estatísticas descritivas para colunas numéricas
print("\nEstatísticas descritivas para colunas numéricas:")
print(resultados.describe())

# Verificando a distribuição dos sinais (classes)
print("\nDistribuição dos sinais (classes):")
print(resultados['sinal'].value_counts())

# Verificando a distribuição dos intérpretes
print("\nDistribuição dos intérpretes:")
print(resultados['interprete'].value_counts())

# Identificando possíveis outliers em colunas numéricas
print("\nIdentificando outliers em algumas colunas numéricas:")
colunas_numericas = resultados.select_dtypes(include=['int64', 'float64']).columns 

for coluna in colunas_numericas:
    Q1 = resultados[coluna].quantile(0.25)
    Q3 = resultados[coluna].quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR
    outliers = resultados[(resultados[coluna] < limite_inferior) | (resultados[coluna] > limite_superior)]
    print(f"\nOutliers na coluna {coluna}: {len(outliers)} ({len(outliers)/len(resultados)*100:.2f}%)")

# Criando uma cópia do dataframe para trabalhar com a classificação
df = resultados.copy()

# Criando uma coluna target numérica a partir da coluna 'sinal'
le = LabelEncoder()
df['target'] = le.fit_transform(df['sinal'])

# Mapeamento das classes
print("\nMapeamento das classes (sinal -> target):")
for i, classe in enumerate(le.classes_):
    print(f"{classe} -> {i}")


Dimensões do dataset: (2501, 407)

Tipos de dados:
file_name                 object
width                      int64
height                     int64
duration_sec             float64
num_frames                 int64
sinal                     object
interprete                object
id_0_x_mean              float64
id_0_x_std               float64
id_0_x_min               float64
id_0_x_max               float64
id_0_y_mean              float64
id_0_y_std               float64
id_0_y_min               float64
id_0_y_max               float64
id_0_z_mean              float64
id_0_z_std               float64
id_0_z_min               float64
id_0_z_max               float64
id_0_visibility_mean     float64
id_0_visibility_std      float64
id_0_visibility_min      float64
id_0_visibility_max      float64
id_1_x_mean              float64
id_1_x_std               float64
id_1_x_min               float64
id_1_x_max               float64
id_1_y_mean              float64
id_1_y_std               

### Retirando elementos com baixa visibilidade

In [19]:
# Definindo um limiar para considerar a visibilidade como baixa
limiar_visibilidade = 0.5

# Criando uma lista para armazenar as linhas com visibilidade baixa
linhas_visibilidade_baixa = []

# Identificando todas as colunas relacionadas à visibilidade
colunas_visibilidade = [col for col in resultados.columns if 'visibility' in col]

# Iterando sobre cada linha do DataFrame
for idx, row in resultados.iterrows():
    # Verificando se alguma coluna de visibilidade média está abaixo do limiar
    colunas_visibilidade_media = [col for col in colunas_visibilidade if 'visibility_mean' in col]
    
    # Se alguma coluna de visibilidade média estiver abaixo do limiar, adiciona à lista
    if any(row[col] < limiar_visibilidade for col in colunas_visibilidade_media):
        linhas_visibilidade_baixa.append(idx)

# Exibindo as linhas com visibilidade baixa
print(f"Foram encontradas {len(linhas_visibilidade_baixa)} linhas com visibilidade média abaixo de {limiar_visibilidade}")

# Selecionando apenas as colunas de visibilidade e informações básicas
colunas_para_exibir = ['file_name', 'sinal', 'interprete'] + colunas_visibilidade

# Exibindo todas as linhas com visibilidade baixa, mas apenas as colunas relacionadas à visibilidade
print("\nLinhas com visibilidade baixa (apenas colunas de visibilidade):")

# Verificando quais sinais têm mais problemas de visibilidade
sinais_problematicos = resultados.iloc[linhas_visibilidade_baixa]['sinal'].value_counts()


# Removendo as linhas com visibilidade baixa do DataFrame
resultados = resultados.drop(linhas_visibilidade_baixa)
print(f"\nDataFrame após remover linhas com visibilidade baixa: {resultados.shape}")


Foram encontradas 13 linhas com visibilidade média abaixo de 0.5

Linhas com visibilidade baixa (apenas colunas de visibilidade):

DataFrame após remover linhas com visibilidade baixa: (2488, 407)


### Remoção de colunas

Vou remover as colunas que nao tem relacao com a classificação

In [20]:
# Identificando todas as colunas relacionadas à visibilidade
colunas_visibilidade = [col for col in resultados.columns if 'visibility' in col]

# Exibindo as colunas que serão removidas
print(f"Removendo {len(colunas_visibilidade)} colunas relacionadas à visibilidade:")
print(colunas_visibilidade)

# Removendo as colunas de visibilidade do DataFrame
resultados = resultados.drop(columns=colunas_visibilidade)
print(f"\nDimensões do DataFrame após remover colunas de visibilidade: {resultados.shape}")

# Removendo colunas que não são relevantes para a classificação
colunas_para_remover = ['file_name', 'duration_sec', 'num_frames', 'width', 'height']
resultados = resultados.drop(columns=colunas_para_remover)
print(f"Dimensões do DataFrame após remover colunas irrelevantes: {resultados.shape}")


Removendo 100 colunas relacionadas à visibilidade:
['id_0_visibility_mean', 'id_0_visibility_std', 'id_0_visibility_min', 'id_0_visibility_max', 'id_1_visibility_mean', 'id_1_visibility_std', 'id_1_visibility_min', 'id_1_visibility_max', 'id_2_visibility_mean', 'id_2_visibility_std', 'id_2_visibility_min', 'id_2_visibility_max', 'id_3_visibility_mean', 'id_3_visibility_std', 'id_3_visibility_min', 'id_3_visibility_max', 'id_4_visibility_mean', 'id_4_visibility_std', 'id_4_visibility_min', 'id_4_visibility_max', 'id_5_visibility_mean', 'id_5_visibility_std', 'id_5_visibility_min', 'id_5_visibility_max', 'id_6_visibility_mean', 'id_6_visibility_std', 'id_6_visibility_min', 'id_6_visibility_max', 'id_7_visibility_mean', 'id_7_visibility_std', 'id_7_visibility_min', 'id_7_visibility_max', 'id_8_visibility_mean', 'id_8_visibility_std', 'id_8_visibility_min', 'id_8_visibility_max', 'id_9_visibility_mean', 'id_9_visibility_std', 'id_9_visibility_min', 'id_9_visibility_max', 'id_10_visibility_

In [34]:
resultados.head()

Unnamed: 0,sinal,interprete,id_0_x_mean,id_0_x_std,id_0_x_min,id_0_x_max,id_0_y_mean,id_0_y_std,id_0_y_min,id_0_y_max,id_0_z_mean,id_0_z_std,id_0_z_min,id_0_z_max,id_1_x_mean,id_1_x_std,id_1_x_min,id_1_x_max,id_1_y_mean,id_1_y_std,id_1_y_min,id_1_y_max,id_1_z_mean,id_1_z_std,id_1_z_min,id_1_z_max,id_2_x_mean,id_2_x_std,id_2_x_min,id_2_x_max,id_2_y_mean,id_2_y_std,id_2_y_min,id_2_y_max,id_2_z_mean,id_2_z_std,id_2_z_min,id_2_z_max,id_3_x_mean,id_3_x_std,id_3_x_min,id_3_x_max,id_3_y_mean,id_3_y_std,id_3_y_min,id_3_y_max,id_3_z_mean,id_3_z_std,id_3_z_min,id_3_z_max,id_4_x_mean,id_4_x_std,id_4_x_min,id_4_x_max,id_4_y_mean,id_4_y_std,id_4_y_min,id_4_y_max,id_4_z_mean,id_4_z_std,id_4_z_min,id_4_z_max,id_5_x_mean,id_5_x_std,id_5_x_min,id_5_x_max,id_5_y_mean,id_5_y_std,id_5_y_min,id_5_y_max,id_5_z_mean,id_5_z_std,id_5_z_min,id_5_z_max,id_6_x_mean,id_6_x_std,id_6_x_min,id_6_x_max,id_6_y_mean,id_6_y_std,id_6_y_min,id_6_y_max,id_6_z_mean,id_6_z_std,id_6_z_min,id_6_z_max,id_7_x_mean,id_7_x_std,id_7_x_min,id_7_x_max,id_7_y_mean,id_7_y_std,id_7_y_min,id_7_y_max,id_7_z_mean,id_7_z_std,id_7_z_min,id_7_z_max,id_8_x_mean,id_8_x_std,id_8_x_min,id_8_x_max,id_8_y_mean,id_8_y_std,id_8_y_min,id_8_y_max,id_8_z_mean,id_8_z_std,id_8_z_min,id_8_z_max,id_9_x_mean,id_9_x_std,id_9_x_min,id_9_x_max,id_9_y_mean,id_9_y_std,id_9_y_min,id_9_y_max,id_9_z_mean,id_9_z_std,id_9_z_min,id_9_z_max,id_10_x_mean,id_10_x_std,id_10_x_min,id_10_x_max,id_10_y_mean,id_10_y_std,id_10_y_min,id_10_y_max,id_10_z_mean,id_10_z_std,id_10_z_min,id_10_z_max,id_11_x_mean,id_11_x_std,id_11_x_min,id_11_x_max,id_11_y_mean,id_11_y_std,id_11_y_min,id_11_y_max,id_11_z_mean,id_11_z_std,id_11_z_min,id_11_z_max,id_12_x_mean,id_12_x_std,id_12_x_min,id_12_x_max,id_12_y_mean,id_12_y_std,id_12_y_min,id_12_y_max,id_12_z_mean,id_12_z_std,id_12_z_min,id_12_z_max,id_13_x_mean,id_13_x_std,id_13_x_min,id_13_x_max,id_13_y_mean,id_13_y_std,id_13_y_min,id_13_y_max,id_13_z_mean,id_13_z_std,id_13_z_min,id_13_z_max,id_14_x_mean,id_14_x_std,id_14_x_min,id_14_x_max,id_14_y_mean,id_14_y_std,id_14_y_min,id_14_y_max,id_14_z_mean,id_14_z_std,id_14_z_min,id_14_z_max,id_15_x_mean,id_15_x_std,id_15_x_min,id_15_x_max,id_15_y_mean,id_15_y_std,id_15_y_min,id_15_y_max,id_15_z_mean,id_15_z_std,id_15_z_min,id_15_z_max,id_16_x_mean,id_16_x_std,id_16_x_min,id_16_x_max,id_16_y_mean,id_16_y_std,id_16_y_min,id_16_y_max,id_16_z_mean,id_16_z_std,id_16_z_min,id_16_z_max,id_17_x_mean,id_17_x_std,id_17_x_min,id_17_x_max,id_17_y_mean,id_17_y_std,id_17_y_min,id_17_y_max,id_17_z_mean,id_17_z_std,id_17_z_min,id_17_z_max,id_18_x_mean,id_18_x_std,id_18_x_min,id_18_x_max,id_18_y_mean,id_18_y_std,id_18_y_min,id_18_y_max,id_18_z_mean,id_18_z_std,id_18_z_min,id_18_z_max,id_19_x_mean,id_19_x_std,id_19_x_min,id_19_x_max,id_19_y_mean,id_19_y_std,id_19_y_min,id_19_y_max,id_19_z_mean,id_19_z_std,id_19_z_min,id_19_z_max,id_20_x_mean,id_20_x_std,id_20_x_min,id_20_x_max,id_20_y_mean,id_20_y_std,id_20_y_min,id_20_y_max,id_20_z_mean,id_20_z_std,id_20_z_min,id_20_z_max,id_21_x_mean,id_21_x_std,id_21_x_min,id_21_x_max,id_21_y_mean,id_21_y_std,id_21_y_min,id_21_y_max,id_21_z_mean,id_21_z_std,id_21_z_min,id_21_z_max,id_22_x_mean,id_22_x_std,id_22_x_min,id_22_x_max,id_22_y_mean,id_22_y_std,id_22_y_min,id_22_y_max,id_22_z_mean,id_22_z_std,id_22_z_min,id_22_z_max,id_23_x_mean,id_23_x_std,id_23_x_min,id_23_x_max,id_23_y_mean,id_23_y_std,id_23_y_min,id_23_y_max,id_23_z_mean,id_23_z_std,id_23_z_min,id_23_z_max,id_24_x_mean,id_24_x_std,id_24_x_min,id_24_x_max,id_24_y_mean,id_24_y_std,id_24_y_min,id_24_y_max,id_24_z_mean,id_24_z_std,id_24_z_min,id_24_z_max
0,0,Alexson,419.308824,5.288022,410.0,428.0,190.963235,8.858901,181.0,207.0,-1.185,0.12779,-1.41,-0.94,444.764706,6.697966,435.0,455.0,160.588235,7.11646,153.0,173.0,-1.116324,0.117661,-1.34,-0.89,458.522059,6.054037,450.0,467.0,161.698529,7.374952,153.0,175.0,-1.116471,0.117867,-1.34,-0.89,469.551471,4.661616,463.0,477.0,163.029412,7.374572,154.0,177.0,-1.11625,0.117914,-1.34,-0.89,403.022059,6.082722,394.0,412.0,159.764706,5.945724,152.0,170.0,-1.109779,0.126932,-1.33,-0.87,389.816176,6.157443,380.0,399.0,160.279412,5.244667,153.0,169.0,-1.109853,0.126959,-1.33,-0.87,377.257353,5.572414,369.0,387.0,160.941176,4.848466,154.0,170.0,-1.110882,0.127166,-1.33,-0.87,488.867647,6.710753,481.0,498.0,180.838235,6.442016,170.0,194.0,-0.627721,0.096877,-0.86,-0.46,365.786765,3.995197,360.0,375.0,177.764706,3.966966,172.0,184.0,-0.587794,0.138659,-0.82,-0.34,447.154412,4.172003,439.0,455.0,229.095588,8.065361,220.0,243.0,-1.000294,0.118908,-1.23,-0.78,393.735294,5.613786,384.0,405.0,227.602941,8.322463,218.0,242.0,-0.990221,0.130818,-1.22,-0.75,600.845588,2.511835,597.0,607.0,381.169118,4.561603,372.0,386.0,-0.383235,0.036654,-0.57,-0.32,260.779412,5.778237,254.0,279.0,375.198529,5.37459,363.0,382.0,-0.27875,0.13665,-0.51,-0.02,656.051471,20.950142,616.0,678.0,643.316176,5.452423,637.0,655.0,-0.569191,0.269147,-1.04,-0.3,145.330882,65.398387,48.0,204.0,589.279412,63.158905,481.0,644.0,-0.554485,0.211509,-0.93,-0.33,481.242647,37.984109,424.0,515.0,688.735294,137.498134,455.0,802.0,-1.214265,0.342309,-1.93,-0.79,300.536765,64.089163,191.0,360.0,637.566176,188.854547,348.0,807.0,-1.309118,0.415668,-2.12,-0.85,433.102941,41.610234,369.0,470.0,722.911765,187.441664,407.0,872.0,-1.399632,0.340042,-2.12,-0.93,350.485294,58.612347,248.0,406.0,674.823529,227.9561,323.0,875.0,-1.481103,0.42513,-2.32,-1.0,421.889706,32.517727,369.0,454.0,691.985294,186.493252,380.0,841.0,-1.403382,0.277561,-2.02,-0.92,377.522059,58.173937,278.0,433.0,648.352941,215.872168,316.0,839.0,-1.485588,0.348762,-2.26,-1.07,433.073529,31.556799,383.0,463.0,675.933824,168.908838,394.0,814.0,-1.223162,0.318321,-1.9,-0.78,369.75,57.658991,271.0,426.0,639.227941,201.162583,332.0,818.0,-1.322279,0.394197,-2.1,-0.88,533.044118,2.321773,525.0,537.0,886.25,9.420034,871.0,901.0,-0.040221,0.015562,-0.08,-0.01,312.830882,5.048733,304.0,321.0,886.632353,6.753664,873.0,902.0,0.042941,0.016605,0.01,0.08
1,0,Alexson,442.692308,6.140641,436.0,454.0,187.951049,11.869792,178.0,212.0,-1.271958,0.187734,-1.52,-0.89,467.713287,7.276842,461.0,481.0,156.468531,9.763978,147.0,176.0,-1.19979,0.178003,-1.43,-0.84,481.048951,6.061107,474.0,492.0,157.384615,10.097359,148.0,177.0,-1.20014,0.178278,-1.43,-0.84,491.58042,5.785378,484.0,502.0,158.566434,10.311049,149.0,179.0,-1.19986,0.178278,-1.43,-0.84,425.160839,7.259428,418.0,439.0,155.608392,8.369154,147.0,171.0,-1.197622,0.186966,-1.44,-0.82,411.412587,6.756665,405.0,424.0,156.055944,7.828061,147.0,170.0,-1.197972,0.187172,-1.44,-0.82,398.300699,7.045161,391.0,411.0,156.93007,7.223446,148.0,170.0,-1.198951,0.187432,-1.44,-0.82,509.797203,8.184113,502.0,524.0,176.216783,9.457198,166.0,194.0,-0.701678,0.153053,-0.89,-0.4,384.797203,4.798121,380.0,395.0,175.104895,6.60315,167.0,188.0,-0.681049,0.19014,-0.88,-0.31,471.244755,4.837088,465.0,479.0,226.846154,10.973276,219.0,249.0,-1.082098,0.177406,-1.31,-0.73,417.741259,5.932381,410.0,427.0,226.34965,10.133099,218.0,247.0,-1.079021,0.187069,-1.31,-0.71,625.783217,2.368038,622.0,630.0,382.146853,3.339811,375.0,390.0,-0.42972,0.069919,-0.58,-0.3,278.272727,5.061845,271.0,290.0,384.13986,10.303562,360.0,397.0,-0.354685,0.160925,-0.54,-0.04,697.811189,15.83451,666.0,724.0,641.321678,5.070769,632.0,653.0,-0.563846,0.244736,-1.07,-0.31,164.559441,71.486099,46.0,225.0,606.454545,78.719034,462.0,664.0,-0.558322,0.205299,-1.01,-0.31,530.076923,39.007778,460.0,563.0,712.734266,134.443818,462.0,807.0,-1.189371,0.304604,-1.97,-0.79,335.258741,60.358986,224.0,386.0,672.517483,197.26492,332.0,826.0,-1.239161,0.374545,-2.19,-0.85,489.965035,43.443084,417.0,528.0,750.202797,179.724616,419.0,878.0,-1.37007,0.299113,-2.17,-0.93,392.335664,60.26572,280.0,447.0,713.671329,230.283193,313.0,892.0,-1.405594,0.382353,-2.41,-1.01,476.937063,35.930531,417.0,512.0,719.734266,177.897856,393.0,844.0,-1.381469,0.238005,-2.07,-0.91,420.111888,57.379648,315.0,469.0,684.237762,215.689101,309.0,855.0,-1.412797,0.303965,-2.34,-1.08,483.916084,33.788175,428.0,517.0,703.769231,163.030313,404.0,821.0,-1.201049,0.280635,-1.95,-0.78,409.328671,56.236645,305.0,459.0,673.846154,202.098125,325.0,835.0,-1.251888,0.35246,-2.18,-0.88,565.104895,2.827713,554.0,570.0,901.468531,6.165333,889.0,912.0,-0.045944,0.017532,-0.08,-0.02,340.678322,6.666715,328.0,351.0,906.251748,4.317453,889.0,915.0,0.048112,0.017237,0.02,0.08
2,0,Alexson,439.075188,7.183339,431.0,451.0,188.413534,9.602026,179.0,210.0,-1.230677,0.195018,-1.45,-0.88,463.93985,8.59242,455.0,477.0,157.406015,8.264599,150.0,175.0,-1.162707,0.18665,-1.37,-0.83,476.992481,7.214775,469.0,488.0,158.661654,8.669773,151.0,176.0,-1.162782,0.187052,-1.37,-0.83,487.511278,6.568539,480.0,497.0,159.834586,8.88238,152.0,178.0,-1.162632,0.186937,-1.37,-0.83,422.24812,8.311553,414.0,435.0,155.503759,6.203157,149.0,170.0,-1.159173,0.194071,-1.38,-0.82,408.909774,7.632128,401.0,421.0,155.533835,5.378899,150.0,169.0,-1.159248,0.194113,-1.38,-0.82,396.270677,7.870281,389.0,408.0,156.045113,4.77345,151.0,169.0,-1.159474,0.194299,-1.38,-0.82,506.18797,8.710104,496.0,519.0,176.766917,8.169499,169.0,191.0,-0.675038,0.16425,-0.87,-0.4,382.969925,5.191689,376.0,394.0,173.285714,4.147967,168.0,184.0,-0.649925,0.194498,-0.87,-0.33,466.676692,5.356317,460.0,476.0,226.556391,9.216597,218.0,246.0,-1.04594,0.185799,-1.25,-0.72,414.548872,5.775426,405.0,427.0,225.398496,8.536854,216.0,245.0,-1.039925,0.194662,-1.26,-0.7,619.421053,2.477812,616.0,626.0,376.571429,5.871459,366.0,385.0,-0.418045,0.072294,-0.52,-0.27,277.488722,5.367716,270.0,294.0,376.353383,13.437958,349.0,391.0,-0.329023,0.160326,-0.51,-0.06,685.233083,22.262173,646.0,724.0,632.721805,3.477968,628.0,640.0,-0.586165,0.244282,-1.13,-0.36,152.827068,73.501032,49.0,218.0,584.488722,80.854491,458.0,656.0,-0.581203,0.21705,-1.0,-0.34,511.533835,41.671978,449.0,548.0,677.24812,143.655702,449.0,790.0,-1.208496,0.320448,-2.04,-0.81,313.631579,66.019565,213.0,374.0,628.864662,202.703724,321.0,802.0,-1.291579,0.400002,-2.04,-0.86,467.706767,47.29738,398.0,511.0,706.827068,189.612025,410.0,857.0,-1.388421,0.317115,-2.24,-0.95,369.541353,65.518967,268.0,434.0,663.518797,236.924985,305.0,862.0,-1.459023,0.408296,-2.24,-1.01,456.082707,39.59818,398.0,497.0,676.992481,187.907879,383.0,827.0,-1.386992,0.253894,-2.14,-0.94,396.954887,62.289508,302.0,458.0,638.097744,222.554279,301.0,821.0,-1.45609,0.32767,-2.17,-1.07,464.556391,36.725108,413.0,501.0,663.864662,172.504228,393.0,799.0,-1.216015,0.29715,-2.01,-0.8,387.443609,60.4574,294.0,447.0,629.744361,208.692526,314.0,803.0,-1.301429,0.377076,-2.03,-0.88,557.924812,3.763107,545.0,564.0,888.458647,5.138257,881.0,901.0,-0.039925,0.010335,-0.06,-0.01,337.93985,6.027411,327.0,345.0,892.383459,5.1709,885.0,902.0,0.04406,0.010375,0.02,0.07
3,0,Alexson,440.195946,6.152991,430.0,452.0,188.554054,9.421559,180.0,205.0,-1.263378,0.194847,-1.48,-0.93,465.5,7.175639,454.0,478.0,156.648649,8.288887,149.0,170.0,-1.193446,0.187648,-1.4,-0.88,479.668919,6.452747,469.0,490.0,157.385135,8.494515,149.0,172.0,-1.193716,0.188016,-1.4,-0.88,490.621622,5.031067,482.0,498.0,158.567568,8.283384,150.0,172.0,-1.193446,0.187884,-1.4,-0.88,423.533784,6.892188,412.0,435.0,156.094595,7.638372,148.0,169.0,-1.188581,0.196483,-1.41,-0.86,410.216216,6.818882,398.0,422.0,156.493243,7.216286,149.0,168.0,-1.188919,0.196943,-1.41,-0.86,397.587838,6.619831,387.0,410.0,157.439189,6.698793,151.0,168.0,-1.189865,0.196901,-1.41,-0.86,508.871622,5.811413,500.0,517.0,175.716216,7.037355,169.0,188.0,-0.698243,0.164841,-0.87,-0.43,385.945946,4.978571,378.0,396.0,174.527027,5.36231,168.0,183.0,-0.667568,0.202899,-0.88,-0.34,468.594595,4.847101,460.0,478.0,226.797297,9.167348,218.0,243.0,-1.075405,0.184945,-1.28,-0.77,415.804054,6.483531,404.0,429.0,225.925676,9.029122,217.0,242.0,-1.067838,0.195739,-1.28,-0.74,624.716216,2.013696,623.0,632.0,376.912162,3.524316,372.0,383.0,-0.436689,0.067395,-0.53,-0.27,281.452703,6.57281,273.0,300.0,380.682432,6.556958,368.0,389.0,-0.331284,0.179012,-0.52,-0.06,685.195946,20.924113,646.0,708.0,636.432432,3.164777,632.0,645.0,-0.595946,0.246813,-1.02,-0.36,163.0,76.298198,46.0,229.0,596.844595,74.202216,470.0,656.0,-0.555203,0.187535,-0.86,-0.35,513.054054,44.122862,445.0,552.0,689.831081,132.287534,470.0,789.0,-1.231892,0.328943,-1.94,-0.79,323.148649,66.046403,215.0,381.0,646.810811,192.45119,354.0,804.0,-1.275541,0.389249,-2.05,-0.86,469.405405,51.233471,394.0,516.0,721.439189,176.693371,431.0,856.0,-1.414054,0.326252,-2.14,-0.93,381.364865,64.583875,278.0,436.0,681.439189,227.192304,336.0,864.0,-1.443243,0.398665,-2.25,-1.01,458.398649,44.275689,390.0,502.0,690.236486,175.657535,404.0,828.0,-1.419392,0.264542,-2.07,-0.91,407.162162,62.315093,308.0,460.0,653.337838,212.304446,333.0,824.0,-1.445811,0.315733,-2.16,-1.08,466.932432,41.080254,405.0,507.0,676.486486,160.659376,415.0,799.0,-1.241892,0.305891,-1.92,-0.78,396.952703,60.35556,297.0,450.0,644.959459,198.138304,351.0,805.0,-1.286419,0.366062,-2.03,-0.89,564.060811,3.216087,557.0,570.0,892.662162,6.196942,883.0,902.0,-0.047635,0.008834,-0.07,-0.02,342.858108,6.059804,331.0,351.0,897.256757,3.840567,888.0,902.0,0.051419,0.009477,0.02,0.07
4,0,Alexson,439.0,6.014581,431.0,449.0,189.536232,9.858976,179.0,208.0,-1.267754,0.200474,-1.53,-0.91,464.231884,7.350743,456.0,475.0,157.557971,8.47669,149.0,173.0,-1.197754,0.193176,-1.45,-0.86,477.797101,6.008703,470.0,486.0,158.652174,8.921926,150.0,175.0,-1.198043,0.193639,-1.45,-0.86,488.471014,5.317911,482.0,496.0,159.869565,8.978341,151.0,176.0,-1.197681,0.193599,-1.45,-0.85,421.985507,6.856172,414.0,432.0,156.702899,6.888485,149.0,169.0,-1.194058,0.199355,-1.45,-0.84,408.485507,6.150468,401.0,418.0,157.26087,6.089724,149.0,168.0,-1.194638,0.199971,-1.46,-0.84,395.637681,6.178538,388.0,406.0,158.166667,5.358101,151.0,167.0,-1.19529,0.199884,-1.46,-0.85,506.884058,7.005808,499.0,518.0,177.630435,7.914761,170.0,192.0,-0.698986,0.174035,-0.92,-0.4,382.681159,3.59437,377.0,390.0,176.014493,4.491656,169.0,183.0,-0.673623,0.199835,-0.91,-0.35,467.57971,4.710311,461.0,476.0,227.536232,9.51001,217.0,245.0,-1.078261,0.192528,-1.32,-0.74,414.362319,5.391182,405.0,426.0,226.615942,9.064507,216.0,243.0,-1.072609,0.199515,-1.33,-0.73,622.326087,2.776762,618.0,630.0,378.862319,5.673851,370.0,388.0,-0.430797,0.089411,-0.56,-0.28,277.0,6.113286,269.0,293.0,382.282609,11.716224,359.0,393.0,-0.343188,0.165366,-0.52,-0.05,688.028986,19.474461,651.0,718.0,639.224638,4.27011,632.0,647.0,-0.605507,0.237506,-1.06,-0.38,161.673913,73.682705,48.0,225.0,598.652174,80.738078,460.0,665.0,-0.57942,0.219707,-0.95,-0.32,516.963768,37.598806,458.0,554.0,688.913043,143.450669,455.0,801.0,-1.255725,0.328975,-1.96,-0.88,326.601449,62.287131,220.0,378.0,644.26087,201.146397,333.0,807.0,-1.292464,0.408349,-2.14,-0.83,474.26087,41.673269,407.0,515.0,719.391304,189.648751,410.0,869.0,-1.441449,0.326413,-2.16,-1.02,382.26087,61.547715,273.0,435.0,680.927536,235.779516,315.0,868.0,-1.462319,0.417284,-2.36,-0.99,462.471014,34.717671,400.0,496.0,690.166667,188.299869,385.0,838.0,-1.446957,0.262424,-2.07,-1.01,409.42029,59.245969,305.0,460.0,653.862319,222.946728,310.0,833.0,-1.467971,0.334492,-2.29,-1.07,471.123188,32.360527,416.0,502.0,675.949275,172.899757,396.0,812.0,-1.265797,0.30525,-1.94,-0.87,399.637681,58.123948,296.0,450.0,645.282609,208.841084,326.0,813.0,-1.30413,0.383763,-2.13,-0.87,562.673913,2.696749,554.0,568.0,894.818841,3.930483,889.0,902.0,-0.04942,0.011947,-0.07,-0.02,340.23913,6.380347,330.0,350.0,900.137681,5.523925,891.0,910.0,0.052029,0.011018,0.02,0.08


In [22]:
backup_resultados = resultados.copy()

In [31]:
resultados = backup_resultados.copy()

### Separar features e target
### Separando train e test sets

In [33]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Definindo coluna alvo
TARGET_COL = 'sinal'

# Codificando o target como inteiro e substituindo no DataFrame
label_encoder = LabelEncoder()
resultados[TARGET_COL] = label_encoder.fit_transform(resultados[TARGET_COL])

# Agora sinal já é inteiro no DataFrame
y = resultados[TARGET_COL].copy()

# Features (removendo target e interprete)
feature_cols = [c for c in resultados.columns if c not in [TARGET_COL, 'interprete']]
X = resultados[feature_cols].copy()

# Definindo intérpretes que vão para o conjunto de teste
test_interpretes = ['Cecilia', 'Everton']

# Criando máscara de teste/treino
mask_test = resultados['interprete'].isin(test_interpretes)

X_train = X[~mask_test].copy()
y_train = y[~mask_test].copy()

X_test = X[mask_test].copy()
y_test = y[mask_test].copy()

print(f"Conjunto de treino: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Conjunto de teste: X_test={X_test.shape}, y_test={y_test.shape}")

# Criando preprocessador para features numéricas
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
    ],
    remainder='drop'
)

# Distribuição das classes
print("\nDistribuição das classes nos conjuntos:")
print("Treino:", pd.Series(y_train).value_counts().sort_index())
print("Teste:", pd.Series(y_test).value_counts().sort_index())


Conjunto de treino: X_train=(1988, 300), y_train=(1988,)
Conjunto de teste: X_test=(500, 300), y_test=(500,)

Distribuição das classes nos conjuntos:
Treino: sinal
0     80
1     81
2     80
3     80
4     80
5     80
6     80
7     80
8     80
9     80
10    80
11    80
12    80
13    80
14    80
15    80
16    80
17    80
18    80
19    80
20    70
21    80
22    77
23    80
24    80
Name: count, dtype: int64
Teste: sinal
0     21
1     20
2     20
3     20
4     20
5     20
6     19
7     20
8     20
9     20
10    20
11    20
12    20
13    20
14    20
15    20
16    20
17    20
18    20
19    20
20    20
21    20
22    20
23    20
24    20
Name: count, dtype: int64


### Implementando Random Forest

- Adição: 0
- Aluno: 1
- Antropologia: 2
- Apontador: 3
- Apostila: 4
- Biologia: 5
- Bolsa de Estudos: 6
- Capítulo: 7
- Classe: 8
- Coerência: 9
- Coesão: 10
- Colega: 11
- Conceito: 12
- Contexto: 13
- Curso: 14
- Dicionário: 15
- Disciplina: 16
- Ensinar: 17
- Escola: 18
- Estudar: 19
- Filosofia: 20
- Física: 21
- Geografia: 22
- História: 23
- Ângulo: 24

In [36]:


# Usar os conjuntos de treino já definidos anteriormente
# Criar pipeline: escalonamento + modelo
pipeline = Pipeline([
    ("scaler", StandardScaler()),   # pode ser ignorado no RF, mas útil se trocar para KNN/MLP
    ("rf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Definir o k-fold estratificado apenas para os dados de treino
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# F1-score macro
f1_macro = make_scorer(f1_score, average="macro")

# Avaliar com validação cruzada apenas nos dados de treino
scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring=f1_macro, n_jobs=-1)

print("F1-scores por fold:", scores)
print("Média do F1-score na validação cruzada:", np.mean(scores))

# Treinar o modelo final com todos os dados de treino
pipeline.fit(X_train, y_train)

# Avaliar no conjunto de teste separado
y_pred = pipeline.predict(X_test)
test_score = f1_score(y_test, y_pred, average="macro")
print(f"F1-score no conjunto de teste: {test_score:.4f}")

F1-scores por fold: [0.98499022 0.97742945 0.9722868  0.98489932 0.98991447]
Média do F1-score na validação cruzada: 0.981904051485652
F1-score no conjunto de teste: 0.1096


In [38]:
print("\n--- Targets ---")
print("y_train:", y_train.shape, y_train.dtype)
print("y_test:", y_test.shape, y_test.dtype)

print("\n--- Features Treino ---")
X_train.info()

print("\n--- Features Teste ---")
X_test.info()


--- Targets ---
y_train: (1988,) int64
y_test: (500,) int64

--- Features Treino ---
<class 'pandas.core.frame.DataFrame'>
Index: 1988 entries, 0 to 2500
Columns: 300 entries, id_0_x_mean to id_24_z_max
dtypes: float64(300)
memory usage: 4.6 MB

--- Features Teste ---
<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 501 to 1500
Columns: 300 entries, id_0_x_mean to id_24_z_max
dtypes: float64(300)
memory usage: 1.1 MB


### Implementando KNN

## Modelagem e Avaliação (CV Estratificada)

Treinaremos e avaliaremos:
- Random Forest
- KNN
- MLP

Usaremos F1 macro e exibiremos a matriz de confusão a partir de `cross_val_predict` para cada modelo.


In [None]:
def avaliar_modelo(nome, estimator):
    pipe = Pipeline(steps=[('preprocess', preprocess), ('model', estimator)])
    # cross_validate para f1 macro
    cv_results = cross_validate(
        pipe, X, y, cv=cv, scoring=scorer, n_jobs=N_JOBS, return_estimator=False, error_score='raise'
    )

    # cross_val_predict para matriz de confusão
    y_pred = cross_val_predict(pipe, X, y, cv=cv, n_jobs=N_JOBS, method='predict')

    f1_mean = np.mean(cv_results['test_f1_macro'])
    f1_std = np.std(cv_results['test_f1_macro'])

    print(f"\nModelo: {nome}")
    print(f"F1-macro (média ± std): {f1_mean:.4f} ± {f1_std:.4f}")

    cm = confusion_matrix(y, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues', values_format='d')
    plt.title(f'Matriz de Confusão - {nome}')
    plt.show()

# Modelos com hiperparâmetros básicos e random_state fixo
rf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=N_JOBS)
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=RANDOM_STATE)

for nome, est in [
    ('Random Forest', rf),
    ('KNN', knn),
    ('MLP', mlp)
]:
    avaliar_modelo(nome, est)
