In [None]:
import pandas as pd
import os
from google.colab import drive
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Pegando o 'features_df' criado no colab anterior

Cada linha representa uma música, com suas features e o rótulo de emoção (`emotion_quadrant`).

(1078, 19) ---> 1078 músicas e 19 colunas (total de eventos, duração, pitch médio, velocidade, etc.).

In [None]:
# Attach Google Drive to the Colab filesystem so the cells below can read and
# write files under /content/drive. Any failure is reported on stdout rather
# than raised, so the notebook keeps running.
DRIVE_MOUNT_POINT = '/content/drive'
try:
    drive.mount(DRIVE_MOUNT_POINT)
except Exception as mount_error:
    print(f"Drive already mounted or error: {mount_error}")

Mounted at /content/drive


In [None]:
# Load the feature table (one row per song, including the emotion label)
# that the previous notebook saved to Drive as a pickle.
input_features_path = '/content/drive/MyDrive/INF 420 - IA/TRABALHO FINAL/dados_processados/emopia_features_with_labels.pkl'

if not os.path.exists(input_features_path):
    # Fail fast: merely printing here would leave `features_df` undefined and
    # make every later cell die with a misleading NameError.
    raise FileNotFoundError(
        f"ERRO: Arquivo de features não encontrado em: {input_features_path}"
    )

# NOTE(security): unpickling can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
# Use pd.read_parquet(input_features_path) instead if the table was saved as .parquet.
features_df = pd.read_pickle(input_features_path)
print("DataFrame de features carregado com sucesso")

DataFrame de features carregado com sucesso


In [None]:
# Sanity check: display the (rows, columns) dimensions of the loaded table.
features_df.shape

(1078, 19)

In [None]:
# Preview the first rows. Leaving the DataFrame as the cell's last expression
# lets Jupyter render it as a rich table instead of the line-wrapped
# plain-text repr that print() produces for wide frames.
features_df.head()


         music_name  total_midi_events  duration  num_unique_tracks  \
0  Q1_0vLPYiPN7qY_0          -0.713426 -1.903888                0.0   
1  Q1_0vLPYiPN7qY_1           1.035827  0.006259                0.0   
2  Q1_0vLPYiPN7qY_2           0.614954  0.006259                0.0   
3  Q1_1Qc15G0ZHIg_1           0.207234 -0.127427                0.0   
4  Q1_1Qc15G0ZHIg_2           0.660987  0.006259                0.0   

   num_unique_channels  avg_note_pitch  std_note_pitch  min_note_pitch  \
0                  0.0        0.628681        0.341968        0.161962   
1                  0.0        0.405541       -1.459151        0.161962   
2                  0.0        0.560999        0.031951       -0.696903   
3                  0.0       -0.436291        0.253322       -0.206123   
4                  0.0       -0.825806        0.162303       -0.819598   

   max_note_pitch  pitch_range  avg_note_velocity  std_note_velocity  \
0        0.651053     0.337628          -0.095027       

# 1. Divisão dos Dados (Features e Rótulos):

Separar as features (as colunas que descrevem a música) da variável alvo (o rótulo de emoção)

X: DataFrame contendo apenas as colunas de features (tudo, exceto music_name e emotion_quadrant)

y: target --> coluna emotion_quadrant

In [None]:
# Split the table into model inputs and target.
#   y: the 'emotion_quadrant' label column.
#   X: every remaining column except 'music_name', which is not used as a feature.
y = features_df['emotion_quadrant']
X = features_df.drop(['music_name', 'emotion_quadrant'], axis='columns')

# Report the resulting shapes, then preview the first rows of each part.
print(f"\nFormato de X (features): {X.shape}")
print(f"Formato de y (rótulos): {y.shape}")
print("\nPrimeiras linhas de X:", X.head(), sep="\n")
print("\nPrimeiras linhas de y:", y.head(), sep="\n")


Formato de X (features): (1078, 17)
Formato de y (rótulos): (1078,)

Primeiras linhas de X:
   total_midi_events  duration  num_unique_tracks  num_unique_channels  \
0          -0.713426 -1.903888                0.0                  0.0   
1           1.035827  0.006259                0.0                  0.0   
2           0.614954  0.006259                0.0                  0.0   
3           0.207234 -0.127427                0.0                  0.0   
4           0.660987  0.006259                0.0                  0.0   

   avg_note_pitch  std_note_pitch  min_note_pitch  max_note_pitch  \
0        0.628681        0.341968        0.161962        0.651053   
1        0.405541       -1.459151        0.161962       -0.804533   
2        0.560999        0.031951       -0.696903        0.044559   
3       -0.436291        0.253322       -0.206123       -0.561935   
4       -0.825806        0.162303       -0.819598       -1.168429   

   pitch_range  avg_note_velocity  std_note_vel

# 2. Codificação da Variável Alvo (y):

A coluna `emotion_quadrant` (Q1, Q2, Q3, Q4) contém strings.

Usamos `sklearn.preprocessing.LabelEncoder` para mapear Q1 --> 0, Q2 --> 1, etc.

O `LabelEncoder` é guardado para que as previsões do modelo possam ser decodificadas de volta para os rótulos originais (Q1, Q2, Q3, Q4) mais tarde.

In [None]:
# Convert the string labels in `y` to integer codes. The fitted encoder is
# kept in `label_encoder` so predictions can later be mapped back to the
# original labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Pair each class label with its position in `classes_`, which is the integer
# code shown in the mapping below.
class_to_code = [
    (class_name, code) for code, class_name in enumerate(label_encoder.classes_)
]
print(f"Mapeamento: {class_to_code}")

Mapeamento: [('Q1', 0), ('Q2', 1), ('Q3', 2), ('Q4', 3)]


# Salvando as variáveis no Drive para usá-las no 4_modelagem

In [None]:
# Persist the prepared objects to Drive so the modelling notebook can load them.
# Drive is already mounted by the mount cell near the top of this notebook;
# mounting again here only printed an "already mounted" notice.
save_path = '/content/drive/MyDrive/INF 420 - IA/TRABALHO FINAL/dados_processados/'

# Output file name -> object to pickle.
artifacts_to_save = {
    'X.pkl': X,
    'y_encoded.pkl': y_encoded,
    'label_encoder.pkl': label_encoder,
}

for artifact_filename, artifact in artifacts_to_save.items():
    # os.path.join yields the same path as the previous string concatenation
    # because `save_path` already ends with a separator.
    with open(os.path.join(save_path, artifact_filename), 'wb') as f:
        pickle.dump(artifact, f)

# NOTE: no `scaler` is defined in the visible cells of this notebook, so none
# is saved. The previous version kept the scaler-saving code inside a
# triple-quoted string, which Jupyter echoed as the cell's output.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


"with open(save_path + 'scaler.pkl', 'wb') as f:\n    pickle.dump(scaler, f)\n"