In [2]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Settings for this step, grouped so a reader can tune them in one place.
FEATURES_CSV = "features/features.csv"
SPLITS_DIR = "splits"
SEED = 42  # used by both splits so the partition is reproducible
HOLDOUT_FRACTION = 0.2  # share of rows kept out of training (validation + test)
TEST_SHARE_OF_HOLDOUT = 0.5  # the hold-out is halved into validation and test


def _assemble_split(features, labels, names):
    """Build the output frame for one split.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns of the split.
    labels : pandas.Series
        `class_id` values of the same rows, in the same order.
    names : pandas.Series
        `filename` values of the same rows, in the same order.

    Returns
    -------
    pandas.DataFrame
        A copy of `features` (index preserved) with `filename` and `class_id`
        inserted as the first two columns.
    """
    assembled = features.copy()
    # `.values` inserts positionally; the three inputs come from the same
    # train_test_split call, so their row order already matches.
    assembled.insert(0, "filename", names.values)
    assembled.insert(1, "class_id", labels.values)
    return assembled


# Load the CSV that holds the features.
df = pd.read_csv(FEATURES_CSV)

# Separate the feature columns (X) from the labels (y) and the file names.
X = df.drop(columns=["filename", "class_id"])
y = df["class_id"]
filenames = df["filename"]

# First cut: training set (80%) versus a temporary hold-out (20%),
# stratified on the class label.
X_train, X_temp, y_train, y_temp, f_train, f_temp = train_test_split(
    X, y, filenames, test_size=HOLDOUT_FRACTION, stratify=y, random_state=SEED
)

# Second cut: halve the hold-out into validation (10%) and test (10%),
# again stratified on the class label.
X_val, X_test, y_val, y_test, f_val, f_test = train_test_split(
    X_temp,
    y_temp,
    f_temp,
    test_size=TEST_SHARE_OF_HOLDOUT,
    stratify=y_temp,
    random_state=SEED,
)

# Feature column names; kept as a module-level name for any later cell.
columns = X.columns.tolist()

# Final frames: filename, class_id, then the feature columns.
train_df = _assemble_split(X_train, y_train, f_train)
val_df = _assemble_split(X_val, y_val, f_val)
test_df = _assemble_split(X_test, y_test, f_test)

# Every input row must land in exactly one of the three splits.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split sizes do not add up"

# Create the output folder if needed, then write one CSV per split.
split_frames = {"train": train_df, "val": val_df, "test": test_df}
os.makedirs(SPLITS_DIR, exist_ok=True)
for split_name, split_frame in split_frames.items():
    split_frame.to_csv(f"{SPLITS_DIR}/{split_name}.csv", index=False)

print("✅ Dados divididos e salvos em:")
for split_name in split_frames:
    print(f"- {SPLITS_DIR}/{split_name}.csv")


✅ Dados divididos e salvos em:
- splits/train.csv
- splits/val.csv
- splits/test.csv
