In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# -----------------------
# Load
# -----------------------
# Read the raw dataset from the Excel workbook into a DataFrame.
df = pd.read_excel("../data/Dry_Bean_Dataset.xlsx")

# -----------------------
# Build alphabetical encoding for Class
# -----------------------
# Collect the distinct labels and order them, so that integer ids are
# handed out in sorted (alphabetical) label order starting from 0.
classes = list(df["Class"].unique())
classes.sort()
class_to_id = dict(zip(classes, range(len(classes))))

# Encode: replace each label in the column with its integer id.
df["Class"] = df["Class"].map(class_to_id)

# -----------------------
# Split
# -----------------------
# Hold out 20% of the rows as the test set. Stratifying on the encoded label
# keeps the class proportions comparable in both partitions, and the fixed
# seed makes the partition reproducible across runs.
train_df, test_df = train_test_split(
    df, stratify=df["Class"], random_state=42, test_size=0.2
)

# -----------------------
# Scale only feature columns
# -----------------------
# Every column except the encoded target "Class" is treated as a feature.
feature_cols = [c for c in df.columns if c != "Class"]

# Take explicit copies before assigning into the splits. Depending on the
# scikit-learn version, train_test_split may hand back selections of `df`,
# and writing columns into those triggers pandas' SettingWithCopyWarning.
# Copying makes the in-place column assignment below unambiguous everywhere.
train_df = train_df.copy()
test_df = test_df.copy()

scaler = StandardScaler()

# Fit the scaler on the training rows only, then reuse the same fitted
# statistics for the test rows so no test-set information leaks into the
# preprocessing.
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# -----------------------
# Save
# -----------------------
# Write each split to its own CSV file, leaving out the DataFrame index.
for split_df, out_path in (
    (train_df, "../data/dry_bean_train.csv"),
    (test_df, "../data/dry_bean_test.csv"),
):
    split_df.to_csv(out_path, index=False)

print("Saved train.csv and test.csv")

# Report the label -> integer id mapping that was applied to "Class".
print("\nClass encoding (alphabetical):")
for label, code in class_to_id.items():
    print(f"{label} -> {code}")


Saved train.csv and test.csv

Class encoding (alphabetical):
BARBUNYA -> 0
BOMBAY -> 1
CALI -> 2
DERMASON -> 3
HOROZ -> 4
SEKER -> 5
SIRA -> 6
