In [3]:
import os
from enum import IntEnum

import pandas as pd
from sklearn.model_selection import train_test_split

# -----------------------------
# Step 1: Define the Enum
# -----------------------------
# Integer codes for the 16 MBTI personality types. Each name's position in
# the list is its code (ESTJ = 0 ... INFP = 15), so the order must not change.
MBTIType = IntEnum(
    "MBTIType",
    [
        "ESTJ", "ENTJ", "ESFJ", "ENFJ",
        "ISTJ", "ISFJ", "INTJ", "INFJ",
        "ESTP", "ESFP", "ENTP", "ENFP",
        "ISTP", "ISFP", "INTP", "INFP",
    ],
    start=0,
)


# -----------------------------
# Step 2: Read the CSV
# -----------------------------
# Path of the input CSV, relative to the working directory.
file_path = "data/16P.csv"
df = pd.read_csv(file_path)

# Fail fast when the label column is absent: every later step reads it.
# Membership on a DataFrame tests its column labels.
if "Personality" not in df:
    raise ValueError("Column 'Personality' not found in CSV file")

# -----------------------------
# Step 3: Encode Personality
# -----------------------------
# Normalise the raw labels (cast to str, trim whitespace, upper-case) so that
# values such as " intj " still match an enum member name.
raw_labels = df["Personality"]
normalized = raw_labels.astype(str).str.strip().str.upper()

# Member name -> integer code, e.g. {"ESTJ": 0, ..., "INFP": 15}.
mbti_mapping = {e.name: e.value for e in MBTIType}
encoded = normalized.map(mbti_mapping)

# Validate BEFORE overwriting the column. Labels with no matching enum member
# map to NaN; because df still holds the original values at this point, the
# rows printed in the error show the offending labels instead of NaN, and df
# is left untouched when validation fails.
unmatched = encoded.isna()
if unmatched.any():
    invalid = df[unmatched]
    raise ValueError(f"Invalid MBTI values found:\n{invalid}")

# Every label matched, so the column can be replaced by its integer codes.
df["Personality"] = encoded


# -----------------------------
# Step 4: Train / Test Split (85 / 15)
# -----------------------------
# Target vector: the encoded personality codes. Features: every other column.
y = df["Personality"]
X = df.drop("Personality", axis=1)

# Hold out 15% of the rows for testing. random_state fixes the shuffle so the
# split is reproducible; stratify=y makes train and test keep (approximately)
# the same per-class proportions as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# -----------------------------
# Step 5: Save splits (optional)
# -----------------------------
# to_csv does not create missing directories; without this the script would
# fail at the very last step if "mnt" does not exist yet.
os.makedirs("mnt", exist_ok=True)

# Destination path -> split to write. The row index is omitted from every
# file, so rows are matched between X_* and y_* files by position only.
split_files = {
    "mnt/X_train.csv": X_train,
    "mnt/X_test.csv": X_test,
    "mnt/y_train.csv": y_train,
    "mnt/y_test.csv": y_test,
}
for csv_path, split in split_files.items():
    split.to_csv(csv_path, index=False)

print("Train-test split complete")
print(f"Train size: {len(X_train)}")
print(f"Test size : {len(X_test)}")


Train-test split complete
Train size: 50999
Test size : 9000
