# In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Build, fit and persist the feature preprocessor for the thyroid dataset.
#
# The script reads the raw CSV, separates features from the target, fits a
# ColumnTransformer (imputation for numeric columns, imputation + one-hot
# encoding for categorical columns) on the training split only, and saves the
# fitted transformer with joblib.

# Location of the saved preprocessor artifact.
PREPROCESSOR_PATH = "models/preprocessor.pkl"

# Load dataset
data = pd.read_csv("data/thyroid_cancer_risk_data.csv")

# The patient identifier carries no predictive signal, so it is dropped.
data = data.drop(columns=["Patient_ID"])

# Convert target to binary (0 = Benign, 1 = Malignant).
# NOTE: Series.map turns any label missing from the mapping into NaN.
data["Diagnosis"] = data["Diagnosis"].map({"Benign": 0, "Malignant": 1})

# Split features and target.
# NOTE(review): "Thyroid_Cancer_Risk" is excluded from the features as well,
# presumably because it is closely tied to the target -- confirm.
X = data.drop(columns=["Diagnosis", "Thyroid_Cancer_Risk"])
y = data["Diagnosis"]

# Define feature types
numeric_features = ["Age", "TSH_Level", "T3_Level", "T4_Level", "Nodule_Size"]
categorical_features = [
    "Gender",
    "Country",
    "Ethnicity",
    "Family_History",
    "Radiation_Exposure",
    "Iodine_Deficiency",
    "Smoking",
    "Obesity",
    "Diabetes",
]

# Numeric columns: fill missing values with the column mean.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split only, so imputation statistics and the category
# vocabulary are not derived from the held-out test rows.
preprocessor.fit(X_train)

# joblib.dump does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail with
# FileNotFoundError.
Path(PREPROCESSOR_PATH).parent.mkdir(parents=True, exist_ok=True)

# Save the fitted preprocessor
joblib.dump(preprocessor, PREPROCESSOR_PATH)
