# In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Load the dataset and discard the Patient_ID column so the row identifier
# is never offered to the model as a feature.
data = pd.read_csv("data/thyroid_cancer_risk_data.csv")
data = data.drop(columns=["Patient_ID"])

# Encode the target as 0 (Benign) / 1 (Malignant).
# Series.map() silently turns every value that is absent from the mapping
# (typos, different casing, missing cells) into NaN, so validate first and
# fail with an explicit message instead of an obscure error during training.
diagnosis_codes = {"Benign": 0, "Malignant": 1}
missing_count = int(data["Diagnosis"].isna().sum())
unknown_labels = sorted(
    set(data["Diagnosis"].dropna().unique()) - set(diagnosis_codes), key=str
)
if unknown_labels or missing_count:
    raise ValueError(
        "Diagnosis must be 'Benign' or 'Malignant' for every row; "
        f"found unexpected labels {unknown_labels} "
        f"and {missing_count} missing value(s)"
    )
data["Diagnosis"] = data["Diagnosis"].map(diagnosis_codes)

# Features are every remaining column except the target and
# Thyroid_Cancer_Risk; the encoded Diagnosis column is the target.
X = data.drop(columns=["Diagnosis", "Thyroid_Cancer_Risk"])
y = data["Diagnosis"]

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# stratify=y keeps the class proportions of the target identical in the
# training and test partitions, so the evaluation is not skewed by an
# unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Load the preprocessing transformer persisted at models/preprocessor.pkl.
# NOTE: joblib.load() unpickles the file, so it must come from a trusted source.
preprocessor = joblib.load("models/preprocessor.pkl")

# Chain the preprocessing step and a 100-tree random forest (fixed seed)
# into a single estimator.
# NOTE: Pipeline.fit() fits every step in order, so the loaded preprocessor
# is re-fitted on whatever data the pipeline is fitted with; any state it
# carried from the pickle is overwritten at that point.
model = RandomForestClassifier(random_state=42, n_estimators=100)
pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", model)])

# Fit the whole pipeline on the training split, then predict the held-out
# split. fit() returns the pipeline itself, which allows the chained call.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Print per-class precision, recall and F1 for the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

# Persist the fitted pipeline (preprocessor + classifier) as one artifact.
joblib.dump(pipeline, "models/model.pkl")
