In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# Load and clean dataset
df = pd.read_csv("autism_screening.csv")
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(" ", "_")
    .str.replace("/", "_")
)

# Keep only desired columns
cols = [
    "ethnicity", "austim", "jundice", "relation", "age_desc",
    "qchat_10_score",
    "learning_disorder",
    "speech_delay_language_disorder",
    "global_developmental_delay_intellectual_disability",
    "social_behavioral_issues",
    "class_asd"
]
df = df[cols]

# Map yes/no to 1/0 for binary fields
binary_cols = [
    "class_asd", "austim", "jundice",
    "learning_disorder",
    "speech_delay_language_disorder",
    "global_developmental_delay_intellectual_disability",
    "social_behavioral_issues"
]
yn_map = {"yes": 1, "no": 0}
for col in binary_cols:
    df[col] = df[col].astype(str).str.lower().map(yn_map)

# Ensure numeric for Qchat score
df["qchat_10_score"] = pd.to_numeric(df["qchat_10_score"], errors="coerce")

# Drop rows with missing values
df.dropna(inplace=True)

# Features/target
X = df.drop("class_asd", axis=1)
y = df["class_asd"]

# Preprocessing
categorical_cols = ["ethnicity", "relation", "age_desc"]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ],
    remainder="passthrough"  # pass numeric as is
)

# Model pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Save model
joblib.dump(pipeline, "autism_pipeline.pkl")
print("✅ Model saved as autism_pipeline.pkl")
print("Expected features:", list(X.columns))

KeyError: "['qchat_10_score', 'learning_disorder', 'speech_delay_language_disorder', 'global_developmental_delay_intellectual_disability', 'social_behavioral_issues'] not in index"

In [None]:
import joblib
joblib.dump(pipeline, "autism_pipeline.pkl")