# In [None]:
# Fully automatic ML Preprocessing Helper (Colab Version)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, r2_score

# -----------------------------
# Load dataset
# -----------------------------
data_path = '/content/disease_diagnosis.csv'  # change path if needed
df = pd.read_csv(data_path)

# Rows without a target value cannot be used for supervised training or
# scoring, so remove them before features and target are separated.
target_missing = df.iloc[:, -1].isna()
if target_missing.any():
    print(f"⚠️ Dropping {int(target_missing.sum())} rows with a missing target value.")
    df = df.loc[~target_missing].reset_index(drop=True)

# -----------------------------
# Automatically set features and target
# -----------------------------
# Convention used by this script: the last column is the target and every
# other column is a candidate feature.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1].copy()
print(f"Selected Features (X): {list(X.columns)}")
print(f"Selected Target (y): {y.name}")

# -----------------------------
# Automatically drop likely ID columns
# -----------------------------
# A column whose values are all distinct is treated as an identifier.
# Float columns are exempt: continuous measurements are frequently all-unique
# and are real features, not IDs. The check is skipped for fewer than two
# rows, where every column would trivially qualify.
id_cols = [
    col for col in X.columns
    if len(X) > 1
    and not pd.api.types.is_float_dtype(X[col])
    and X[col].nunique() == len(X)
]
if id_cols:
    print(f"🚨 Dropping ID columns: {id_cols}")
    X = X.drop(columns=id_cols)

# -----------------------------
# Detect numeric and categorical columns
# -----------------------------
# SimpleImputer rejects arrays of pure bool dtype, which is what a categorical
# pipeline receives when every column routed to it is boolean. Casting bool
# columns to object keeps their True/False values while making them valid
# input for the categorical imputer and encoders.
bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    X = X.astype({col: object for col in bool_cols})

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=[object, 'category', 'bool']).columns.tolist()

# -----------------------------
# Check target distribution
# -----------------------------
# Relative frequency of each distinct target value.
print("\n🔍 Target Class Distribution:")
print(y.value_counts(normalize=True))

# -----------------------------
# Detect high-cardinality categorical columns
# -----------------------------
# Categorical columns with more distinct values than this are ordinal-encoded
# instead of one-hot encoded to keep the feature matrix narrow.
HIGH_CARDINALITY_THRESHOLD = 50
high_card_cols = [col for col in cat_cols if X[col].nunique() > HIGH_CARDINALITY_THRESHOLD]
low_card_cols = [col for col in cat_cols if col not in high_card_cols]
if high_card_cols:
    print(f"⚡ High cardinality columns detected: {high_card_cols}. Using OrdinalEncoder for these.")

# -----------------------------
# Dynamic scaling choice
# -----------------------------
# StandardScaler when any numeric column has variance above 1, otherwise
# MinMaxScaler (also the fallback when there are no numeric columns).
# NOTE: the variance is measured on the full dataset, before the train/test
# split that follows.
scaler_choice = 'StandardScaler' if (num_cols and X[num_cols].var().max() > 1) else 'MinMaxScaler'

# -----------------------------
# Train/test split
# -----------------------------
TEST_SIZE = 0.2

# Same rule the model-selection step uses to decide "classification":
# object dtype, or a non-float target with at most 20 distinct values.
is_categorical_target = y.dtype == object or (y.nunique() <= 20 and y.dtype != float)

# Stratified splitting raises when a class has fewer than two members or when
# a split has fewer samples than there are classes, so only stratify when the
# data can actually support it.
class_counts = y.value_counts()
n_test = int(np.ceil(TEST_SIZE * len(y)))
n_train = len(y) - n_test
can_stratify = bool(
    is_categorical_target
    and len(class_counts) > 0
    and class_counts.min() >= 2
    and len(class_counts) <= min(n_test, n_train)
)
if is_categorical_target and not can_stratify:
    print("⚠️ Target classes are too small for a stratified split; using a plain random split.")

stratify_arg = y if can_stratify else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42, stratify=stratify_arg
)

# -----------------------------
# Build preprocessing pipeline
# -----------------------------
# One sub-pipeline per column group; a group is only added when it has columns.
transformers = []

if num_cols:
    # Numeric: fill gaps with the column mean, then scale.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler() if scaler_choice == 'StandardScaler' else MinMaxScaler()),
    ])
    transformers.append(('num', num_pipeline, num_cols))

if low_card_cols:
    # Low-cardinality categoricals: fill gaps with the mode, then one-hot
    # encode. Categories unseen during fit become an all-zero row.
    cat_low_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    transformers.append(('cat_low', cat_low_pipeline, low_card_cols))

if high_card_cols:
    # High-cardinality categoricals: fill gaps with the mode, then map each
    # category to an integer code. High-cardinality columns are the ones most
    # likely to contain values in the test split that never appeared in
    # training; without handle_unknown the encoder raises on them, so unseen
    # categories are mapped to -1 instead.
    cat_high_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])
    transformers.append(('cat_high', cat_high_pipeline, high_card_cols))

if not transformers:
    # Fail here with a clear message rather than later with a zero-feature matrix.
    raise ValueError(
        "No numeric or categorical feature columns were found; nothing to preprocess."
    )

# Columns not assigned to any group above are dropped.
preprocessor = ColumnTransformer(transformers, remainder='drop')

# -----------------------------
# Fit and transform
# -----------------------------
# Fit on the training split only, then apply the fitted transform to the test split.
train_array = preprocessor.fit_transform(X_train)
test_array = preprocessor.transform(X_test)

# Keep the transformer-generated feature names so the processed frames (and
# the CSVs written below) have meaningful headers instead of 0..n-1.
feature_names = preprocessor.get_feature_names_out()
X_train_proc = pd.DataFrame(train_array, columns=feature_names)
X_test_proc = pd.DataFrame(test_array, columns=feature_names)

print("\nProcessed Features Sample:")
# `display` is not imported in this file; it is assumed to be the notebook's
# built-in rich-output helper.
display(X_train_proc.head())

# -----------------------------
# Save processed datasets
# -----------------------------
# The processed frames use a fresh 0..n-1 index, so the target is attached
# positionally (as a plain array) rather than by index label.
train_df = X_train_proc.copy()
train_df[y.name] = y_train.to_numpy()
train_df.to_csv('/content/train_processed.csv', index=False)

test_df = X_test_proc.copy()
test_df[y.name] = y_test.to_numpy()
test_df.to_csv('/content/test_processed.csv', index=False)

print("\n✅ Processed train/test datasets saved to /content/ (no automatic download).")

# -----------------------------
# Model selection & training
# -----------------------------
# Candidate estimators, keyed by the name shown in the comparison report.
models_classification = dict(
    RandomForestClassifier=RandomForestClassifier(random_state=42),
    GradientBoostingClassifier=GradientBoostingClassifier(random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
)

models_regression = dict(
    RandomForestRegressor=RandomForestRegressor(random_state=42),
    GradientBoostingRegressor=GradientBoostingRegressor(random_state=42),
    LinearRegression=LinearRegression(),
)

# Task detection: an object-typed target, or a non-float target with at most
# 20 distinct values, is treated as classification; anything else as regression.
target_is_categorical = y.dtype == object or (y.dtype != float and y.nunique() <= 20)
task_type = 'classification' if target_is_categorical else 'regression'
models_to_train = models_classification if target_is_categorical else models_regression

# Accuracy for classification, R² for regression.
scorer = accuracy_score if task_type == 'classification' else r2_score

print(f"\nDetected model type: {task_type}")
print("Training all models to recommend the best one...")

# Fit each candidate on the processed training split and score it on the
# processed test split.
performance = {}
for name, model in models_to_train.items():
    model.fit(X_train_proc, y_train)
    y_pred = model.predict(X_test_proc)
    score = scorer(y_test, y_pred)
    performance[name] = score

# Highest score wins; on a tie the earliest-listed model is kept.
best_model_name, _ = max(performance.items(), key=lambda entry: entry[1])

print("\nModel performance comparison:")
for model_name, model_score in performance.items():
    print(f"{model_name}: {model_score:.4f}")

print(f"\n✅ Recommended model: {best_model_name} (best performance)")
