In [10]:

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# =========================
# Configuration
# =========================
DATA_PATH = "dataset.csv"
TARGET = "genetic_disorder"
TEST_SIZE = 0.2
RANDOM_SEED = 42

# Custom feature weights (all 11 features in your dataset)
CUSTOM_WEIGHTS = {
    "blood_cell_count_mcl": 10.098075,
    "white_blood_cell_count_thousand_per_microliter": 9.254416,
    "mothers_age": 8.798428,
    "fathers_age": 8.097246,
    "patient_age": 7.595741,
    "genes_in_mothers_side": 5.173580,
    "inherited_from_father": 4.796130,
    "no_of_previous_abortion": 4.235090,
    "ho_substance_abuse": 3.956756,
    "birth_asphyxia": 3.924847,
    "paternal_gene": 3.540366
}

# =========================
# Load Data
# =========================
print("=" * 60)
print("Loading data...")
print("=" * 60)

df = pd.read_csv(DATA_PATH)
print(f"[OK] Loaded {len(df)} records")
print(f"[OK] Features: {len(df.columns) - 1}")
print(f"[OK] No missing values: {df.isnull().sum().sum() == 0}")

X = df.drop(columns=[TARGET])
y = df[TARGET]

print(f"\nTarget distribution:")
for cls, count in y.value_counts().items():
    print(f"  {cls}: {count} ({count/len(y)*100:.1f}%)")

# =========================
# Categorical Features
# =========================
cat_features = [i for i, col in enumerate(X.columns) if X[col].dtype == "object"]
print(f"\n[OK] Categorical features: {len(cat_features)}")

# =========================
# Feature Penalties
# =========================
print("\n" + "=" * 60)
print("Feature penalties (lower = more important):")
print("=" * 60)

max_weight = max(CUSTOM_WEIGHTS.values())
feature_penalties = []

for col in X.columns:
    normalized_weight = CUSTOM_WEIGHTS[col] / max_weight
    penalty = (1 - normalized_weight) * 0.5
    feature_penalties.append(penalty)
    print(f"  {col}: weight={CUSTOM_WEIGHTS[col]:.2f}, penalty={penalty:.3f}")

# =========================
# Train-Test Split
# =========================
print("\n" + "=" * 60)
print("Splitting data...")
print("=" * 60)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

print(f"[OK] Training: {len(X_train)} samples")
print(f"[OK] Test: {len(X_test)} samples")

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# =========================
# Train Model
# =========================
print("\n" + "=" * 60)
print("Training CatBoost...")
print("=" * 60)

model = CatBoostClassifier(
    loss_function="MultiClass",
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    eval_metric="Accuracy",
    random_seed=RANDOM_SEED,
    verbose=100,
    per_object_feature_penalties=feature_penalties,
    bootstrap_type='Bayesian',
    bagging_temperature=1.0,
    rsm=0.85,
    min_data_in_leaf=15,
    max_ctr_complexity=4,
    auto_class_weights='Balanced'
)

model.fit(train_pool, eval_set=test_pool)

print("\n[OK] Training completed!")

# =========================
# Evaluation
# =========================
print("\n" + "=" * 60)
print("Model Evaluation")
print("=" * 60)

y_pred = model.predict(X_test).reshape(-1)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nTest Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

print("\nClassification Report:")
print("-" * 60)
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print("-" * 60)
print(confusion_matrix(y_test, y_pred))

# =========================
# Feature Importance
# =========================
print("\n" + "=" * 60)
print("Feature Importance")
print("=" * 60)

importances = model.get_feature_importance()
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance (%)": importances
}).sort_values(by="Importance (%)", ascending=False).reset_index(drop=True)

print(feature_importance_df.to_string(index=False))

# Save outputs
model.save_model("catboost_final_model.cbm")

print(f"\n[OK] Feature importance saved: feature_importance_final.csv")
print(f"[OK] Model saved: catboost_final_model.cbm")

print("\n" + "=" * 60)
print("DONE!")
print("=" * 60)

Loading data...
[OK] Loaded 22083 records
[OK] Features: 11
[OK] No missing values: True

Target distribution:
  mitochondrial genetic inheritance disorders: 12348 (55.9%)
  single-gene inheritance diseases: 7664 (34.7%)
  multifactorial genetic inheritance disorders: 2071 (9.4%)

[OK] Categorical features: 5

Feature penalties (lower = more important):
  blood_cell_count_mcl: weight=10.10, penalty=0.000
  white_blood_cell_count_thousand_per_microliter: weight=9.25, penalty=0.042
  mothers_age: weight=8.80, penalty=0.064
  fathers_age: weight=8.10, penalty=0.099
  patient_age: weight=7.60, penalty=0.124
  genes_in_mothers_side: weight=5.17, penalty=0.244
  inherited_from_father: weight=4.80, penalty=0.263
  no_of_previous_abortion: weight=4.24, penalty=0.290
  ho_substance_abuse: weight=3.96, penalty=0.304
  birth_asphyxia: weight=3.92, penalty=0.306
  paternal_gene: weight=3.54, penalty=0.325

Splitting data...
[OK] Training: 17666 samples
[OK] Test: 4417 samples

Training CatBoost...