In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor
from collections import defaultdict

# Softmax function for multi-class probabilities
def softmax(x):
    """Row-wise softmax of a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but prevents overflow for large logits.

    Parameters
    ----------
    x : array-like with at least two dimensions
        Logits; normalisation is performed along axis 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along axis 1 sum to 1.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_peak)
    row_total = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_total

# Custom CatBoost-like multi-class model
class CatBoostLikeMultiClass:
    """Gradient-boosted multi-class classifier with mean target encoding.

    Every boosting round fits one ``DecisionTreeRegressor`` per class on the
    softmax residuals (one-hot target minus predicted probability) and adds
    its scaled prediction to that class's logit.  Columns listed in
    ``cat_features`` are replaced by the mean encoded class index observed
    for each category in the training data.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (``num_classes`` trees are built per round).
    learning_rate : float
        Multiplier applied to every tree's prediction before it is added to
        the logits.
    max_depth : int
        ``max_depth`` passed to each ``DecisionTreeRegressor``.
    cat_features : sequence of column names, optional
        Columns to target-encode.  ``None`` or empty means no encoding.
    random_state : int or None, optional
        Passed to every ``DecisionTreeRegressor``; ``None`` keeps the
        library default (unseeded) behaviour.

    Attributes set by ``fit``
    -------------------------
    classes_ : sorted unique labels seen during ``fit``.
    num_classes : number of entries in ``classes_``.
    trees : list (one entry per round) of lists (one tree per class).
    target_encoding : ``{column: {category: mean encoded target}}``.
    target_prior : mean encoded target over all training rows; used for
        categories that were not seen during ``fit``.
    """

    def __init__(self, n_estimators=1000, learning_rate=0.01, max_depth=7, cat_features=None,
                 random_state=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []  # One list of per-class trees for each boosting round
        self.cat_features = list(cat_features) if cat_features else []
        self.target_encoding = {}  # column -> plain dict of category means
        self.target_prior = None  # Global mean target, fallback for unseen categories
        self.num_classes = None  # Number of target classes
        self.classes_ = None  # Original labels, indexed by logit column

    # Target encoding for categorical features
    def _target_encode(self, X, y):
        """Learn per-category target means from ``(X, y)`` and return encoded ``X``.

        ``y`` is matched to the rows of ``X`` by position.  The learned means
        are stored in ``self.target_encoding`` as plain dicts and the overall
        mean in ``self.target_prior``.  ``X`` itself is not modified.
        """
        encoded_X = X.copy()
        self.target_encoding = {}
        target = pd.Series(np.asarray(y, dtype=float).ravel())
        self.target_prior = float(target.mean())
        for col in self.cat_features:
            # Group positionally by the raw category values; NaN keys are
            # dropped by groupby and therefore fall back to the prior below.
            means = target.groupby(X[col].to_numpy(), sort=False).mean().to_dict()
            self.target_encoding[col] = means
            encoded_X[col] = (
                X[col].astype(object).map(means).fillna(self.target_prior).astype(float)
            )
        return encoded_X

    # Apply target encoding to new data
    def _apply_target_encoding(self, X):
        """Encode ``cat_features`` of ``X`` with the means learned in ``fit``.

        Categories absent from the training data (and missing values) receive
        ``self.target_prior``.  The stored encoding is a plain dict, so the
        lookup never adds entries to the fitted state.
        """
        X_encoded = X.copy()
        for col in self.cat_features:
            means = self.target_encoding[col]
            X_encoded[col] = (
                X[col].astype(object).map(means).fillna(self.target_prior).astype(float)
            )
        return X_encoded

    # Fit the model
    def fit(self, X, y):
        """Fit the boosted ensemble on features ``X`` and labels ``y``.

        Labels may be any sortable values; they are mapped internally to
        ``0..num_classes-1`` and mapped back in ``predict``.  Calling ``fit``
        again discards the previously fitted trees.  Returns ``self``.
        """
        # Map arbitrary labels onto contiguous indices so one-hot encoding
        # cannot go out of bounds when labels are not exactly 0..K-1.
        self.classes_, class_idx = np.unique(np.asarray(y).ravel(), return_inverse=True)
        self.num_classes = len(self.classes_)
        y_one_hot = np.eye(self.num_classes)[class_idx]

        # Encode categorical features
        X = self._target_encode(X, class_idx)

        # Start from a clean ensemble so a refit does not keep stale trees
        self.trees = []

        # Initialize predictions (logits)
        F = np.zeros((class_idx.shape[0], self.num_classes))

        # Gradient boosting loop
        for _ in range(self.n_estimators):
            # Negative gradient of the softmax cross-entropy w.r.t. the logits
            residuals = y_one_hot - softmax(F)

            # Train a tree for each class
            trees_for_iteration = []
            for class_index in range(self.num_classes):
                tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                             random_state=self.random_state)
                tree.fit(X, residuals[:, class_index])
                trees_for_iteration.append(tree)

            self.trees.append(trees_for_iteration)

            # Update predictions
            for class_index, tree in enumerate(trees_for_iteration):
                F[:, class_index] += self.learning_rate * tree.predict(X)

        return self

    # Predict probabilities
    def predict_proba(self, X):
        """Return an ``(n_samples, num_classes)`` array of class probabilities.

        Column ``k`` corresponds to ``self.classes_[k]``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.num_classes is None:
            raise RuntimeError("This model is not fitted yet; call fit() before predicting.")

        X = self._apply_target_encoding(X)
        F = np.zeros((X.shape[0], self.num_classes))

        for trees_for_iteration in self.trees:
            for class_index, tree in enumerate(trees_for_iteration):
                F[:, class_index] += self.learning_rate * tree.predict(X)

        return softmax(F)

    # Predict classes
    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest probability."""
        best_column = np.argmax(self.predict_proba(X), axis=1)
        return self.classes_[best_column]

# Load and preprocess data
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as escape sequences ("\P", "\C" and "\h" are invalid escapes).
file_path = r'D:\Python\CatBoost\heart_disease_uci.csv'
data = pd.read_csv(file_path)

# Check for missing values
print(data.isnull().sum())

# Remove unnecessary columns (reassign instead of mutating in place)
data = data.drop(columns=['sex', 'dataset', 'id', 'ca'])

# Separate features and target
features = data.drop(columns='num')
target = data['num']

# Categorical features
categorical_features = ['cp', 'restecg', 'slope', 'thal']

# Fill categorical NaNs with 'Missing' so missingness becomes its own category
for col in categorical_features:
    features[col] = features[col].fillna('Missing')

# Fill numeric NaNs with the median
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below — confirm this leakage is acceptable.
numeric_features = features.select_dtypes(include=['float64', 'int64']).columns
features[numeric_features] = features[numeric_features].fillna(features[numeric_features].median())

# Handle binary columns with NaNs: a missing value is treated as FALSE, then
# the TRUE/FALSE spelling is normalised to upper case and mapped to 1/0.
for col in ['fbs', 'exang']:
    features[col] = (
        features[col]
        .fillna('FALSE')
        .astype(str)
        .str.upper()
        .map({'FALSE': 0, 'TRUE': 1})
    )

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize and train the model
model = CatBoostLikeMultiClass(n_estimators=1000, learning_rate=0.01, max_depth=7, cat_features=categorical_features)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')


id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64
Accuracy: 0.5543


In [3]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix

# One-vs-rest ROC-AUC is computed from the per-class probabilities.
test_probabilities = model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, test_probabilities, multi_class='ovr')

# Report hold-out metrics for the predictions made in the previous cell.
print("Accuracy:", accuracy_score(y_test, y_pred) * 100, "%")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc)

Accuracy: 55.434782608695656 %
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.83      0.77        75
           1       0.46      0.39      0.42        54
           2       0.32      0.32      0.32        25
           3       0.48      0.38      0.43        26
           4       0.20      0.25      0.22         4

    accuracy                           0.55       184
   macro avg       0.43      0.43      0.43       184
weighted avg       0.54      0.55      0.54       184

Confusion Matrix:
 [[62  9  3  1  0]
 [17 21 10  4  2]
 [ 4  7  8  5  1]
 [ 4  8  3 10  1]
 [ 0  1  1  1  1]]
ROC-AUC Score: 0.7900648047065333
