# XGBoost

In [39]:
def code_ethnicity(ethinicity):

    if (ethinicity == 'white'): return 0
    elif (ethinicity == 'unknown'): return 1
    elif (ethinicity == 'minorities'): return 2

In [40]:
import csv
import numpy as np
import tensorflow as tf0
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost

data = pd.read_csv('cleaned_data.csv')
print(data.shape)
print(data.columns)
print(data.head())

data['ethnicity'] = data['ethnicity'].map(code_ethnicity)
X = data.drop('aki_stage', axis=1).values
y = data['aki_stage'].values - 1

scaler = StandardScaler()
X = scaler.fit_transform(X)

print(data.info())
# Separate training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_xgboost = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.9,
                                      eval_metric='auc',
                                      verbosity=1)

eval_set = [(X_test, y_test)]

model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=eval_set,
                  verbose=True)

(1786, 44)
Index(['weight', 'aki_stage', 'delay_rrt', 'gender', 'admission_age',
       'ethnicity', 'hematocrit_min', 'hematocrit_max', 'hemoglobin_min',
       'hemoglobin_max', 'platelets_min', 'platelets_max', 'wbc_min',
       'wbc_max', 'aniongap_min', 'aniongap_max', 'bicarbonate_min',
       'bicarbonate_max', 'bun_min', 'bun_max', 'calcium_min', 'calcium_max',
       'chloride_min', 'chloride_max', 'creatinine_min', 'creatinine_max',
       'glucose_min', 'glucose_max', 'sodium_min', 'sodium_max',
       'potassium_min', 'potassium_max', 'inr_max', 'pt_max', 'ptt_max',
       'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean',
       'temperature_mean', 'spo2_mean', 'glucose_mean', 'gcs_min'],
      dtype='object')
   weight  aki_stage  delay_rrt  gender  admission_age ethnicity  \
0  104.50          1          1       1      66.262081   unknown   
1  100.00          1          1       1      70.489938   unknown   
2   98.95          3          0       1  



[12]	validation_0-auc:0.55347


In [44]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

y_pred = model_xgboost.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 69.55%


In [51]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.tree = None

    def _calculate_gain(self, y, residuals, hessians):
        gradient = np.sum(residuals)
        hessian = np.sum(hessians)
        return gradient ** 2 / (hessian + 1e-6)

    def _split(self, X, y, residuals, hessians, feature_idx, threshold):
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        return (X[left_mask], y[left_mask], residuals[left_mask], hessians[left_mask]), (X[right_mask], y[right_mask], residuals[right_mask], hessians[right_mask])

    def _find_best_split(self, X, y, residuals, hessians):
        best_split = {}
        best_gain = -np.inf
        n_samples, n_features = X.shape
        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)
            for threshold in thresholds:
                (X_left, y_left, residuals_left, hessians_left), (X_right, y_right, residuals_right, hessians_right) = self._split(X, y, residuals, hessians, feature_idx, threshold)
                if len(X_left) == 0 or len(X_right) == 0:
                    continue
                gain = self._calculate_gain(y, residuals_left, hessians_left) + self._calculate_gain(y, residuals_right, hessians_right)
                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature_idx': feature_idx,
                        'threshold': threshold,
                        'left': (X_left, y_left, residuals_left, hessians_left),
                        'right': (X_right, y_right, residuals_right, hessians_right)
                    }
        return best_split

    def _build_tree(self, X, y, residuals, hessians, depth):
        if depth == self.max_depth or len(X) < 2:
            return {'leaf_value': np.sum(residuals) / (np.sum(hessians) + 1e-6)}
        else:
            best_split = self._find_best_split(X, y, residuals, hessians)
            if not best_split:
                return {'leaf_value': np.sum(residuals) / (np.sum(hessians) + 1e-6)}
            left_subtree = self._build_tree(*best_split['left'], depth + 1)
            right_subtree = self._build_tree(*best_split['right'], depth + 1)
            return {
                'feature_idx': best_split['feature_idx'],
                'threshold': best_split['threshold'],
                'left': left_subtree,
                'right': right_subtree
            }

    def fit(self, X, y, residuals, hessians):
        self.tree = self._build_tree(X, y, residuals, hessians, depth=0)

    def _predict_single(self, x, node):
        if 'leaf_value' in node:
            return node['leaf_value']
        if x[node['feature_idx']] <= node['threshold']:
            return self._predict_single(x, node['left'])
        else:
            return self._predict_single(x, node['right'])

    def predict(self, X):
        return np.array([self._predict_single(x, self.tree) for x in X])

class XGBoost:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    def _compute_gradients_and_hessians(self, y, y_pred):
        residuals = y - y_pred
        hessians = np.ones_like(y)
        return residuals, hessians

    def fit(self, X, y):
        y_pred = np.zeros_like(y, dtype=float)
        for _ in range(self.n_estimators):
            residuals, hessians = self._compute_gradients_and_hessians(y, y_pred)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X, y, residuals, hessians)
            y_pred += self.learning_rate * tree.predict(X)
            self.trees.append(tree)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0], dtype=float)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

    def predict_classes(self, X):
        y_pred = self.predict(X)
        return np.round(y_pred).astype(int)

# Create and train XGBoost model
xgb = XGBoost(n_estimators=100, learning_rate=0.1, max_depth=5)
print("run")
xgb.fit(X_train, y_train)

# Predict on test data
y_pred = xgb.predict_classes(X_test)

# Evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



run
Accuracy: 58.10%
