In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random

# Load the dataset from the uploaded file
file_path = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(file_path)

# Data Preprocessing
# Handle missing values.
# The filled column is assigned back to the frame instead of calling
# ``df[column].fillna(..., inplace=True)``: that chained form operates on a
# temporary object, is deprecated in pandas 2.x, and leaves ``df`` untouched
# when Copy-on-Write is enabled.
for column in df.select_dtypes(include=[np.number]).columns:
    # Numeric columns: replace gaps with the column mean
    df[column] = df[column].fillna(df[column].mean())
for column in df.select_dtypes(include=[object]).columns:
    # Object columns: replace gaps with the most frequent value.
    # ``mode()`` is empty when every entry is missing, so guard the lookup.
    most_frequent = df[column].mode()
    if not most_frequent.empty:
        df[column] = df[column].fillna(most_frequent[0])

# Convert categorical columns to numeric (one indicator column per category,
# first category dropped)
df = pd.get_dummies(df, drop_first=True)

# Separate features and target
# NOTE(review): every non-target column becomes a feature; if the CSV carries
# a row-identifier column it is scaled and used as well - confirm that is wanted.
X = df.drop("stroke", axis=1)
y = df["stroke"]

# Standardize the features
# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# statistics influence the training features. Left unchanged so downstream
# values stay the same; consider fitting on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data (80 % train / 20 % test, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the Isolation Tree class
class IsolationTree:
    """A single randomly grown isolation tree.

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_idx, split_value, left_tree, right_tree)``; a subtree of
    ``None`` marks a leaf (depth limit reached or at most one row left).

    NOTE(review): a leaf created by the depth limit may still hold several
    rows. ``path_length`` counts only the edges walked and adds no correction
    for such leaves - confirm this is the intended scoring.
    """

    def __init__(self, max_depth):
        # Maximum number of splits along any root-to-leaf path
        self.max_depth = max_depth
        # Root node tuple, or None while unfitted / when no split was possible
        self.tree = None

    def fit(self, X, current_depth=0):
        """Grow the tree on ``X`` and return its root node.

        Args:
            X: 2-D NumPy array, one row per sample.
            current_depth: depth assigned to the root (normally 0).

        Returns:
            The root node tuple, or ``None`` when no split was made
            (depth limit already reached or fewer than two rows).
        """
        # Assign exactly once, and always: a refit that yields no split must
        # clear any tree left over from an earlier fit.
        self.tree = self._build(X, current_depth)
        return self.tree

    def _build(self, X, current_depth):
        """Recursively build and return the subtree for the rows in ``X``."""
        if current_depth >= self.max_depth or len(X) <= 1:
            return None

        # Randomly select a feature, then a split value within its range
        feature_idx = random.randint(0, X.shape[1] - 1)
        feature_values = X[:, feature_idx]
        split_value = random.uniform(np.min(feature_values), np.max(feature_values))

        # Rows strictly below the split go left, the others go right
        left_tree = self._build(X[feature_values < split_value], current_depth + 1)
        right_tree = self._build(X[feature_values >= split_value], current_depth + 1)
        return (feature_idx, split_value, left_tree, right_tree)

    def path_length(self, x, current_depth=0):
        """Return ``current_depth`` plus the number of edges ``x`` walks
        from the root of the fitted tree down to a leaf."""
        return self.path_length_from_tree(self.tree, x, current_depth)

    def path_length_from_tree(self, tree, x, current_depth):
        """Return ``current_depth`` plus the number of edges ``x`` walks
        from ``tree`` down to a leaf (``tree`` may be ``None``)."""
        node = tree
        depth = current_depth
        while node is not None:
            feature_idx, split_value, left_tree, right_tree = node
            # Same routing rule as during fitting: strictly-less goes left
            node = left_tree if x[feature_idx] < split_value else right_tree
            depth += 1
        return depth

# Define the Isolation Forest class
class IsolationForestCustom:
    """Ensemble of ``IsolationTree`` objects that scores rows by how quickly
    random splits separate them.

    Each tree is grown on a random subsample (drawn without replacement) of
    the training rows. A row's anomaly score is
    ``2 ** (-mean_path_length / c(sample_size))``, so shorter average paths
    give scores closer to 1.
    """

    def __init__(self, n_trees=100, sample_size=256):
        # Number of trees in the ensemble
        self.n_trees = n_trees
        # Requested number of rows drawn for each tree
        self.sample_size = sample_size
        # Rows actually drawn per tree; fit() lowers it for small data sets
        self.sample_size_ = sample_size
        # Fitted IsolationTree instances
        self.forest = []

    def fit(self, X):
        """Grow ``n_trees`` trees, each on a random subsample of ``X``.

        Args:
            X: 2-D array-like, one row per sample.

        Raises:
            ValueError: if ``X`` has no rows.
        """
        # Accept DataFrames / lists as well as arrays: the integer-array
        # indexing below needs a NumPy array.
        X = np.asarray(X)
        n_rows = X.shape[0]
        if n_rows == 0:
            raise ValueError("Cannot fit an isolation forest on an empty data set")

        # Sampling without replacement cannot draw more rows than exist, so
        # cap the subsample at the data size instead of failing.
        self.sample_size_ = min(self.sample_size, n_rows)
        max_depth = int(np.log2(self.sample_size_))

        self.forest = []
        for _ in range(self.n_trees):
            sample_indices = np.random.choice(n_rows, self.sample_size_, replace=False)
            tree = IsolationTree(max_depth)
            tree.fit(X[sample_indices])
            self.forest.append(tree)

    def path_length(self, x):
        """Return the mean path length of row ``x`` over all fitted trees."""
        return np.mean([tree.path_length(x) for tree in self.forest])

    def anomaly_score(self, X):
        """Return a list with one anomaly score per row of ``X``."""
        # Iterating a DataFrame yields column labels, not rows - convert first
        X = np.asarray(X)
        # The normaliser depends only on the subsample size actually used by
        # the trees, so compute it once rather than per row.
        normaliser = c(self.sample_size_)
        return [2 ** (-self.path_length(x) / normaliser) for x in X]

def c(n):
    """Return the normalising constant used to scale path lengths for a
    sample of ``n`` rows.

    For ``n > 2`` this is ``2 * H(n - 1) - 2 * (n - 1) / n`` with the harmonic
    number approximated as ``ln(n - 1) + 0.5772156649`` (Euler's constant).
    The small cases are handled explicitly because the approximation breaks
    down there: ``n <= 1`` gives 0 (``ln(0)`` is undefined) and ``n == 2``
    gives 1 (the exact value, since ``H(1) = 1``).

    Args:
        n: sample size; a scalar or an array of sizes.

    Returns:
        A NumPy float for scalar input, otherwise an array of the same shape.
    """
    sizes = np.asarray(n, dtype=float)
    # The general formula is evaluated for every entry and then overridden
    # for the small cases; silence the log(0) / 0-division warnings it
    # produces for entries that are going to be replaced anyway.
    with np.errstate(divide="ignore", invalid="ignore"):
        general = 2 * (np.log(sizes - 1) + 0.5772156649) - 2 * (sizes - 1) / sizes
    result = np.where(sizes <= 1, 0.0, np.where(sizes == 2, 1.0, general))
    # Indexing a 0-d array with () unwraps it to a NumPy scalar
    return result[()] if result.ndim == 0 else result

# Build the custom forest (100 trees, 256 rows sampled per tree) and fit it
# on the training split
iso_forest_custom = IsolationForestCustom(sample_size=256, n_trees=100)
iso_forest_custom.fit(X_train)

# Score every row of the test split (shorter average paths give higher
# scores) and display the result
scores = iso_forest_custom.anomaly_score(X_test)
print(scores)

[0.5943452638012681, 0.5847721774535569, 0.5847721774535569, 0.5919373992610989, 0.5820091835628013, 0.5871508955273356, 0.5915370379721369, 0.586356916322361, 0.5939432739351083, 0.5851679601817475, 0.5820091835628013, 0.5824030962545877, 0.5820091835628013, 0.5847721774535569, 0.5951500599363476, 0.5879459498504855, 0.5831917216369561, 0.5843766624159757, 0.5867537716259872, 0.5839814148879495, 0.5827972755524916, 0.5820091835628013, 0.5871508955273356, 0.5843766624159757, 0.5820091835628013, 0.5820091835628013, 0.5835864346885462, 0.5827972755524916, 0.586356916322361, 0.5851679601817475, 0.5824030962545877, 0.5931401096854557, 0.5824030962545877, 0.5871508955273356, 0.5839814148879495, 0.5887420807476159, 0.5843766624159757, 0.5963592979098824, 0.5827972755524916, 0.5827972755524916, 0.5891405503669019, 0.5847721774535569, 0.5835864346885462, 0.5899382988589509, 0.5824030962545877, 0.5824030962545877, 0.5843766624159757, 0.5963592979098824, 0.5824030962545877, 0.5843766624159757, 0