# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Function to load data

In [2]:
def load_data(path):
    # Read the CSV file located at the specified 'path' into a Pandas DataFrame object
    df = pd.read_csv(path)
    df.shape

    # Display the first few rows of the DataFrame to the console, using the head() method,
    # to give an idea of what the data looks like before any preprocessing is done
    print(f"Data before Preprocessing:\n {df.head}")
    return df

# Function to preprocess data

In [3]:
def preprocess_data(df):
    # Perform label encoding on categorical columns to convert them into numerical values
    # that can be processed by machine learning algorithms
    label_encoder = LabelEncoder()
    df['gender'] = label_encoder.fit_transform(df['gender'])  # Encode gender column
    df['smoking_history'] = label_encoder.fit_transform(df['smoking_history'])  # Encode smoking history column

    # Convert the DataFrame to a NumPy array for further processing
    data = np.array(df)

    # Apply standard scaling to all columns except the last one (which is the target variable)
    # to have similar scales and prevent features with large ranges from dominating the model

    # Print the preprocessed data to the console for inspection
    print(f"\nData after Preprocessing: \n{data}\n")
    return data

# Feature Extraction

In [4]:
class FeatureExtractor():
    def __init__(self):
        pass

    def get_min_max_scale(self, X):
        return (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))


    def get_max_abs_scale(self, X):
        # Calculate the maximum absolute value for each feature
        max_abs = np.max(np.abs(X), axis=0)

        # Scale each feature by dividing it by its maximum absolute value
        scaled_data = X / max_abs

        return scaled_data


    def get_power_transformation(self, X, method='yeo-johnson', standardize=True):

        X = np.array(X)

        if standardize:
            X_std = (X - X.mean(axis=0)) / X.std(axis=0)
        else:
            X_std = X

        if method == 'yeo-johnson':
            lam = np.array([(np.median(np.abs(X_std[:, i] - np.median(X_std[:, i])))) /
                            (np.median(np.abs(X_std[:, i] + np.median(X_std[:, i])))) for i in range(X_std.shape[1])])
            transformed_X = np.sign(X_std) * (np.abs(X_std) + lam) ** (1 / (lam + 1e-2))
        elif method == 'box-cox':
            lam = np.array([(np.var(X_std[:, i]) / np.mean(X_std[:, i]**2)) for i in range(X_std.shape[1])])
            transformed_X = np.sign(X_std) * (np.abs(X_std) ** lam)
        else:
            raise ValueError("Method must be either 'yeo-johnson' or 'box-cox'")

        return transformed_X

    def get_standard_scale(self, X):

        # Calculate mean and standard deviation for each feature
        mean = np.mean(X, axis=0)
        std_dev = np.std(X, axis=0)

        # Scale the data
        scaled_data = (X - mean) / std_dev

        return scaled_data

    def fit_transform(self, X):
        min_max_scaled_X = self.get_min_max_scale(X)
        max_abs_scaled_X = self.get_max_abs_scale(X)
        power_transformated_X = self.get_power_transformation(X)
        standard_scaled_X = self.get_standard_scale(X)
        merged_X = np.hstack((min_max_scaled_X, max_abs_scaled_X, power_transformated_X, standard_scaled_X))
        return merged_X


# LDA class for Feature Reduction

In [5]:
class LDA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.linear_discriminants = None

    def fit(self, X, y):
        n_features = X.shape[1]
        class_labels = np.unique(y)

        mean_overall = np.mean(X, axis=0)
        SW = np.zeros((n_features, n_features))
        SB = np.zeros((n_features, n_features))
        for c in class_labels:
            X_c = X[y == c]
            mean_c = np.mean(X_c, axis=0)
            SW += (X_c - mean_c).T.dot((X_c - mean_c))

            n_c = X_c.shape[0]
            mean_diff = (mean_c - mean_overall).reshape(n_features, 1)
            SB += n_c * (mean_diff).dot(mean_diff.T)

        A = np.linalg.inv(SW).dot(SB)
        eigenvalues, eigenvectors = np.linalg.eigh(A)
        eigenvectors = eigenvectors.T
        idxs = np.argsort(abs(eigenvalues))[::-1]
        eigenvalues = eigenvalues[idxs]
        eigenvectors = eigenvectors[idxs]
        self.linear_discriminants = eigenvectors[0 : self.n_components]

    def transform(self, X):
        return np.dot(X, self.linear_discriminants.T)

# Naive Bayes Classifier

In [6]:
class GaussianNaiveBayes:
    def __init__(self):
        # Initialize dictionaries to store class priors, means, and variances
        self.class_prior = {}
        self.mean = {}
        self.variance = {}

    def fit(self, X, y):
        # Get unique classes
        self.classes = np.unique(y)

        # Iterate over each class
        for c in self.classes:
            # Filter input features for the current class
            X_c = X[y == c]

            # Calculate class prior probability
            self.class_prior[c] = len(X_c) / len(X)

            # Calculate mean and variance for the current class
            self.mean[c] = np.mean(X_c, axis=0)
            self.variance[c] = np.var(X_c, axis=0)

    def gaussian_pdf(self, x, mean, variance):
        # Calculate the Gaussian (normal) probability density function
        exponent = -((x - mean) ** 2) / ((2 * variance) + 10e-6) # to avoid divide by zero
        pdf = np.exp(exponent) / (np.sqrt(2 * np.pi * variance) + 10e-6) # to avoid divide by zero
        return pdf

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = []
            for c in self.classes:
                # Get class prior probability
                prior = self.class_prior[c]

                # Calculate likelihood using Gaussian PDF
                likelihood = np.prod(self.gaussian_pdf(x, self.mean[c], self.variance[c]))

                # Calculate posterior probability
                posterior = prior * likelihood
                posteriors.append(posterior)

            # Get the index of the highest posterior probability class
            predictions.append(self.classes[np.argmax(posteriors)])
        return predictions

# Function to get the accuracy

In [7]:
def accuracy_rate(y, predictions):
    # Calculate the number of correct predictions
    # by iterating over the true labels (y) and predicted labels (predictions)
    # and summing up the number of matches
    correct = sum(1 for i in range(len(y)) if y[i] == predictions[i])

    # Calculate the accuracy rate as a percentage
    # by dividing the number of correct predictions by the total number of samples
    # and multiplying by 100
    return (correct / float(len(y))) * 100.0

# Feature Selection (Using Grey Wolf Optimization)

In [8]:
def applymask(train_data,mask):
    mask = np.where(mask > 0, True, False)
    #Apply Mask
    new_data = np.array(train_data[0][mask])
    for i in range(1,len(train_data)):
        new_data = np.vstack((new_data,train_data[i][mask]))

    return new_data

In [9]:
# this is used to evaluate the feature / mainly it train the model and use it on subset of data
def fitness(train_data, validation_data, mask, score):
    # Unpack the data tuples
    x_train, y_train = train_data
    x_val, y_val = validation_data

    # Apply the mask (on Train Data only)
    x_train = x_train * mask

    # Create and train the SVM model
    model = GaussianNaiveBayes()
    model.fit(x_train, y_train)

    # Make predictions on validation data
    y_pred = model.predict(x_val)

    # Calculate the desired score
    if score == 'accuracy':
        result = accuracy_score(y_val, y_pred)
    elif score == 'F1':
        result = f1_score(y_val, y_pred, average='weighted')
    elif score == 'Precision':
        result = precision_score(y_val, y_pred, average='weighted')
    elif score == 'recall':
        result = recall_score(y_val, y_pred, average='weighted')
    else:
        raise ValueError("Invalid score. Please choose from 'accuracy', 'f1', 'precision', or 'recall'.")

    return result

In [10]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [11]:
def CalX(top_sample,curr_sample,a): #top sample represent alpha or beta or delta wolf
    # Define Parameter
    features_shape = curr_sample.shape
    A = 2*a*np.random.rand(*features_shape) - a
    C = 2*np.random.rand(*features_shape)
    D = abs(C*top_sample - curr_sample)

    # Calculating X_
    cstep = sigmoid(10*(A*D-0.5))
    bstep = np.where(cstep >= np.ones(features_shape),1,0)
    X = np.where(top_sample + bstep >= np.ones(features_shape),1,0)
    return X

In [12]:
def GWO_SVM(train_data,validation_data,score="accuracy",population_size=10,maxiter=5):
    # Initialize the population (wolfs), which actuallly represent masks
    population = np.random.randint(2,size = (population_size,*train_data[0].shape[1:])) # assuming it represent feature importance to add some diversity
    fitness_val = np.zeros(population_size)
    features_shape = train_data[0].shape[1:]

    # Calculating fitness value
    for sample in range(population_size):
        fitness_val[sample] = fitness(train_data,validation_data,population[sample],score)
        delta, beta, alpha = np.argsort(fitness_val)[-3:] # alpha in last so order is delta, beta, alpha


    for curr_iter in range(maxiter):
        # Initialize parameters;
        a = 2*(1 - curr_iter/maxiter) # linearly decreased from 2 to 0

        for idx in range(population_size):
            # Calculating for X1,X2 and X3
            X1 = CalX(alpha,population[idx],a)
            X2 = CalX(beta,population[idx],a)
            X3 = CalX(delta,population[idx],a)

            # Greedy Selection
            new_point = sigmoid(np.mean((X1,X2,X3),axis=0)) #sigmoid function
            new_point = np.where(new_point>np.random.rand(*features_shape),1,0) #make it discrete
            new_fit = fitness(train_data,validation_data,new_point,score)

            # Update if Better
            if new_fit >= fitness_val[idx]:
                fitness_val[idx] = new_fit
                population[idx] = new_point


            # Recalculate new delta, beta, alpha
            delta, beta, alpha = np.argsort(fitness_val)[-3:]


    return population[alpha]

In [13]:
def select_features(X,y):
    # split the data to validation and train data
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    res = GWO_SVM((x_train, y_train), (x_valid, y_valid),population_size=10,maxiter=5)

    #Apply Mask
    res = np.where(res > 0, True, False)
    print(f"From {len(res)}. The GWO selected Only {np.count_nonzero(res)} Features")

    new_data = np.array(X[0][res])
    for i in range(1,len(X)):
        new_data = np.vstack((new_data,X[i][res]))
    print(f"Shape of data after Feature Selection: {new_data.shape}\n")

    return new_data

# Function to run the whole program

In [14]:
def run():
    # Load the diabetes prediction dataset from a CSV file
    df = load_data("/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv")

    # Print the number of features in the data (excluding the target variable)
    print(f"\nNo. features in data: {df.iloc[:,:-1].shape[1]}\n")

    # Preprocess the data (e.g., label encoding, standard scaling)
    data = preprocess_data(df)
    X=data[:,:-1]
    y=data[:,-1]

    fe = FeatureExtractor()
    extracted_features = fe.fit_transform(X)
    print(f"No. Extractd features: {extracted_features.shape[1]}\n")

    lda = LDA(20)
    lda.fit(extracted_features, y)
    X_projected = lda.transform(extracted_features)
    
    print(f"Shape of data after Feature Reduction: {X_projected.shape}\n")

    # paper name: https://www.hindawi.com/journals/cmmm/2017/9512741/

    X_projected = select_features(X_projected,y)

    # Split the preprocessed data into training and testing sets (80% for training, 20% for testing)
    X_train, X_test, y_train, y_test = train_test_split(X_projected, y, test_size=0.2, random_state=42)

    # Create an instance of the Naive Bayes classifier
    nb = GaussianNaiveBayes()

    # Train the Naive Bayes model on the training data
    nb.fit(X_train, y_train)

    # Use the trained model to make predictions on the testing data
    y_pred = nb.predict(X_test)

    # Evaluate the accuracy of the model by comparing the predicted labels with the true labels
    print("Accuracy: ", accuracy_rate(y_test, y_pred))

# Run With Feature Selection

In [16]:
run()

Data before Preprocessing:
 <bound method NDFrame.head of        gender   age  hypertension  heart_disease smoking_history    bmi  \
0      Female  80.0             0              1           never  25.19   
1      Female  54.0             0              0         No Info  27.32   
2        Male  28.0             0              0           never  27.32   
3      Female  36.0             0              0         current  23.45   
4        Male  76.0             1              1         current  20.14   
...       ...   ...           ...            ...             ...    ...   
99995  Female  80.0             0              0         No Info  27.32   
99996  Female   2.0             0              0         No Info  17.37   
99997    Male  66.0             0              0          former  27.83   
99998  Female  24.0             0              0           never  35.42   
99999  Female  57.0             0              0         current  22.43   

       HbA1c_level  blood_glucose_level  