<a href="https://colab.research.google.com/github/Cheetahzzzz1/LinearDiscriminantAnalysis-on-DiabetesTraining.csv/blob/main/Linear_Discriminant_Analysis_on_DiabetesTraining_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Question 2-(a)

In [1]:
import pandas as pd

# Read the diabetes training set from the Colab working directory
file_path = "/content/DiabetesTraining.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
data.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,80.0,0,1,never,25.19,6.6,140,0
2,Female,54.0,0,0,No Info,27.32,6.6,80,0
3,Male,28.0,0,0,never,27.32,5.7,158,0
4,Female,36.0,0,0,current,23.45,5.0,155,0


In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Select the numerical features and the binary class label
numerical_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
X = data[numerical_features]
y = data['diabetes']

# Standardize each feature: StandardScaler subtracts the feature mean and
# divides by the feature standard deviation. The centring shifts both classes
# by the same amount, so it changes neither the difference of class means nor
# the within-class scatter computed below.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Calculate mean vectors for each class
mean_vector_0 = np.mean(X_standardized[y == 0], axis=0)
mean_vector_1 = np.mean(X_standardized[y == 1], axis=0)

# Calculate within-class scatter matrix S_W as the sum of the per-class
# scatter matrices. np.cov normalises by (n - 1), so multiplying by (n - 1)
# recovers the unnormalised scatter of each class.
scatter_matrix_within = np.zeros((len(numerical_features), len(numerical_features)))
for label in [0, 1]:
    scatter_matrix_class = np.cov(X_standardized[y == label], rowvar=False) * (np.sum(y == label) - 1)
    scatter_matrix_within += scatter_matrix_class

# Calculate the linear discriminant vector w by solving S_W w = (mu_1 - mu_0).
# Solving the linear system avoids forming the explicit inverse of S_W.
w = np.linalg.solve(scatter_matrix_within, mean_vector_1 - mean_vector_0)
w


array([0.00039097, 0.00047507, 0.00130421, 0.00107939])

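The vector above is the linear discriminant direction $w = S_W^{-1}(\mu_1 - \mu_0)$, computed on the standardized features. Its components are the weights of `age`, `bmi`, `HbA1c_level` and `blood_glucose_level` (in that order) in the discriminant function $w^\top x$; a larger magnitude means the feature contributes more to separating the two classes. Only the direction of $w$ is meaningful: its overall scale is arbitrary.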

In [None]:
# End of Question 2-(a)

In [None]:
# Question 2-(b)

In [3]:
# Define the Gini impurity and information gain functions
def gini_impurity(labels):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a collection of labels.

    Parameters
    ----------
    labels : array-like
        Class labels of the samples in a node. Anything accepted by
        ``np.unique`` works (list, NumPy array, pandas Series).

    Returns
    -------
    float
        0 for a pure node, growing as the classes become more evenly mixed.
        An empty collection is treated as pure and yields 0.0.
    """
    _, counts = np.unique(labels, return_counts=True)
    total = counts.sum()
    # With no samples there are no class proportions; without this guard the
    # formula would evaluate to 1 - 0 = 1 and report an empty node as impure.
    if total == 0:
        return 0.0
    probabilities = counts / total
    return 1 - np.sum(probabilities ** 2)

def information_gain(parent, left_child, right_child):
    """Return the reduction in Gini impurity obtained by splitting ``parent``.

    Each child's impurity is weighted by its size relative to ``parent``; the
    weighted total is subtracted from the impurity of ``parent``.
    """
    total = len(parent)
    children_impurity = sum(
        (len(child) / total) * gini_impurity(child)
        for child in (left_child, right_child)
    )
    return gini_impurity(parent) - children_impurity

# Target column and the two binary attributes evaluated as candidate splits
labels, hypertension, heart_disease = (
    data[column] for column in ('diabetes', 'hypertension', 'heart_disease')
)

# Gini impurity of the full, unsplit dataset (the parent node)
initial_gini = gini_impurity(labels)

# Function to calculate Gini impurity after split for a given feature
def calculate_gini_after_split(feature):
    """Score a split of the module-level ``labels`` on a 0/1 ``feature``.

    Rows with ``feature == 0`` form the left child and rows with
    ``feature == 1`` form the right child; rows with any other value fall in
    neither child. Each child is weighted by its size over ``len(labels)``.

    Returns
    -------
    tuple
        ``(weighted impurity, left child impurity, right child impurity)``.
    """
    total = len(labels)
    zeros = labels[feature == 0]
    ones = labels[feature == 1]
    gini_zeros = gini_impurity(zeros)
    gini_ones = gini_impurity(ones)
    weighted = (len(zeros) / total) * gini_zeros + (len(ones) / total) * gini_ones
    return weighted, gini_zeros, gini_ones

# Score the split on each attribute. Each result is a tuple of
# (weighted impurity, impurity of the feature == 0 child,
#  impurity of the feature == 1 child).
hypertension_split = calculate_gini_after_split(hypertension)
heart_disease_split = calculate_gini_after_split(heart_disease)
gini_after_hypertension, gini_left_hyper, gini_right_hyper = hypertension_split
gini_after_heart_disease, gini_left_heart, gini_right_heart = heart_disease_split

# Information gain is the parent impurity minus the weighted child impurity
info_gain_hypertension = initial_gini - gini_after_hypertension
info_gain_heart_disease = initial_gini - gini_after_heart_disease

# Column-oriented summary: every list holds one entry per attribute
results = {
    "Attribute": ["Hypertension", "Heart Disease"],
    "Initial Gini Impurity": [initial_gini] * 2,
    "Gini Impurity After Split": [gini_after_hypertension, gini_after_heart_disease],
    "Gini Impurity Left Child": [gini_left_hyper, gini_left_heart],
    "Gini Impurity Right Child": [gini_right_hyper, gini_right_heart],
    "Information Gain": [info_gain_hypertension, info_gain_heart_disease],
}

# Render the summary as a table
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Attribute,Initial Gini Impurity,Gini Impurity After Split,Gini Impurity Left Child,Gini Impurity Right Child,Information Gain
0,Hypertension,0.14711,0.142956,0.125686,0.370239,0.004154
1,Heart Disease,0.14711,0.146014,0.139396,0.32,0.001096


In [None]:
# End of Question 2-(b)