In [3]:
import numpy as np
import pandas as pd

In [4]:
# Data Collection
# Read the diabetes CSV (path is relative to the working directory) into a
# DataFrame, then show it as the cell's output.
csv_path = "diabetes_dataset.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
# Standardization
#
# Z-score every column of `data` (rows = samples, columns = features):
# subtract the column mean, then divide by the column standard deviation.
#
# NOTE(review): every column of `data` is standardized, including the
# `Outcome` label that appears in the dataset -- confirm that the label is
# meant to take part in the covariance / eigen steps that follow.

# 1. Calculate mean for each feature (column)
mean = np.mean(data, axis=0)

# 2. Calculate standard deviation for each feature (column)
# ddof=0 (population standard deviation) is NumPy's default and is spelled out
# here on purpose. The covariance cell divides by (n - 1), so the diagonal of
# that matrix comes out as n / (n - 1) rather than exactly 1.
std_dev = np.std(data, axis=0, ddof=0)

# 3. Standardize the dataset
# A constant column has a standard deviation of 0; dividing by it would fill
# the column with NaN. Such columns are divided by 1 instead, which leaves
# them at 0 after centering. `std_dev` itself is kept unmodified for display.
safe_std_dev = np.where(std_dev == 0, 1.0, std_dev)
standardized_data = (data - mean) / safe_std_dev

print("Original Data:")
print(data)
print("\nMean of each feature:")
print(mean)
print("\nStandard Deviation of each feature:")
print(std_dev)
print("\nStandardized Data:")
print(standardized_data)


Original Data:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1   

In [6]:
# Covariance Matrix Calculation

# Step 1: Calculate the covariance matrix
# Method 1: Using the formula X^T X / (n - 1). This equals the sample
# covariance only because the columns of `standardized_data` have already
# been centered by the standardization step.
n_samples = standardized_data.shape[0]
cov_matrix_manual = (standardized_data.T @ standardized_data) / (n_samples - 1)

# Method 2: Validate the manual result against NumPy's built-in function.
# rowvar=False tells np.cov that each column (not each row) is a variable.
# A small absolute tolerance keeps near-zero entries from failing on
# floating-point round-off alone.
cov_matrix_numpy = np.cov(standardized_data, rowvar=False)
np.testing.assert_allclose(
    np.asarray(cov_matrix_manual, dtype=float),
    cov_matrix_numpy,
    atol=1e-12,
    err_msg="manual covariance matrix disagrees with np.cov",
)

print("Standardized Data:")
print(standardized_data)
print("\nCovariance Matrix (Manual Calculation):")
print(cov_matrix_manual)

Standardized Data:
     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0       0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013   
1      -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422   
2       1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255   
3      -0.844885 -0.998208      -0.160546       0.154533  0.123302 -0.494043   
4      -1.141852  0.504055      -1.504687       0.907270  0.765836  1.409746   
..           ...       ...            ...            ...       ...       ...   
763     1.827813 -0.622642       0.356432       1.722735  0.870031  0.115169   
764    -0.547919  0.034598       0.046245       0.405445 -0.692891  0.610154   
765     0.342981  0.003301       0.149641       0.154533  0.279594 -0.735190   
766    -0.844885  0.159787      -0.470732      -1.288212 -0.692891 -0.240205   
767    -0.844885 -0.873019       0.046245       0.656358 -0.692891 -0.202129   

     DiabetesPedigre

In [9]:
# Eigenvalues and Eigenvectors

# Step 1: Calculate eigenvalues and eigenvectors
# The covariance matrix is symmetric (it is X^T X scaled by a constant), so
# the symmetric solver np.linalg.eigh is used instead of the general
# np.linalg.eig: it returns real eigenvalues and orthonormal eigenvectors.
# np.asarray hands the solver a plain ndarray, so the positional column
# slicing in step 2 works even when `cov_matrix_manual` is a DataFrame.
cov_values = np.asarray(cov_matrix_manual, dtype=float)
eigenvalues, eigenvectors = np.linalg.eigh(cov_values)

# Step 2: Sort eigenvalues and eigenvectors in descending order of eigenvalues
# Column i of `eigenvectors` belongs to eigenvalues[i], so the same
# permutation is applied to both.
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Display results
print("Covariance Matrix:")
print(cov_matrix_manual)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

Covariance Matrix:
                          Pregnancies   Glucose  BloodPressure  SkinThickness  \
Pregnancies                  1.001304  0.129627       0.141466      -0.081778   
Glucose                      0.129627  1.001304       0.152789       0.057403   
BloodPressure                0.141466  0.152789       1.001304       0.207641   
SkinThickness               -0.081778  0.057403       0.207641       1.001304   
Insulin                     -0.073630  0.331789       0.089049       0.437352   
BMI                          0.017706  0.221359       0.282173       0.393085   
DiabetesPedigreeFunction    -0.033566  0.137516       0.041319       0.184167   
Age                          0.545051  0.263858       0.239840      -0.114119   
Outcome                      0.222187  0.467190       0.065153       0.074850   

                           Insulin       BMI  DiabetesPedigreeFunction  \
Pregnancies              -0.073630  0.017706                 -0.033566   
Glucose               