In [None]:
import os
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Candidate locations for each dataset: a copy next to the notebook, or the Kaggle input mount.
Path1ToLocal = "heart_2020_cleaned.csv"
Path1ToKaggle = "/kaggle/input/personal-key-indicators-of-heart-disease/2020/heart_2020_cleaned.csv"

Path2ToLocal = "glass.csv"
Path2ToKaggle = "/kaggle/input/glass/glass.csv"

# Prefer the local copy when it exists; otherwise fall back to the Kaggle path.
if os.path.exists(Path1ToLocal):
    Path1 = Path1ToLocal
else:
    Path1 = Path1ToKaggle

if os.path.exists(Path2ToLocal):
    Path2 = Path2ToLocal
else:
    Path2 = Path2ToKaggle

In [None]:
# Load the first dataset from the path resolved above (local copy or Kaggle input) and display it.
data1 = pd.read_csv(Path1)
data1

In [None]:
# Encode the categorical columns as numbers.
# Series.map() turns every value that is absent from the mapping into NaN, and the dropna()
# further down would then silently discard those rows, so each mapping has to list every
# category of its column. For the same reason this cell is not idempotent: run it only once
# on freshly loaded data.
data1.Sex = data1.Sex.map({'Male': 1, 'Female': 0})

# Ordinal encoding of the age bands, youngest to oldest. All 13 bands of the dataset are
# listed (including '25-29', '30-34' and '35-39') so that no respondent is mapped to NaN.
age_bands = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
             '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']
data1.AgeCategory = data1.AgeCategory.map({band: rank for rank, band in enumerate(age_bands, start=1)})

# Race has no natural order, so it is one-hot encoded.
data1 = pd.get_dummies(data1, columns=['Race'])

# Borderline and pregnancy-only diabetes are folded into the plain No / Yes classes.
data1.Diabetic = data1.Diabetic.map({'Yes': 1, 'Yes (during pregnancy)': 1, 'No': 0, 'No, borderline diabetes': 0, })
data1.GenHealth = data1.GenHealth.map({'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1, })

# Plain Yes/No columns share a single mapping.
yes_no = {'Yes': 1, 'No': 0}
for binary_column in ['DiffWalking', 'SkinCancer', 'HeartDisease', 'Smoking', 'AlcoholDrinking',
                      'Stroke', 'PhysicalActivity', 'Asthma', 'KidneyDisease']:
    data1[binary_column] = data1[binary_column].map(yes_no)

In [None]:
data1.isnull().sum()

In [None]:
# Cast every column to float32 so that the frame is purely numeric.
data1 = data1.astype(np.float32)

In [None]:
# Remove (in place) every row that contains a NaN, e.g. one produced by map() for a
# category that is missing from the mapping dictionaries above.
data1.dropna(inplace=True)

In [None]:
# Keep the HeartDisease column aside before it is removed from the feature matrix.
y = data1['HeartDisease']

In [None]:
# Drop HeartDisease so that data1 contains only the remaining feature columns.
data1 = data1.drop(columns=['HeartDisease'])

In [None]:
# check the correlation between features
plt.figure(figsize=(20, 20))
sns.heatmap(data1.corr(), annot=True, cmap='coolwarm')

In [None]:
# Convert the DataFrame into a plain NumPy array; from here on data1 is an ndarray
# (column labels are no longer available).
data1 = np.array(data1)
data1

In [None]:
# Scale every feature to [0, 1], then subtract the per-feature mean. MinMaxScaler alone does
# not centre the data, while the PCA cells below treat X^T X / n as the covariance matrix,
# which is only valid for zero-mean columns. The mean is accumulated in float64 to avoid
# float32 round-off over a large number of rows.
scaler = MinMaxScaler()
X_scaled1 = scaler.fit_transform(data1)
X_centered1 = X_scaled1 - X_scaled1.mean(axis=0, dtype=np.float64)

# Applying Fuzzy C-Means Clustering

In [None]:
def fuzzy_c_means(data, num_clusters, fuzziness, max_iters=10, tol=1e-10):
    """Cluster ``data`` with the Fuzzy C-Means algorithm.

    Each iteration recomputes the centroids as membership-weighted means and then
    updates the memberships with

        u_ij = 1 / sum_k (d_ij / d_ik) ** (2 / (fuzziness - 1))

    so that the closer a sample is to a centroid, the larger its membership in it.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    num_clusters : int
        Number of clusters.
    fuzziness : float
        Fuzzifier; must be greater than 1.
    max_iters : int, default 10
        Maximum number of iterations.
    tol : float, default 1e-10
        Stop once the Frobenius norm of the change in the membership matrix
        falls below this value.

    Returns
    -------
    membership_matrix : ndarray of shape (n_samples, num_clusters)
        Memberships computed from the returned centroids; every row sums to 1.
    centroids : ndarray of shape (num_clusters, n_features)
        Cluster centres of the last iteration.

    Raises
    ------
    ValueError
        If ``max_iters`` is smaller than 1 (no centroids could be computed).
    """
    if max_iters < 1:
        raise ValueError("max_iters must be at least 1.")
    data = np.asarray(data, dtype=float)
    exponent = 2 / (fuzziness - 1)

    # Step 1: Initialize membership matrix (random rows normalised to sum to 1)
    membership_matrix = np.random.rand(data.shape[0], num_clusters)
    membership_matrix /= np.sum(membership_matrix, axis=1, keepdims=True)

    for _ in range(max_iters):
        # Step 2: Compute centroids as membership-weighted means of the samples
        weights = membership_matrix ** fuzziness
        centroids = (weights.T @ data) / np.sum(weights, axis=0)[:, np.newaxis]

        # Step 3: Calculate Euclidean distance of every sample to every centroid.
        # Looping over the (few) centroids keeps memory at O(n_samples * n_features).
        distances = np.stack(
            [np.linalg.norm(data - centroid, axis=1) for centroid in centroids], axis=1
        )

        # Step 4: Update membership matrix.
        # u_ij = d_ij^-p / sum_k d_ik^-p, which is the formula above rewritten; dividing by
        # the row minimum first keeps the powers in (0, 1] and so avoids overflow.
        on_centroid = distances == 0
        safe_distances = np.where(on_centroid, 1.0, distances)
        closeness = (safe_distances / safe_distances.min(axis=1, keepdims=True)) ** -exponent
        updated_membership_matrix = closeness / closeness.sum(axis=1, keepdims=True)

        # A sample lying exactly on one or more centroids belongs entirely to them.
        rows_on_centroid = on_centroid.any(axis=1)
        if rows_on_centroid.any():
            hits = on_centroid[rows_on_centroid].astype(float)
            updated_membership_matrix[rows_on_centroid] = hits / hits.sum(axis=1, keepdims=True)

        # Step 5: Check convergence (always keep the freshest memberships)
        change = np.linalg.norm(updated_membership_matrix - membership_matrix)
        membership_matrix = updated_membership_matrix
        if change < tol:
            break
    return membership_matrix, centroids


# Vectorization

In [None]:
def vectorized_fuzzy_c_means(data, num_clusters, fuzziness, max_iters=100, tol=1e-4):
    """Fuzzy C-Means clustering implemented with NumPy broadcasting.

    The membership update is u_ij = 1 / sum_k (d_ij / d_ik) ** (2 / (fuzziness - 1)).

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    num_clusters : int
        Number of clusters.
    fuzziness : float
        Fuzzifier; must be greater than 1.
    max_iters : int, default 100
        Maximum number of iterations.
    tol : float, default 1e-4
        Stop once the Frobenius norm of the change in the membership matrix
        falls below this value.

    Returns
    -------
    membership_matrix : ndarray of shape (n_samples, num_clusters)
        Memberships computed from the returned centroids; every row sums to 1.
    centroids : ndarray of shape (num_clusters, n_features)
        Cluster centres of the last iteration.

    Raises
    ------
    ValueError
        If ``max_iters`` is smaller than 1 (no centroids could be computed).
    """
    if max_iters < 1:
        raise ValueError("max_iters must be at least 1.")
    data = np.asarray(data, dtype=float)
    exponent = 2 / (fuzziness - 1)

    # Step 1: Initialize membership matrix (random rows normalised to sum to 1)
    membership_matrix = np.random.rand(data.shape[0], num_clusters)
    membership_matrix /= np.sum(membership_matrix, axis=1, keepdims=True)

    for _ in range(max_iters):
        # Step 2: Compute centroids
        weights = membership_matrix ** fuzziness
        centroids = np.dot(weights.T, data) / np.sum(weights, axis=0, keepdims=True).T

        # Step 3: Calculate Euclidean distance (n_samples x num_clusters)
        distances = np.linalg.norm(data[:, np.newaxis, :] - centroids, axis=2)

        # Step 4: Update membership matrix.
        # d_ij^-p / sum_k d_ik^-p equals 1 / sum_k (d_ij / d_ik)^p but never divides 0 by 0;
        # scaling by the row minimum keeps the powers in (0, 1].
        coincident = distances == 0
        safe_distances = np.where(coincident, 1.0, distances)
        closeness = (safe_distances / safe_distances.min(axis=1, keepdims=True)) ** -exponent
        updated_membership_matrix = closeness / closeness.sum(axis=1, keepdims=True)

        # A sample lying exactly on one or more centroids belongs entirely to them.
        rows_on_centroid = coincident.any(axis=1)
        if rows_on_centroid.any():
            hits = coincident[rows_on_centroid].astype(float)
            updated_membership_matrix[rows_on_centroid] = hits / hits.sum(axis=1, keepdims=True)

        # Step 5: Check convergence (always keep the freshest memberships)
        change = np.linalg.norm(updated_membership_matrix - membership_matrix)
        membership_matrix = updated_membership_matrix
        if change < tol:
            break
    return membership_matrix, centroids

In [None]:
# # Define the data
# data = np.array([[1, 2], [2, 3], [9, 4], [10, 1]])

In [None]:
num_clusters = 2
fuzziness = 2
# fuzzy_c_means draws its initial memberships from NumPy's global random generator;
# seed it so that re-running the notebook reproduces the same clustering.
np.random.seed(42)
# NOTE(review): this clusters the raw data1 array, not the min-max scaled copy — confirm that is intended.
membership_matrix, centroids = fuzzy_c_means(data1, num_clusters, fuzziness)

In [None]:
# Final Membership Matrix:
membership_matrix

In [None]:
# Final Centroids:
centroids

In [None]:
membership_matrix.shape, centroids.shape

# Applying PCA from Scratch

In [None]:
def manual_covariance_matrix(data):
    """Compute the sample covariance matrix of ``data`` one entry at a time.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)

    Returns
    -------
    covarianceMatrix : ndarray of shape (n_features, n_features)
        Entry (i, j) is the sum over samples of the product of the deviations of
        features i and j from their means, divided by ``n_samples - 1``.
    x_centered : ndarray of shape (n_samples, n_features)
        ``data`` with the per-feature mean subtracted.
    """
    feature_means = np.mean(data, axis=0)
    x_centered = data - feature_means
    sample_count, feature_count = data.shape[0], data.shape[1]

    covarianceMatrix = np.zeros((feature_count, feature_count))
    # Visit every (row, col) pair of features in row-major order.
    for row, col in np.ndindex(feature_count, feature_count):
        deviation_products = (data[:, row] - feature_means[row]) * (data[:, col] - feature_means[col])
        covarianceMatrix[row, col] = np.sum(deviation_products) / (sample_count - 1)

    return covarianceMatrix, x_centered

In [None]:
# X^T X / n: a (features x features) matrix.
# NOTE(review): this equals the covariance matrix only if every column of X_centered1
# has zero mean — confirm that the data was really centred before this cell.
covariance_matrix = np.dot(X_centered1.T, X_centered1) / len(X_centered1)

In [None]:
X_centered1.shape

In [None]:
plt.scatter(X_centered1[:, 0], X_centered1[:, 1])

In [None]:
def calculate_eigenvalues_and_eigenvectors(A, tol=1e-10, max_iter=1000000):
    """Estimate the dominant eigenpair of a square matrix by power iteration.

    Despite its plural name, the function returns a single eigenvalue (the one
    the iteration converges to, normally the largest in magnitude) and the
    matching unit-length eigenvector.

    Parameters
    ----------
    A : ndarray of shape (n, n)
        Square matrix.
    tol : float, default 1e-10
        Stop once two successive eigenvalue estimates differ by less than this.
    max_iter : int, default 1000000
        Maximum number of iterations.

    Returns
    -------
    lambdaMax : float
        Latest eigenvalue estimate (Rayleigh quotient of the returned vector).
    v : ndarray of shape (n,)
        Unit-length eigenvector estimate.

    Raises
    ------
    ValueError
        If ``A`` is not a square 2-D array.
    """
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError("Matrix A must be square.")
    lambdaMax = 0
    # Deterministic start vector (all ones)
    v = np.ones(A.shape[0])
    for _ in range(max_iter):
        # Update vector
        candidate = A @ v
        candidate_norm = np.linalg.norm(candidate)
        if candidate_norm == 0:
            # A maps the current vector to zero, so it is an eigenvector with eigenvalue 0;
            # normalising the zero vector would only produce NaNs.
            return 0.0, v / np.linalg.norm(v)
        # Normalize
        v = candidate / candidate_norm
        # Calculate eigenvalue (Rayleigh quotient, v has unit length)
        lambda_new = np.dot(A @ v, v)
        # Check convergence, but keep the newest estimate either way
        converged = abs(lambda_new - lambdaMax) < tol
        lambdaMax = lambda_new
        if converged:
            break
    return lambdaMax, v

In [None]:
# The covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues
# (returned in ascending order) and orthonormal eigenvectors stored as columns.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
# Reorder to descending eigenvalue so that column i of `eigenvectors` is the i-th principal
# direction and slicing the first k columns really selects the top-k components.
descending_order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending_order]
eigenvectors = eigenvectors[:, descending_order]
eigenvalues.max()

In [None]:
# NumPy stores eigenvectors as columns: column i belongs to eigenvalues[i].
eigenvectors[:, eigenvalues.argmax()]

In [None]:
# Rescale every eigenvector (one per column) to unit Euclidean length.
column_lengths = np.sqrt(np.sum(eigenvectors ** 2, axis=0))
normalized_eigenvectors = eigenvectors / column_lengths

# Q is the matrix whose columns are the unit-length eigenvectors.
Q = normalized_eigenvectors

# Show the dimensions of Q.
print(f"Matrix Q:\n{Q.shape}")

In [None]:
def transform_data(F, Q, num_components):
    """Project the rows of ``F`` onto the first ``num_components`` columns of ``Q``.

    Returns an array with one row per row of ``F`` and ``num_components`` columns.
    """
    leading_columns = Q[:, :num_components]
    projected = F @ leading_columns
    return projected

In [None]:
def inverse_transform(x_transformed, Q, num_components, mean=None):
    """Map projected data back to the original feature space.

    This undoes ``F @ Q[:, :num_components]`` for a matrix ``Q`` with orthonormal
    columns; the round trip is exact when all components are kept.

    Parameters
    ----------
    x_transformed : ndarray of shape (n_samples, num_components)
        Projected data.
    Q : ndarray of shape (n_features, n_features)
        Matrix whose columns are the orthonormal basis vectors.
    num_components : int
        Number of leading columns of ``Q`` that were used for the projection.
    mean : array-like of shape (n_features,), optional
        Per-feature mean that was subtracted from the data before it was
        projected. When given it is added back; by default nothing is added,
        which matches a projection of data that was not shifted.

    Returns
    -------
    ndarray of shape (n_samples, n_features)
        Reconstruction in the original feature space.
    """
    reconstructed = x_transformed @ Q[:, :num_components].T
    if mean is not None:
        # Only an offset removed in feature space may be restored here; the mean of the
        # projected data lives in component space and has a different length.
        reconstructed = reconstructed + mean
    return reconstructed

In [None]:
# Project onto all columns of Q (num_components equals the number of features), so no dimension is dropped here.
X_transformed = transform_data(X_centered1, Q, X_centered1.shape[1])

In [None]:
X_transformed.shape

In [None]:
plt.scatter(X_transformed[:, 0], X_transformed[:, 1])

In [None]:
# Map the full-rank projection back to feature space, again using every column of Q.
X_reconstructed = inverse_transform(X_transformed, Q, X_transformed.shape[1])

In [None]:
X_reconstructed.shape

In [None]:
plt.scatter(X_reconstructed[:, 0], X_reconstructed[:, 1])

In [None]:
# Candidate numbers of retained components: 1 .. n_features - 1
k_values = range(1, data1.shape[1])

# Initialize variables for tracking the best result
best_reconstruction_error = np.inf
best_Q_matrix = None
best_k = None

# Experiment with different numbers of retained components
for k in k_values:
    # Select the first 'k' normalized eigenvectors (columns of Q)
    selected_eigenvectors_Q_matrix = normalized_eigenvectors[:, :k]
    # Project the data: one row per sample, k columns
    F_prime1 = np.dot(selected_eigenvectors_Q_matrix.T, X_centered1.T).T

    # The columns of Q_k are orthonormal, so multiplying by Q_k maps the scores back
    # to feature space.
    Q_inverse = selected_eigenvectors_Q_matrix

    # Nothing was subtracted from X_centered1 before projecting, so nothing is added back.
    # (Adding the per-sample mean across features, as axis=1 would, shifts every feature
    # of a sample by the same unrelated constant and distorts the error.)
    reconstructed_F = np.dot(Q_inverse, F_prime1.T)
    reconstructed_F = reconstructed_F.T
    # Compute the reconstruction error
    reconstruction_error = np.linalg.norm(X_centered1 - reconstructed_F, 'fro')

    # Update the best result if needed
    if reconstruction_error < best_reconstruction_error:
        best_reconstruction_error = reconstruction_error
        best_Q_matrix = selected_eigenvectors_Q_matrix
        best_k = k
        print(f'k:{k}, reconstruction_error:{reconstruction_error}')

# Report the best Q matrix, best k, and reconstruction error
print("Best Q matrix (for the best k={0}):".format(best_k))
#print(best_Q_matrix)

print("\nReconstruction Error with the Best Q matrix:")
print(best_reconstruction_error)

In [None]:
best_Q_matrix.shape

In [None]:
# Number of components kept for the final projection (hard-coded; presumably the best k
# found by the search above — TODO confirm).
k = 21  # Set the desired number of components here
# Take the first k columns of the normalized eigenvector matrix.
# NOTE(review): these are the "top" k components only if the columns are ordered by
# descending eigenvalue — confirm.
selected_eigenvectors_Q_matrix = normalized_eigenvectors[:, :k]
# Project the data: (Q_k^T X^T)^T, i.e. one row per sample and k columns.
F_prime1 = np.dot(selected_eigenvectors_Q_matrix.T, X_centered1.T).T
# F_prime1 now holds the transformed (reduced) data.

In [None]:
F_prime1.shape

In [None]:
# The columns of Q_k are orthonormal, so multiplying by Q_k maps the scores back to feature space.
Q_inverse = selected_eigenvectors_Q_matrix

# Reconstruct the data from its projection. F_prime1 was computed without subtracting any
# offset, so none is added back (the per-sample mean across features, axis=1, is not a
# quantity that was ever removed).
reconstructed_F = np.dot(Q_inverse, F_prime1.T)
reconstructed_F = reconstructed_F.T
# reconstructed_F now has one row per sample and one column per original feature.

In [None]:
Q_inverse.shape

In [None]:
reconstructed_F.shape

# Fuzzy C-Means Clustering after Applying PCA with Best Q_matrix

In [None]:
num_clusters = 2
fuzziness = 2
# fuzzy_c_means draws its initial memberships from NumPy's global random generator;
# seed it so that re-running the notebook reproduces the same clustering.
np.random.seed(42)
membership_matrix, centroids = fuzzy_c_means(F_prime1, num_clusters, fuzziness)

In [None]:
# Final Membership Matrix:
membership_matrix

In [None]:
# Final Centroids:
centroids

In [None]:
centroids.shape

In [None]:
membership_matrix.shape

# data2

In [None]:
# Load the second dataset from the path resolved at the top of the notebook and display it.
data2 = pd.read_csv(Path2)
data2

In [None]:
# Remove the 'Type' column so that only the remaining columns are used as features
# (presumably 'Type' is the class label — confirm).
data2 = data2.drop(columns=['Type'])

In [None]:
# Remove (in place) every row that contains a missing value.
data2.dropna(inplace=True)

In [None]:
# check the correlation between features
plt.figure(figsize=(20, 20))
sns.heatmap(data2.corr(), annot=True, cmap='coolwarm')

In [None]:
data2.isnull().sum()

In [None]:
data2.shape

In [None]:
# Cast every column to float32 so that the frame is purely numeric.
data2 = data2.astype(np.float32)

In [None]:
# Convert the DataFrame into a plain NumPy array; from here on data2 is an ndarray.
data2 = np.array(data2)

In [None]:
# Scale every feature to [0, 1], then subtract the per-feature mean. MinMaxScaler alone does
# not centre the data, while the PCA cells below treat X^T X / n as the covariance matrix,
# which is only valid for zero-mean columns.
scaler = MinMaxScaler()
X_scaled2 = scaler.fit_transform(data2)
X_centered2 = X_scaled2 - X_scaled2.mean(axis=0, dtype=np.float64)

In [None]:
num_clusters = 7
fuzziness = 2
# fuzzy_c_means draws its initial memberships from NumPy's global random generator;
# seed it so that re-running the notebook reproduces the same clustering.
np.random.seed(42)
# NOTE(review): this clusters the raw data2 array, not the min-max scaled copy — confirm that is intended.
membership_matrix, centroids = fuzzy_c_means(data2, num_clusters, fuzziness)

In [None]:
# Final Membership Matrix:
membership_matrix

In [None]:
# Final Centroids:
centroids

In [None]:
membership_matrix.shape, centroids.shape

# Applying PCA from Scratch

In [None]:
# X^T X / n: a (features x features) matrix.
# NOTE(review): this equals the covariance matrix only if every column of X_centered2
# has zero mean — confirm that the data was really centred before this cell.
covariance_matrix = np.dot(X_centered2.T, X_centered2) / len(X_centered2)

In [None]:
plt.scatter(X_centered2[:, 0], X_centered2[:, 1])

In [None]:
# The covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues
# (returned in ascending order) and orthonormal eigenvectors stored as columns.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
# Reorder to descending eigenvalue so that column i of `eigenvectors` is the i-th principal
# direction and slicing the first k columns really selects the top-k components.
descending_order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending_order]
eigenvectors = eigenvectors[:, descending_order]
eigenvalues.max()

In [None]:
# NumPy stores eigenvectors as columns: column i belongs to eigenvalues[i].
eigenvectors[:, eigenvalues.argmax()]

In [None]:
# Rescale every eigenvector (one per column) to unit Euclidean length.
column_lengths = np.sqrt(np.sum(eigenvectors ** 2, axis=0))
normalized_eigenvectors = eigenvectors / column_lengths

# Q is the matrix whose columns are the unit-length eigenvectors.
Q = normalized_eigenvectors

# Show the dimensions of Q.
print(f"Matrix Q:\n{Q.shape}")

In [None]:
# Project onto all columns of Q (num_components equals the number of features), so no dimension is dropped here.
X_transformed = transform_data(X_centered2, Q, X_centered2.shape[1])

In [None]:
X_transformed.shape

In [None]:
plt.scatter(X_transformed[:, 0], X_transformed[:, 1])

In [None]:
# Map the full-rank projection back to feature space, again using every column of Q.
X_reconstructed = inverse_transform(X_transformed, Q, X_transformed.shape[1])

In [None]:
X_reconstructed.shape

In [None]:
plt.scatter(X_reconstructed[:, 0], X_reconstructed[:, 1])

In [None]:
# Candidate numbers of retained components: 1 .. n_features - 1
k_values = range(1, data2.shape[1])

# Initialize variables for tracking the best result
best_reconstruction_error = np.inf
best_Q_matrix = None
best_k = None

# Experiment with different numbers of retained components
for k in k_values:
    # Select the first 'k' normalized eigenvectors (columns of Q)
    selected_eigenvectors_Q_matrix = normalized_eigenvectors[:, :k]
    # Project the data: one row per sample, k columns
    F_prime2 = np.dot(selected_eigenvectors_Q_matrix.T, X_centered2.T).T

    # The columns of Q_k are orthonormal, so multiplying by Q_k maps the scores back
    # to feature space.
    Q_inverse = selected_eigenvectors_Q_matrix

    # Nothing was subtracted from X_centered2 before projecting, so nothing is added back.
    # (Adding the per-sample mean across features, as axis=1 would, shifts every feature
    # of a sample by the same unrelated constant and distorts the error.)
    reconstructed_F = np.dot(Q_inverse, F_prime2.T)
    reconstructed_F = reconstructed_F.T
    # Compute the reconstruction error
    reconstruction_error = np.linalg.norm(X_centered2 - reconstructed_F, 'fro')

    # Update the best result if needed
    if reconstruction_error < best_reconstruction_error:
        best_reconstruction_error = reconstruction_error
        best_Q_matrix = selected_eigenvectors_Q_matrix
        best_k = k
        print(f'k:{k}, reconstruction_error:{reconstruction_error}')

# Report the best Q matrix, best k, and reconstruction error
print("Best Q matrix (for the best k={0}):".format(best_k))
#print(best_Q_matrix)

print("\nReconstruction Error with the Best Q matrix:")
print(best_reconstruction_error)

In [None]:
best_Q_matrix.shape

In [None]:
# Number of components kept for the final projection (hard-coded; presumably the best k
# found by the search above — TODO confirm).
k = 8  # Set the desired number of components here

# Take the first k columns of the normalized eigenvector matrix.
# NOTE(review): these are the "top" k components only if the columns are ordered by
# descending eigenvalue — confirm.
selected_eigenvectors_Q_matrix = normalized_eigenvectors[:, :k]

# Project the data: (Q_k^T X^T)^T, i.e. one row per sample and k columns.
F_prime2 = np.dot(selected_eigenvectors_Q_matrix.T, X_centered2.T).T

In [None]:
F_prime2.shape  # (number of samples, number of reduced features)

In [None]:
# The columns of Q_k are orthonormal, so multiplying by Q_k maps the scores back to feature space.
Q_inverse = selected_eigenvectors_Q_matrix

# Reconstruct the data from its projection. F_prime2 was computed without subtracting any
# offset, so none is added back (the per-sample mean across features, axis=1, is not a
# quantity that was ever removed).
reconstructed_F = np.dot(Q_inverse, F_prime2.T)
reconstructed_F = reconstructed_F.T

In [None]:
Q_inverse.shape

In [None]:
reconstructed_F.shape

In [None]:
num_clusters = 7
fuzziness = 2
# fuzzy_c_means draws its initial memberships from NumPy's global random generator;
# seed it so that re-running the notebook reproduces the same clustering.
np.random.seed(42)
membership_matrix, centroids = fuzzy_c_means(F_prime2, num_clusters, fuzziness)

In [None]:
# Final Membership Matrix:
membership_matrix

In [None]:
# Final Centroids:
centroids

In [None]:
membership_matrix.shape, centroids.shape


# Report for Phase 1
- The input data has 22 features and 319,795 samples.
- The covariance matrix is 22×22 (we computed it with two different methods and obtained the same result).
- There are 22 eigenvalues, and the eigenvector matrix is 22×22 (we computed them both with the built-in method and with our from-scratch method and obtained the same results).
- We created the matrix Q by sorting the normalized eigenvectors in descending order of eigenvalues.
- We tried different numbers of components (k); the best result is k = 21, because it gives the lowest reconstruction error.
- Best reconstruction error: 1625.209 (it decreased from 5171.140 to 1625.209).
- We projected the original matrix onto the principal components, $F' = Q\,(F - m)$, and reconstructed it with the inverse transform, $F = Q^{-1} F' + m$, where $m$ is the mean of the data.
- We truncated Q by keeping only some of its columns, so the transformed matrix F' has fewer dimensions than the original F (dimensionality reduction).
- The code iterates over different numbers of retained components (k_values) and reconstructs the data using the selected principal components. The reconstruction error is computed for each iteration.
- Best results: the best results are reported based on the minimum reconstruction error. The optimal number of retained components (best_k), the corresponding Q matrix (best_Q_matrix), and the minimum reconstruction error are printed.
- This report provides insights into the analysis, including the sorted eigenvalues and eigenvectors, the normalization of eigenvectors, and the determination of the best reconstruction using PCA. The output will show the best Q matrix, the optimal number of retained components, and the associated reconstruction error.
---


# Report for Phase 2


* When PCA is applied before Fuzzy C-means clustering, the algorithm appears to converge more efficiently, with a notable improvement in convergence quality compared to clustering the full-dimensional dataset. This is particularly evident when the dataset is smaller and simpler than our own.



 


* Fuzzy C-means (FCM):
* It divides the data points into a set of clusters using a membership function between each point in the dataset and every cluster centroid.
* A data point may be a member of several clusters, with a different membership value for each.
* Input for FCM: the data and C (the number of clusters).
* Output for FCM: $M_{ij}$ (the membership matrix) and $C_j$ (the cluster centroids), with $1 \le j \le C$ and $1 \le i \le n$.
  

* Steps of Fuzzy C-means (FCM):
1. Choose the number of clusters C such that 2 <= C <= N (N: number of samples).
2. Choose an appropriate level of cluster fuzziness g such that g > 1.
3. Initialize the N×C membership matrix [M] at random such that every Mij ∈ [0.0, 1.0] and the memberships of each sample sum to 1.
4. Compute the centroids.
5. Calculate the Euclidean distance dij between the i-th data point and the j-th cluster center.
6. Update the fuzzy membership matrix [M] according to dij.
7. Repeat steps 4–6 until the change in [M] is less than a pre-specified tolerance.


 






## We used two datasets
### The first one has low correlation between its features, so the membership matrix converged very slowly (more iterations).
### The second one has relatively high correlation between its features, so the membership matrix converged relatively fast.

Team Member: Abdelrahman Amin (20200311) / Mohamed Hisham (20200483) / Abdelrahman Mostafa (20200827) / Yossef Mohamed (20200669) / Esraa Abdelmoneam (20201015)