# PCA (Principal Component Analysis)

### Necessary Imports

In [None]:
import pandas as pd
import os
from google.colab import drive
import pandas as pd
import numpy as np
import random
import copy
import matplotlib.pyplot as plt
import time
# Make Google Drive visible to the Colab runtime; every path below lives on it.
drive.mount('/content/drive')

# Project layout on Drive.
MY_DRIVE_PATH = "/content/drive/MyDrive/MLProject_2"
DATA_FOLDER = os.path.join(MY_DRIVE_PATH, 'Data google sheet')
PROCESSED_CSV_FILE = os.path.join(DATA_FOLDER, 'Processed_Fruits_Data.csv')
ONEHOT_CSV_FILE = os.path.join(DATA_FOLDER, 'One_Hot_Processed_Fruits_Data.csv')

# Fix the stdlib RNG seed so any later use of `random` is repeatable.
random.seed(42)

# Load the one-hot encoded dataset (the file is semicolon-separated).
df = pd.read_csv(ONEHOT_CSV_FILE, sep=";")

Mounted at /content/drive


### Normalizing and Removing Unnecessary Columns

In [None]:
# Preprocessing
# Keep the target aside, then drop it together with the raw text/path columns
# so that only model features remain in `df`.
original_labels = df["Fruit"].values
df.drop(columns=["Image_path", "Text", "Label", "Fruit"], inplace=True)

categories = ['banana', 'tomato', 'apple', 'orange', 'tangerine']

# Column groups, identified by name.
numerical_cols = ["Weight", "Price"]
image_cols = [column for column in df.columns if "img" in column]

# Normalizing every image histogram so that each row sums to 1.
# Done in one vectorised step instead of a per-row `df.loc` loop: it is much
# faster, does not depend on the index being 0..n-1, and replaces the columns
# wholesale rather than writing floats row by row into columns that may have
# been parsed as integers.
if image_cols:
    hist_values = df[image_cols].to_numpy(dtype=np.float64)
    totals = hist_values.sum(axis=1, keepdims=True)
    # Rows whose total is not strictly positive become all zeros, exactly as
    # the original loop's `else` branch did.
    df[image_cols] = np.divide(
        hist_values,
        totals,
        out=np.zeros_like(hist_values),
        where=totals > 0,
    )

text_cols = [column for column in df.columns if "text" in column]
columns_to_normalize = numerical_cols + image_cols + text_cols

# Everything that is not z-scored below is treated as categorical
# (presumably the one-hot columns). "Fruit" needs no explicit exclusion here:
# it was already dropped from `df` above.
_scaled = set(columns_to_normalize)
categorical_cols = [column for column in df.columns if column not in _scaled]

# Z-score standardisation: zero mean, unit (sample) standard deviation.
epsilon = 1e-8  # To prevent division by zero on constant columns
for column in columns_to_normalize:
    mean = df[column].mean()
    std = df[column].std()
    df[column] = (df[column] - mean) / (std + epsilon)

## Extracting Principal Components

We follow the steps listed on page 23 of the PCA slides:

1.   Normalizing the dataset.
> Already done in past cells.
2.   Finding the covariance matrix.
> This is done by np.cov(). We use rowvar = False since features are stored as columns.
3.   Computing eigenvectors
> np.linalg.eigh() gives this. We use eigh since we know that our matrix is symmetric (hermitian), therefore we can use this faster function.
4.   Use the first d eigenvectors to form d Principal Components.
> We sort according to eigenvalues and extract eigenvectors according to it.
5.   The transformation matrix has these eigenvectors as its columns; multiplying the data by it gives the projected data.
> Selecting first k columns in this example.

## Explained Variance and Reconstruction Error

For our dataset we chose 45 principal components, since together they explain 83.7% of the total variance.

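We calculate the explained variance of the first $k$ components with this formula:
$$\frac{\sum_{i=1}^{k} \lambda_i}{\sum_{j=1}^{n} \lambda_j}$$
where $\lambda_i$ are the eigenvalues of the covariance matrix sorted in descending order, i.e. the variances along our PCs, and $n$ is the total number of features.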

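For the reconstruction error, we multiply the projected data by the transpose of the transformation matrix to map it back to the original feature space. The error is the Euclidean (L2) norm of the difference between each original sample and its reconstruction, averaged over all samples. This mean error is 7.89, which is relatively low for 492 dimensions.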




In [None]:
# PCA by eigen-decomposition of the covariance matrix.
# Rows of X are samples, columns are features.
X = df.to_numpy(dtype=np.float64)

# np.cov subtracts each column's mean internally, so the eigenvectors describe
# *centred* data. Only the numerical/image/text columns were z-scored earlier;
# any remaining columns still carry a non-zero mean. Centre explicitly so the
# projection and the reconstruction are consistent with the covariance matrix.
feature_means = X.mean(axis=0)
X_centered = X - feature_means

covariance_arr = np.cov(X_centered, rowvar=False)
# eigh: the covariance matrix is symmetric, so this is faster and more stable
# than the general eig and returns real eigenvalues.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_arr)
sorted_indices = np.argsort(eigenvalues)[::-1]  # eigh returns ascending order, we want descending
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]  # Eigenvectors are stored as columns

k = 45  # Number of components we want
if not 1 <= k <= eigenvectors.shape[1]:
    raise ValueError(
        f"k must be between 1 and the number of features ({eigenvectors.shape[1]}), got {k}"
    )
selected_eigenvectors = eigenvectors[:, :k]

# Projecting the (centred) data onto the top-k principal directions
PCA = X_centered @ selected_eigenvectors

# Explained variance: sum of the k largest eigenvalues over the sum of all eigenvalues
explained_variance = np.sum(eigenvalues[:k]) / np.sum(eigenvalues)
print(f"Explained variance: {explained_variance}")

# Reconstruction error: map the scores back to feature space, restore the
# column means, and average the per-sample Euclidean distance to the original.
reconstructed = PCA @ selected_eigenvectors.T + feature_means
reconstruction_error = np.mean(np.linalg.norm(X - reconstructed, axis=1))
print(f"Reconstruction error: {reconstruction_error}")

Explained variance: 0.8374878590855579
Reconstruction error: 7.898349278239787


### Saving the File for Later Clustering

In [None]:
# One column name per retained component: PC1 ... PCk.
pca_columns = ['PC' + str(component + 1) for component in range(k)]

# Wrap the projected data in a DataFrame and put the fruit labels (set aside
# during preprocessing) back as the last column.
df_pca = pd.DataFrame(PCA, columns=pca_columns).assign(Fruit=original_labels)

# Persist next to the other processed datasets, without the row index.
PCA_CSV_PATH = os.path.join(DATA_FOLDER, 'PCA_Processed_Fruits.csv')
df_pca.to_csv(PCA_CSV_PATH, index=False)
print("Saved PCA data with Fruit labels")

Saved PCA data with Fruit labels
