In [1]:
# Import libraries
using DataFrames
import Downloads
using LinearAlgebra
using Plots
using Statistics
using XLSX

In [3]:
# Download and extract the dataset.
# Nothing is fetched when the spreadsheet is already present on disk.
dataset_file = "DryBeanDataset/Dry_Bean_Dataset.xlsx"

if !isfile(dataset_file)
    # `let` keeps the archive name out of the notebook's global namespace.
    let archive = "DryBeanDataset.zip"
        try
            # Use the Downloads stdlib so no external `wget` binary is required.
            Downloads.download(
                "https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip",
                archive,
            )
            # Julia ships no zip reader in its stdlib, so extraction still uses `unzip`.
            run(`unzip $archive`)
        finally
            # Remove the archive even when the download or the extraction fails.
            rm(archive; force=true)
        end
    end
end

In [None]:
# Import the dataset
# `infer_eltypes=true` is requested explicitly so that the columns get concrete
# element types independently of the default used by the installed XLSX.jl version.
sheet = "Dry_Beans_Dataset"
df = DataFrame(XLSX.readtable(dataset_file, sheet; infer_eltypes=true))
df

In [None]:
# Extracting data matrix
# Every column except the last one is used as a feature (the last column is
# presumably the `Class` label). Converting to Float64 guarantees a numeric matrix
# for the statistics and eigen-decomposition performed in the following cells.
m = Matrix{Float64}(df[:, begin:end-1])

# Number of observations (rows). Read from `m`: `m_standardized` is only created
# in a later cell, so referring to it here raised an UndefVarError.
n = size(m, 1)

In [None]:
# Centering and rescaling features
# Per-column statistics (1×p row vectors). They are deliberately NOT named `mean`
# and `std`: such globals would shadow the functions exported by Statistics and
# make any later unqualified `mean(...)` / `std(...)` call fail.
feature_mean = Statistics.mean(m, dims=1)
feature_std = Statistics.std(m, dims=1)

# Broadcast the row vectors over all rows: subtract the column mean, then divide
# by the column standard deviation.
m_standardized = (m .- feature_mean) ./ feature_std

In [None]:
# Computing the covariance matrix of the standardized features (p×p).
# NOTE(review): the features were scaled with `Statistics.std`, whose default is the
# (n-1)-corrected estimator, while this divides by n — so the diagonal is (n-1)/n
# rather than exactly 1. Confirm which normalisation is intended.
c = (transpose(m_standardized) * m_standardized) / n

# Compute eigenvalues and eigenvectors.
# Wrapping in `Symmetric` selects the symmetric eigensolver, which returns real
# eigenvalues in ascending order with the matching eigenvectors as columns.
eig = eigen(Symmetric(c))

# Put the largest eigenvalue first. Only the COLUMN order of the eigenvector matrix
# may be reversed (dims=2): `reverse` without `dims` also flips the rows, which
# scrambles the components of every eigenvector.
eigenvectors = reverse(eig.vectors, dims=2)
eigenvalues = reverse(eig.values)

In [None]:
# Plotting the eigenvalue spectrum: one marker per eigenvalue on a log10 y-axis.
let
    smallest, largest = extrema(eigenvalues)
    # Whole powers of ten covering the eigenvalue range, from the top decade down.
    decades = ceil(log10(largest)):-1:floor(log10(smallest))

    scatter(
        eigenvalues;
        label="Eigenvalues",
        xticks=1:length(eigenvalues),
        yscale=:log10,
        yticks=10.0 .^ decades,
        title="Eigenvalue spectrum",
    )
end

In [None]:
# Computing principal components of the dataset:
# project the standardized observations (rows) onto the eigenvector columns.
# (Fixes the misspelled `eigenvctors`, which raised an UndefVarError.)
pc = m_standardized * eigenvectors

In [None]:
# Plotting first two principal components, one colour per class
colors = ("blue", "crimson", "green", "gray", "purple", "orange", "cyan")

# Fully transparent base layer over all observations: it creates the figure with
# its labels and title, and the per-class layers below are drawn on top of it.
pc_plot = scatter(
    pc[:, 1],
    pc[:, 2];
    alpha=0.0,
    xlabel="PC1",
    ylabel="PC2",
    title="First two PCs",
    label=nothing,
)

for (k, bean_class) in enumerate(unique(df.Class))
    # Row positions of the observations that belong to this class.
    rows = findall(df.Class .== bean_class)

    scatter!(
        pc[rows, 1],
        pc[rows, 2];
        color=colors[k],
        label=bean_class,
        alpha=0.8,
    )
end

pc_plot

In [None]:
# Quick numerical checks on the principal components.
@views begin
    # Sum of element-wise products of the PC1 and PC2 columns
    # (presumably expected to be close to zero — confirm).
    sum(pc[:, 1] .* pc[:, 2])
    # Mean of the tenth principal-component column.
    Statistics.mean(pc[:, 10])
end