---

**Load essential libraries**

---

In [None]:
import pandas as pd
import numpy as np
import sys
from scipy import linalg

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.datasets import make_blobs

from scipy.sparse import random
from scipy import stats

---

**The following code cell mounts the Google Drive folder for accessing data etc.**

---

In [None]:
## Resolve DATA_DIR; when running in Colab, mount the Google Drive folder first
if 'google.colab' not in sys.modules:
    # Local run: the data is read from a folder relative to the working directory
    DATA_DIR = 'Data/'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    # Adjust the path below (starting from
    # /content/drive/MyDrive/Colab Notebooks/) to match how the data is
    # organized inside your Colab Notebooks folder in Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/Workshops/Applied Machine Learning Workshop Canara Engineering College_August2023'
    DATA_DIR = f'{DIR}/Data/'

**Data Matrix**

Patient dataset corresponding to 4 patients and 3 features:

![Patient dataset](https://bl3302files.storage.live.com/y4mlspYO-L_1kEGpBOCUilkrcj3evQtgjGXDt6v2NgJwtsJf2OZVnwRnUht7CmW_wk8VMlMyGfhDqgRubB3pLHXAOe3r-pQ5wtYUuOqR_gsZzHWCqE2IEbhBjUZob5suLplmONyMsAjr1twDPK7eGODrKyav1dP1aX3lWx1YV0hiLvuTEZ7-GujIypTMkaSV2or?width=256&height=153&cropmode=none)

In [None]:
## Build the patient data matrix: 4 patients (rows) x 3 features (columns)
patient_records = {
    'HR': [76, 74, 72, 78],
    'BP': [126, 120, 118, 136],
    'Temp': [38, 38, 37.5, 37],
}
df_patient = pd.DataFrame(patient_records)
print(df_patient)
print('----------')

# The same values as a plain NumPy array
A = df_patient.to_numpy()
print(A)


---

**Dot Product of Vectors**

A scalar resulting from an elementwise multiplication and addition: $$a{\color{cyan}\cdot}b = {\color{red}{a_1b_1}}+{\color{green}{a_2b_2}}+\cdots+{\color{magenta}{a_nb_n}}$$

The <font color="cyan">dot</font> ${\color{cyan}\cdot}$ represents the computation of the dot product.

---

In [None]:
# Standard basis (unit) vectors with one entry per patient
e_1 = np.array([1, 0, 0, 0])
e_3 = np.array([0, 0, 1, 0])

# All-ones vector of the same length and dtype
o = np.ones_like(e_1)

# Heart rate column: the 1st feature for every patient
a_1 = A[:, 0]

# Heart rate of the 1st patient: direct indexing, then the dot product with
# e_1 in both argument orders (the dot product is symmetric)
for first_component in (a_1[0], np.dot(a_1, e_1), np.dot(e_1, a_1)):
    print(first_component)

# Heart rate of the 3rd patient, picked out by e_3
print(np.dot(a_1, e_3))

# Sum of heart rates over all patients: built-in sum vs. dot product with ones
print(a_1.sum())
print(np.dot(a_1, o))

# Average heart rate: scaled dot product with ones vs. built-in mean
print(np.dot(a_1, o) * (1/len(a_1)))
print(a_1.mean())

---

**Cauchy-Schwarz inequality**

For any two nonzero $n$-vectors $a,b,$ it is always true that $$-1\leq\frac{a\cdot b}{\lVert a\rVert\lVert b\rVert}\leq 1.$$

This is used to define the cosine of the angle between the vectors $a$ and $b$ as follows:$$\cos(\angle(a,b)) = \frac{a\cdot b}{\lVert a\rVert\lVert b\rVert}.$$

---

In [None]:
## Simulate a patient dataset
# Seed NumPy's global random generator so that every "Restart & Run All"
# draws the same samples and the numbers printed in later cells are
# reproducible. Change or remove the seed to see a different random draw.
np.random.seed(100)
nsamples = 100
# Heart rate ~ Normal(mean 74, sd 4) and blood pressure ~ Normal(mean 126, sd 8),
# drawn independently and rounded to whole numbers
df = pd.DataFrame({'Heart Rate (BPM)' : np.round(np.random.normal(74, 4, nsamples)),
                   'Blood Pressure (mm Hg)' : np.round(np.random.normal(126, 8, nsamples))})
print(df)

In [None]:
# Vectors of heart rate and blood pressure
a = df['Heart Rate (BPM)']
b = df['Blood Pressure (mm Hg)']
print(np.dot(a, b)) # Dot product of the raw vectors does not reveal much
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))) # cos(angle(a,b)) = a.b / (||a||*||b||)

# Mean-centered heart rate and blood pressure vectors
a_mc = a - np.mean(a)
b_mc = b - np.mean(b)
# Unnormalized covariance: dividing this sum by len(a) gives the (population)
# covariance between a and b
print(np.dot(a_mc, b_mc))
# Cosine of the angle between the centered vectors = correlation coefficient
print(np.dot(a_mc, b_mc) / (np.linalg.norm(a_mc) * np.linalg.norm(b_mc)))

fig, ax = plt.subplots(1)
ax.scatter(a, b, marker = 'o', color = 'blue')
ax.set_xlabel('Heart Rate (BPM)')
ax.set_ylabel('Blood Pressure (mm Hg)')
ax.set_title('Heart Rate vs. Blood Pressure')
# Both axes start at the origin so the raw (uncentered) vectors are seen
# relative to zero
ax.set_xlim([0, np.max(a)])
ax.set_ylim([0, np.max(b)]);

**Projection of vectors and its relationship to dot product**

![Vector projection](https://bl3302files.storage.live.com/y4miuCtKP9ptv6lIB8EqEU_u7cbEydy0UsEgHl4ECni2UVONtvKZgf73pIQ4vuA99ZHP8K96W_1i-QuhSIN12IudLaUTF3_jZzFqVfsaRK7QubMS9p5C1ErN6tB8I_UqQZnSY2JSGnu0IvJQrRcd2rX2Hzngfka3tCqJhbAMdElywcis2gRaoiuEGDVqaXpZYYp?width=256&height=209&cropmode=none)

In [None]:
# Population variance of a: the mean squared deviation from the mean
np.mean(np.square(a - np.mean(a)))

In [None]:
# Population standard deviation of a: square root of the mean squared deviation
np.sqrt(np.mean(np.square(a - np.mean(a))))

In [None]:
# Direction vector with equal weight on each of the 3 features
v = np.ones(3, dtype=int)

In [None]:
# Scalar projection of each patient (each row of A) onto the direction v
for patient in A:
    print(np.dot(patient, v) / np.linalg.norm(v))

In [None]:
# The same scalar projections for all patients at once (matrix-vector product)
(A @ v) / np.linalg.norm(v)

In [None]:
## Scalar projection of first patient onto a direction v
# Directions tried in turn: the HR axis, the BP axis, the all-ones diagonal
# and its opposite (which flips the sign of the projection)
first_patient = A[0]
for v in (np.array([1, 0, 0]),
          np.array([0, 1, 0]),
          np.array([1, 1, 1]),
          np.array([-1, -1, -1])):
    print(np.dot(first_patient, v) / np.linalg.norm(v))

In [None]:
## Scalar projection of all patients onto a direction v
# Each pass prints one value per patient: A v scaled by 1/||v||
directions = [[1, 0, 0], [0, 1, 0], [1, 1, 1], [-1, -1, -1]]
for v in map(np.array, directions):
    print(np.dot(A, v) * (1/np.linalg.norm(v)))

---

Load food-texture data

---

In [None]:
## Read the food-texture table, using its first column as the row index
FILE = f'{DATA_DIR}food-texture.csv'
df_food = pd.read_csv(FILE, index_col=0)
df_food.head()

In [None]:
## Show the row labels (index) and then the column labels of the dataframe
for axis_labels in (df_food.index, df_food.columns):
    print(axis_labels)

In [None]:
## Select every row of the 'Density' column (label-based indexing)
print(df_food.loc[:, 'Density'])

In [None]:
## Select the full row (all features) of the sample labelled B136
print(df_food.loc['B136'])

In [None]:
## Select a sub-table by label: rows B136 and B225,
## columns 'Oil' and 'Density'
sample_names = ['B136', 'B225']
feature_names = ['Oil', 'Density']
df_food.loc[sample_names, feature_names]

In [None]:
# Inspect the dtype pandas inferred for each column when reading the CSV
df_food.dtypes

In [None]:
## Create a list of continuous and categorical column names
continuous_cols = ['Oil', 'Density', 'Fracture', 'Hardness']
categorical_cols = ['Crispy']

# Typecasting: store the categorical column(s) with pandas' 'category' dtype
# and the continuous columns as 64-bit floats. Note that this overwrites the
# columns of df_food in place rather than creating a new frame.
df_food[categorical_cols] = df_food[categorical_cols].astype('category')
df_food[continuous_cols] = df_food[continuous_cols].astype('float64')

In [None]:
## Show the dataframe column types again to confirm the typecast above:
## 'Crispy' was cast to category and the continuous columns to float64
df_food.dtypes

---

Principal component analysis (PCA) of the dataset

---

In [None]:
# Continuous features as a plain NumPy array (rows = samples)
df_food[continuous_cols].to_numpy()

In [None]:
# Keep the continuous features in X and display the array
X = df_food[continuous_cols].to_numpy()
X

In [None]:
# Raw (unscaled) continuous features as a NumPy array
X = df_food[continuous_cols].to_numpy()

# Create the PCA object and fit it in one step (fit() returns the estimator)
pca = PCA(n_components=1).fit(X)

# Direction of the retained principal component (one row per component)
print(pca.components_)

# Variance of the data along that component
print(pca.explained_variance_)

In [None]:
# Column labels of the continuous-feature sub-frame; their order is the
# column order of the array X built from the same selection
df_food[continuous_cols].columns

In [None]:
# Display the 'Density' column of the dataframe
df_food['Density']

In [None]:
# Standardize every continuous feature (zero mean, unit variance per column)
sc = StandardScaler()
X = sc.fit_transform(df_food[continuous_cols].to_numpy())
print(X)

In [None]:
# Density values with their mean and (population) standard deviation
a = df_food['Density'].to_numpy()
print(a.mean())
print(a.std())
# Positions of the samples whose z-score is at least 1.5
np.nonzero((a - a.mean()) / a.std() >= 1.5)

In [None]:
# Histogram of the values held in `a` (the 'Density' column) with fixed-width bins
fig, ax = plt.subplots(1, 1, figsize = (10, 6))
fig.tight_layout(pad = 4.0)
binwidth = 20
# Bin edges start at the minimum and advance by binwidth; the stop value is
# pushed one bin past the maximum so the largest sample falls inside a bin
bin_edges = np.arange(np.min(a), np.max(a) + binwidth, binwidth)
ax.hist(a, bins = bin_edges)
# The plotted feature is 'Density' ('Oil' is a separate column of the dataset)
ax.set_xlabel('Density', fontsize = 14)
ax.set_ylabel('Count', fontsize = 14)
ax.set_title('Distribution of Density', fontsize = 14)
# Put the ticks on the bin edges so each bar's range can be read off directly
ax.set_xticks(bin_edges);

In [None]:
# Standardized continuous features as a NumPy array
sc = StandardScaler()
X = sc.fit_transform(df_food[continuous_cols].to_numpy())

# Create the PCA object and fit it in one step (fit() returns the estimator)
pca = PCA(n_components=1).fit(X)

# Direction of the retained principal component (one row per component)
print(pca.components_)

# Variance of the standardized data along that component
print(pca.explained_variance_)

In [None]:
# First principal direction as a flat vector (pca was fitted with a single
# component, so flattening components_ yields exactly that direction)
v = np.array(pca.components_).flatten()
print(v)
print(X[0, :])
# Score of the first sample along the principal direction. It is printed
# explicitly because a notebook cell only displays its LAST bare expression;
# left bare, this value would be computed and silently discarded.
print(np.dot(X[0, :], v))
# Scores of all samples along the principal direction (displayed as cell output)
np.dot(X, v)

In [None]:
# Standardized continuous features as a NumPy array
sc = StandardScaler()
X = sc.fit_transform(df_food[continuous_cols].to_numpy())

# Create the PCA object, keeping two components, and fit it in one step
pca = PCA(n_components=2).fit(X)

# Directions of the retained principal components (one row per component)
print(pca.components_)

# Variance of the standardized data along each component
print(pca.explained_variance_)

In [None]:
# Variance captured by each retained component, and the running total
print(pca.explained_variance_)
print(np.cumsum(pca.explained_variance_))
# Cumulative fraction of the total variance explained by the first k
# components, read from the fitted model instead of hand-copied numbers (which
# go stale whenever the data, the scaling or n_components changes).
# One line is printed per retained component; refit with a larger
# n_components to see further cumulative fractions.
for cumulative_ratio in np.cumsum(pca.explained_variance_ratio_):
    print(cumulative_ratio)

---

The following user-defined function draws an arrow between two points; it is used below to overlay the principal component vectors on a 2D scatter plot of the data.

---

In [None]:
def draw_vector(v0, v1, ax = None):
  """Draw an arrow from point v0 to point v1 on a matplotlib Axes.

  Parameters
  ----------
  v0 : sequence of two numbers
      (x, y) coordinates of the arrow's tail.
  v1 : sequence of two numbers
      (x, y) coordinates of the arrow's head.
  ax : matplotlib Axes, optional
      Axes to draw on. When None (the default) the current Axes, as returned
      by plt.gca(), is used.
  """
  # Explicit None check: only a missing ax falls back to the current Axes,
  # independent of how the passed object evaluates in a boolean context
  if ax is None:
    ax = plt.gca()
  arrowprops = dict(arrowstyle = '->',
                    linewidth = 2,
                    shrinkA = 0, shrinkB = 0)  # no gap at either end of the arrow
  # annotate() draws the arrow from xytext (tail) to xy (head); the empty
  # string means no text label is attached
  ax.annotate('', xy = v1, xytext = v0, arrowprops = arrowprops)

In [None]:
# Per-feature mean estimated by PCA from the data it was fitted on
# (X was standardized before fitting, so these values should be close to 0)
pca.mean_

In [None]:
## Plot data with principal component vectors
# The scatter shows only the first two standardized features (Oil, Density),
# while pca.mean_ and each row of pca.components_ have one entry per feature
# (four here). annotate() needs 2-D points, so the mean and every component
# are restricted to the same two plotted coordinates before drawing.
plot_dims = [0, 1]
plt.scatter(X[:, plot_dims[0]], X[:, plot_dims[1]], alpha=0.2)
origin = pca.mean_[plot_dims]
for length, vector in zip(pca.explained_variance_, pca.components_):
    # Arrow length: 3 standard deviations along the component
    # (sqrt of the explained variance is the standard deviation)
    v = vector[plot_dims] * 3 * np.sqrt(length)
    draw_vector(origin, origin + v)
plt.xlabel('Oil (standardized)')
plt.ylabel('Density (standardized)')
plt.axis('equal');

In [None]:
# Project every standardized sample onto the retained principal directions.
# X has one column per feature and pca.components_ has one ROW per component
# (shape: n_components x n_features), so the components are transposed to make
# the inner dimensions agree; the result has one column of scores per component.
# NOTE: this matches pca.transform(X) only up to the subtraction of pca.mean_,
# which should be ~0 here because X was standardized.
np.dot(X, np.array(pca.components_).T)