In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import plotly.express as px

Во прилог ќе бидат практично применети концептите за намалување на димензионалноста со помош на Principal Component Analysis (PCA).

# Дефинирање на податочното множество

Податочното множество кое ќе биде користено е Heart Disease UCI, достапно на следниот [линк](https://www.kaggle.com/ronitf/heart-disease-uci). Станува збор за податочно множество со `14` променливи(карактеристики) па векторите на нашите податоци се од `14` димензионален векторски простор.

Наша цел е да ги визуелизираме овие податоци, со цел увидеме дали постојат повеќе кластери на испитаници. Но, за да направиме визуелизација мора да имаме најмногу `3` димензии.

In [2]:
dataset = pd.read_csv('datasets/heart.csv')
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Во нашето податочно множество, немаме променливи(карактеристики) за кои недостасуваат вредности за одредени примероци.

In [3]:
dataset.isna().any().any()

False

Сите вредности се нумерички, па нема потреба од нивно енкодирање.

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


Податоците претставени како матрица:

In [5]:
dataset.values

array([[63.,  1.,  3., ...,  0.,  1.,  1.],
       [37.,  1.,  2., ...,  0.,  2.,  1.],
       [41.,  0.,  1., ...,  0.,  2.,  1.],
       ...,
       [68.,  1.,  0., ...,  2.,  3.,  0.],
       [57.,  1.,  0., ...,  1.,  3.,  0.],
       [57.,  0.,  1., ...,  1.,  2.,  0.]])

# Чекори од Principal Component Analysis

## Чекор 1: Стандардизација

In [6]:
scaler = StandardScaler()

standardized_dataset = pd.DataFrame(scaler.fit_transform(dataset.values))
standardized_dataset.columns = dataset.columns

In [7]:
standardized_dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873,0.914529
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922,0.914529
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922,0.914529
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922,0.914529
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922,0.914529


## Чекор 2: Пресметка на матрица на коваријанси

In [8]:
standardized_dataset.cov()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,1.003311,-0.098773,-0.06888,0.280276,0.214385,0.121709,-0.116596,-0.399842,0.097121,0.210708,-0.169373,0.277241,0.068227,-0.226185
sex,-0.098773,1.003311,-0.049516,-0.056957,-0.198568,0.045181,-0.058389,-0.044166,0.142133,0.096411,-0.030812,0.118653,0.210737,-0.281867
cp,-0.06888,-0.049516,1.003311,0.047765,-0.077159,0.094757,0.044568,0.296741,-0.395586,-0.149724,0.120113,-0.181653,-0.162271,0.435235
trestbps,0.280276,-0.056957,0.047765,1.003311,0.123582,0.178118,-0.114481,-0.046852,0.06784,0.193856,-0.121877,0.101725,0.062416,-0.145411
chol,0.214385,-0.198568,-0.077159,0.123582,1.003311,0.013338,-0.15154,-0.009973,0.067245,0.054131,-0.004051,0.070744,0.09913,-0.085521
fbs,0.121709,0.045181,0.094757,0.178118,0.013338,1.003311,-0.084468,-0.008595,0.02575,0.005766,-0.060093,0.138436,-0.032125,-0.028139
restecg,-0.116596,-0.058389,0.044568,-0.114481,-0.15154,-0.084468,1.003311,0.04427,-0.070967,-0.058965,0.093353,-0.072281,-0.012021,0.137684
thalach,-0.399842,-0.044166,0.296741,-0.046852,-0.009973,-0.008595,0.04427,1.003311,-0.380066,-0.345327,0.388065,-0.213883,-0.096758,0.423137
exang,0.097121,0.142133,-0.395586,0.06784,0.067245,0.02575,-0.070967,-0.380066,1.003311,0.289177,-0.258602,0.116123,0.207438,-0.438203
oldpeak,0.210708,0.096411,-0.149724,0.193856,0.054131,0.005766,-0.058965,-0.345327,0.289177,1.003311,-0.579449,0.22342,0.21094,-0.432122


In [9]:
cov_matrix = np.matrix(standardized_dataset.cov().to_numpy())

## Чекор 3: Пресметка на сопствени вредности и вектори матрицата на ковариjанси

За пресметка на сопствените вредности ќе користиме `np.linalg`, и основна функција за нивно сортирање.

In [10]:
eigen_values = -np.sort(-np.linalg.eigvals(cov_matrix))

In [11]:
eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)

idx = eigen_values.argsort()[::-1]   
eigen_values = eigen_values[idx]
eigen_vectors = eigen_vectors[:,idx]

## Чекор 4: Избор на компоненти и добивање на feature вектор

Bo Чекор 4, и Чекор 5, меѓу-резултантните матрици се веќе транспонирани поради defaults, па за добивање на резултатот ќе ја кориситиме следната линеарна трансформација


$FinalDataSet = (FeatureVector \times StandardizedDataSet^T)^T$ 


Како што беше дискутирано претходно, за одбирање на компоненти имаме две опции, и тоа:
    
- дефинирање праг на значајност
    
- априори дефинирање на димензија до која сакаме да ги намалиме податоците

Во овој пример априори ќе ја дефинираме димензијата, и ќе одбереме таа да биде `3`, бидејќи крајната цел е да ги визуелизираме податоците.

In [12]:
TARGET_DIMENSION = 3
feature_vector = eigen_vectors[:TARGET_DIMENSION]

In [13]:
feature_vector.shape

(3, 14)

## Чекор 5: Намалување на димензиjата

In [14]:
final_dataset = np.transpose(np.matmul(feature_vector, np.transpose(standardized_dataset)))

In [15]:
final_dataset

Unnamed: 0,0,1,2
0,-0.286687,0.110460,-1.914001
1,-0.467832,0.684439,-0.893856
2,-1.265855,0.513491,0.352265
3,-0.385241,-0.656087,-0.169573
4,-1.396762,1.160139,0.477217
...,...,...,...
298,-0.416647,0.652530,0.303550
299,-0.376835,-2.061995,-0.526859
300,1.121740,1.366236,-0.713181
301,1.648491,0.125585,0.280630


# Визуелизација на податоците

Сега нашите податоци се со помала димензија, односно векторите кои ги репрезентираат се од `3` димензионален векторски простор, па истите можеме да ги визуелизираме.

In [17]:
fig = px.scatter_3d(final_dataset, x=0, y=1, z=2, 
                    title="Визуелизација на податоците во 3Д простор",
                    labels={str(i):f"PC: {i}" for i in range(0,TARGET_DIMENSION)},
                    width=800, 
                    height=800
                   )
fig.show()