# Dimensionality Reduction

## PCA (Principal Component Analysis)

In [2]:
# Data prep: load the cleaned entertainment dataset.
# Each row holds a name plus books / tv_shows / video_games values.
import pandas as pd

# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('Data/entertainment_clean.csv')
df.head()

Unnamed: 0,name,books,tv_shows,video_games
0,Aaliyah,0.5,4.6,4.9
1,Abigail,0.0,4.5,4.8
2,Addison,0.5,4.5,5.0
3,Adeline,3.5,4.5,6.6
4,Alana,2.8,3.8,5.6


In [4]:
# Keep only the numeric feature columns for PCA.
# The identifier column is dropped by label rather than by position
# (df.iloc[:, 1:]) so this cell stays correct even if the column order
# of the CSV changes.
data = df.drop(columns='name')
data.head()

Unnamed: 0,books,tv_shows,video_games
0,0.5,4.6,4.9
1,0.0,4.5,4.8
2,0.5,4.5,5.0
3,3.5,4.5,6.6
4,2.8,3.8,5.6


In [6]:
data.dtypes # every remaining column is float64, so the frame can be passed to PCA as-is

books          float64
tv_shows       float64
video_games    float64
dtype: object

In [8]:
# Per-column means; these are the values subtracted when centering the data.
data.mean()

books          2.993333
tv_shows       4.586000
video_games    5.843333
dtype: float64

In [10]:
# Center every feature on zero: the Series of column means is aligned to the
# column labels and subtracted from each row.
df_centered = data.sub(data.mean(), axis='columns')
df_centered.head()

Unnamed: 0,books,tv_shows,video_games
0,-2.493333,0.014,-0.943333
1,-2.993333,-0.086,-1.043333
2,-2.493333,-0.086,-0.843333
3,0.506667,-0.086,0.756667
4,-0.193333,-0.786,-0.243333


In [12]:
df_centered.mean() # Means are on the order of 1e-16, i.e. zero up to floating-point round-off, so the centering worked.

books          1.287859e-16
tv_shows      -3.434290e-16
video_games   -3.730349e-16
dtype: float64

In [None]:
# Model

In [13]:
from sklearn.decomposition import PCA

In [14]:
# Fit a PCA that keeps the top 2 principal components.
# NOTE: scikit-learn's PCA centers its input internally before the SVD, so
# passing the manually centered frame is not required (it is harmless here).
pca = PCA(n_components=2)
pca.fit(df_centered)

In [16]:
pca.explained_variance_ratio_ # First component captures ~88.2% of the variance, the second ~8.6% (~96.8% combined).

array([0.88175186, 0.08603611])

In [17]:
# What happens if n_components equals the number of columns (3)?
pca3 = PCA(n_components=3)
pca3.fit(df_centered)

In [18]:
pca3.explained_variance_ratio_ # The first two ratios match the 2-component fit; the third component accounts for the remaining ~3.2%.

array([0.88175186, 0.08603611, 0.03221203])

In [19]:
sum(pca3.explained_variance_ratio_) # With all 3 components kept, the ratios sum to 1.0: 100% of the variance is captured.

1.0