In [1]:
# Read the textfile ./data/olympics.csv into a DataFrame using the first 
# column as index. The data lists the individual performances of 33 male 
# athlets during various competitions of the 1988 Olympic summer games.

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

OLYMP = "01_Data/olympics.csv"
olympics = pd.read_csv(OLYMP, sep=",")
olympics = olympics.set_index("id")

In [2]:
# Print summary statistics for each of the variables.

olympics.describe()

Unnamed: 0,100,long,poid,haut,400,110,disq,perc,jave,1500,score
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,11.196364,7.133333,13.976364,1.982727,49.276667,15.048788,42.353939,4.739394,59.438788,276.038485,7856.909091
std,0.243321,0.30434,1.331991,0.093984,1.06966,0.506765,3.719131,0.334421,5.495998,13.657098,415.069449
min,10.62,6.22,10.27,1.79,47.44,14.18,34.36,4.0,49.52,256.64,6907.0
25%,11.02,7.0,13.15,1.94,48.34,14.72,39.08,4.6,55.42,266.42,7579.0
50%,11.18,7.09,14.12,1.97,49.15,15.0,42.32,4.7,59.48,272.06,7869.0
75%,11.43,7.37,14.97,2.03,49.98,15.38,44.8,4.9,64.0,286.04,8180.0
max,11.57,7.72,16.6,2.27,51.28,16.2,50.66,5.7,72.6,303.17,8488.0


In [4]:
# Scale the data such that all variables have unit variance.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
olympics_scaled = scaler.fit_transform(olympics)

print(olympics_scaled)

[[ 2.23852371e-01  9.89900415e-01  1.14636694e+00  3.10401192e+00
  -3.57596564e-01  1.62740646e-01  1.89115343e+00 -1.19624023e-01
   3.47594704e-01 -5.27080543e-01  1.54402076e+00]
 [-1.36208476e+00  1.05663527e+00  7.57544779e-01 -1.37519515e-01
  -1.48734854e+00 -1.17986969e+00  5.47752700e-01  1.09501990e+00
   4.28894239e-01 -2.24446362e-01  1.32627425e+00]
 [-6.82939435e-02  1.02326784e+00  1.70499557e-01 -1.37519515e-01
  -9.36713125e-01 -4.78506080e-01  3.56618450e-01  1.39868088e+00
   8.72346245e-01 -9.54634977e-01  1.15256635e+00]
 [-2.40546446e+00  8.23063266e-01  7.95664598e-01  5.10786772e-01
  -2.05697139e-01 -6.58856722e-01  6.67894229e-01  4.87697940e-01
   8.50173645e-01  6.74533308e-01  1.09874137e+00]
 [-7.36056947e-01  9.89900415e-01 -8.05367825e-01 -1.37519515e-01
  -1.74367882e+00 -1.30010345e+00 -3.15081916e-01  1.39868088e+00
  -3.65623940e-01 -1.44241882e+00  1.04980957e+00]
 [-1.52902551e+00  1.95755588e+00 -3.02186206e-01  1.48324620e+00
  -8.89244554e-01 -

In [None]:
# Plain PCA model

In [14]:
# Fit a plain vanilla PCA model. Store the components in a DataFrame to 
# display the loadings of each variable. 

from sklearn.decomposition import PCA

pca = PCA()
pca.fit(olympics_scaled)

components = pd.DataFrame(pca.components_, columns=[olympics.columns])
components.index = components.index + 1
components

Unnamed: 0,100,long,poid,haut,400,110,disq,perc,jave,1500,score
1,0.346064,-0.337322,-0.270135,-0.202754,0.287417,0.368449,-0.190384,-0.347249,-0.19373,0.134657,-0.470656
2,0.192924,-0.191686,0.452324,0.008518,0.387101,0.115769,0.481851,0.108408,0.352748,0.433087,0.046789
3,0.314928,0.15211,-0.115401,0.769805,0.195467,0.211871,-0.070436,-0.180399,0.236528,-0.281668,0.126023
4,0.068789,0.198465,0.106801,-0.508857,-0.097951,0.377425,-0.027672,-0.133913,0.555093,-0.451356,0.05427
5,-0.445656,0.406549,-0.0004,0.023586,0.147905,-0.100368,0.012758,-0.686312,0.135721,0.328518,-0.077916
6,-0.015896,0.10214,-0.22193,-0.062917,0.317502,-0.20872,-0.607363,0.382803,0.442638,0.280878,-0.038501
7,-0.257611,-0.752789,0.100308,0.124329,-0.130074,-0.272074,-0.156563,-0.283873,0.339357,-0.171121,0.030685
8,-0.663259,-0.141693,-0.074276,0.153392,0.148502,0.63984,-0.010675,0.275171,-0.060114,-0.005217,0.006552
9,-0.108243,0.046021,0.422144,-0.10245,0.65104,-0.207127,-0.167463,-0.01801,-0.306483,-0.456516,0.001266
10,0.103916,0.063011,0.659961,0.13107,-0.34474,0.252511,-0.52555,-0.054293,-0.120598,0.232125,-0.048154


In [17]:
# Which variables load most prominently on the first component? Which ones 
# on the second? Which ones on the third? How would you thus interpret 
# those components?

components.max(axis=1)

# Variables that load most prominently: 
    # first component: 110
    # second: disq
    # third: haut
    
# sns.heatmap(pca.components_, cmap="viridis")

1     0.368449
2     0.481851
3     0.769805
4     0.555093
5     0.406549
6     0.442638
7     0.339357
8     0.639840
9     0.651040
10    0.659961
11    0.864065
dtype: float64

In [10]:
# How many components do you need to explain at least 90% of the data? 
# (Hint: use np.cumsum() for this.)

var = pd.DataFrame(pca.explained_variance_ratio_, 
                   columns=["Explained Variance"])
var.index = var.index + 1 
var["Cum. explained variance"] = var["Explained Variance"].cumsum()
var # to explain at leat 90% of the data, you need 7 components.

Unnamed: 0,Explained Variance,Cum. explained variance
1,0.39727,0.39727
2,0.238064,0.635334
3,0.087554,0.722888
4,0.080244,0.803132
5,0.051033,0.854165
6,0.044755,0.898921
7,0.039199,0.938119
8,0.027892,0.966012
9,0.024268,0.99028
10,0.009287,0.999567
