### Data import and preparation

In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
# Read the red-wine quality table from a single CSV file into a DataFrame.
wine = pd.read_csv(filepath_or_buffer='winequality-red(1).csv')

In [3]:
# Fit a PCA with the default settings (no limit on the number of components).
# NOTE(review): `wine` is passed in without any scaling step, so columns with
# large numeric ranges may dominate the leading components -- confirm this is
# intended.
pca = PCA().fit(wine)

# Only the first row of `components_` (the loadings of the first PC) is shown.
print("Principal Components PC:", pca.components_[:1], sep="\n")
print(
    "Percent of Variance each PC accounts for:",
    pca.explained_variance_ratio_,
    sep="\n",
)





Principal Components PC:
[[-6.13296554e-03  3.84670318e-04  1.70762384e-04  8.64864277e-03
   6.37476516e-05  2.18852809e-01  9.75669835e-01  3.72590009e-06
  -2.67974074e-04  2.23244233e-04 -6.35985376e-03 -4.31953676e-03]]
Percent of Variance each PC accounts for:
[9.46079514e-01 4.83483474e-02 2.59599087e-03 1.52288677e-03
 1.04910404e-03 3.34987986e-04 3.10082440e-05 1.93440109e-05
 9.43889374e-06 8.17881393e-06 1.19899823e-06 4.68306222e-10]


In [75]:
# see https://stackoverflow.com/questions/23294616/how-to-use-scikit-learn-pca-for-features-reduction-and-know-which-features-are-d
# Refit, keeping only four principal components. Each printed row holds one
# component's loadings on the input columns.
pca = PCA(n_components=4).fit(wine)
print(pca.components_)

[[-6.13296554e-03  3.84670318e-04  1.70762384e-04  8.64864277e-03
   6.37476516e-05  2.18852809e-01  9.75669835e-01  3.72590009e-06
  -2.67974074e-04  2.23244233e-04 -6.35985376e-03 -4.31953676e-03]
 [-2.38646792e-02 -2.02021707e-03 -3.02675912e-03  1.11453593e-02
  -2.37525597e-04  9.75212313e-01 -2.18850408e-01 -2.50439091e-05
   3.26939011e-03  6.25945868e-04  1.46377527e-02  1.15350784e-02]
 [ 9.51200639e-01 -2.62402333e-02  7.42538455e-02  2.81876995e-01
   2.80220910e-03  1.99910682e-02 -9.82959404e-04  7.64213504e-04
  -5.84500955e-02  1.82329377e-02 -3.65813011e-02  6.37528799e-02]
 [-2.69339189e-01  4.60102384e-03 -8.47513677e-03  9.31286622e-01
  -1.05332218e-03 -2.23601147e-02 -3.08989163e-03  1.69393411e-05
   2.11467777e-02 -5.50574423e-03  2.31642587e-01  7.36356689e-02]]


In [72]:
# Take the absolute value of each component's loadings, then standardize them
# (zero mean, unit variance within each component) so that the four components
# can be compared on a common scale.
# `scale` is imported explicitly from sklearn.preprocessing: a bare
# `import sklearn` does not guarantee that the submodule attribute exists.
# Slicing to the first four rows keeps the four names below valid even if
# `pca` holds more components.
comp1, comp2, comp3, comp4 = (
    scale(np.absolute(loadings)) for loadings in pca.components_[:4]
)

# One row per component, one column per input feature. `describe()` then
# summarizes every feature's standardized loading across the four components;
# the count and quartile rows are dropped to keep the overview compact.
df = pd.DataFrame([comp1, comp2, comp3, comp4])
print(df.describe().drop(['count', '25%', '50%', '75%']).round(2))

        0     1     2     3     4     5     6     7     8     9     10    11
mean  0.77 -0.41 -0.36  0.75 -0.44  0.71  0.67 -0.44 -0.36 -0.42 -0.16 -0.30
std   1.66  0.05  0.11  1.63  0.06  1.73  1.77  0.07  0.07  0.05  0.37  0.07
min  -0.35 -0.49 -0.48 -0.35 -0.51 -0.42 -0.50 -0.51 -0.43 -0.49 -0.35 -0.36
max   3.18 -0.38 -0.21  3.11 -0.38  3.24  3.23 -0.38 -0.27 -0.38  0.39 -0.22


It can therefore be argued that the features 4, 7 and 10 can be dropped (indices begin at 0). A look at the head of the data shows us which features those represent:

In [76]:
# Display only the first row: the column headers show which feature sits at
# each zero-based position.
wine.head(n=1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [77]:
# Magnitudes of the loadings of every retained component (signs dropped).
print(np.absolute(pca.components_))

[[6.13296554e-03 3.84670318e-04 1.70762384e-04 8.64864277e-03
  6.37476516e-05 2.18852809e-01 9.75669835e-01 3.72590009e-06
  2.67974074e-04 2.23244233e-04 6.35985376e-03 4.31953676e-03]
 [2.38646792e-02 2.02021707e-03 3.02675912e-03 1.11453593e-02
  2.37525597e-04 9.75212313e-01 2.18850408e-01 2.50439091e-05
  3.26939011e-03 6.25945868e-04 1.46377527e-02 1.15350784e-02]
 [9.51200639e-01 2.62402333e-02 7.42538455e-02 2.81876995e-01
  2.80220910e-03 1.99910682e-02 9.82959404e-04 7.64213504e-04
  5.84500955e-02 1.82329377e-02 3.65813011e-02 6.37528799e-02]
 [2.69339189e-01 4.60102384e-03 8.47513677e-03 9.31286622e-01
  1.05332218e-03 2.23601147e-02 3.08989163e-03 1.69393411e-05
  2.11467777e-02 5.50574423e-03 2.31642587e-01 7.36356689e-02]]


In [78]:
from sklearn.decomposition import PCA
# Fit a four-component PCA and project the data onto it in one step, then wrap
# the resulting scores in a DataFrame with one labelled column per component.
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(wine)
component_labels = ['principal component {}'.format(k) for k in (1, 2, 3, 4)]
principalDf = pd.DataFrame(principalComponents, columns=component_labels)

In [79]:
# Only the component scores go into the final frame, so this is simply an
# independent copy of `principalDf`.
finalDf = principalDf.copy()

In [80]:
# Preview the first five rows of the projected data.
finalDf.head(n=5)

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4
0,-13.222027,-2.031922,-1.181235,-0.475642
1,22.040255,4.401791,-0.354991,-0.260239
2,7.165362,-2.508321,-0.624638,-0.275306
3,13.428369,-1.946032,2.65026,-1.561525
4,-13.222027,-2.031922,-1.181235,-0.475642
