In [1]:
# loading packages

import os

import pandas as pd
import numpy as np
from numpy import linalg as LA

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs

# PCA algorithm from scikit-learn
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Load raw data

In [2]:
# load raw data
raw = pd.read_csv('alldata.csv')

# check the raw data
print(raw.shape)

(29755, 11)


In [3]:
raw.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,RESULT
0,0,1,0,1,7,13000,0,10,1,34,0
1,1,0,0,0,7,19000,0,1,1,828,0
2,0,0,0,1,7,0,0,9,1,259,1
3,0,0,1,1,3,0,0,5,1,43259,0
4,0,0,0,1,7,10000,0,4,1,47,0


In [4]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29755 entries, 0 to 29754
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       29755 non-null  int64
 1   B       29755 non-null  int64
 2   C       29755 non-null  int64
 3   D       29755 non-null  int64
 4   E       29755 non-null  int64
 5   F       29755 non-null  int64
 6   G       29755 non-null  int64
 7   H       29755 non-null  int64
 8   I       29755 non-null  int64
 9   J       29755 non-null  int64
 10  RESULT  29755 non-null  int64
dtypes: int64(11)
memory usage: 2.5 MB


In [5]:
raw.drop(['RESULT'], axis=1, inplace=True)

# Simple exploratory analysis

# Print summary statistics

In [6]:
# print summary statistics
raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,29755.0,0.042884,0.202598,0.0,0.0,0.0,0.0,1.0
B,29755.0,0.161754,0.368231,0.0,0.0,0.0,0.0,1.0
C,29755.0,0.469703,0.49909,0.0,0.0,0.0,1.0,1.0
D,29755.0,0.680255,0.466385,0.0,0.0,1.0,1.0,1.0
E,29755.0,6.075416,1.763556,1.0,6.0,7.0,7.0,7.0
F,29755.0,4678.214082,24161.156774,0.0,0.0,0.0,7000.0,1488000.0
G,29755.0,0.023962,0.152934,0.0,0.0,0.0,0.0,1.0
H,29755.0,7.159503,12.466791,0.0,1.0,3.0,8.0,374.0
I,29755.0,0.923576,0.26568,0.0,1.0,1.0,1.0,1.0
J,29755.0,17663.41882,21085.402086,0.0,37.0,599.0,43253.0,43260.0


# Normalization Feature Scaling 

In [7]:
X = raw[raw.columns]
X = (X - X.min()) / (X.max() - X.min())
X.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0.0,1.0,0.0,1.0,1.0,0.008737,0.0,0.026738,1.0,0.000786
1,1.0,0.0,0.0,0.0,1.0,0.012769,0.0,0.002674,1.0,0.01914
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.024064,1.0,0.005987
3,0.0,0.0,1.0,1.0,0.333333,0.0,0.0,0.013369,1.0,0.999977
4,0.0,0.0,0.0,1.0,1.0,0.00672,0.0,0.010695,1.0,0.001086


# Correlation matrix

In [8]:
a = X.corr()
a

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
A,1.0,0.025049,-0.05961,-0.077543,-0.107068,-0.003696,0.038425,-0.037518,-0.029024,0.077597
B,0.025049,1.0,-0.035785,-0.003732,0.029087,0.006799,-0.026457,0.099292,0.072084,-0.036248
C,-0.05961,-0.035785,1.0,-0.537593,0.23766,-0.002221,0.018538,0.202142,0.138925,-0.178639
D,-0.077543,-0.003732,-0.537593,1.0,-0.135887,0.010801,0.019309,0.044356,-0.053461,0.059704
E,-0.107068,0.029087,0.23766,-0.135887,1.0,0.051941,-0.026514,0.203232,0.283445,-0.330355
F,-0.003696,0.006799,-0.002221,0.010801,0.051941,1.0,-0.006523,0.014606,0.038609,-0.074969
G,0.038425,-0.026457,0.018538,0.019309,-0.026514,-0.006523,1.0,-0.028358,-0.089755,0.043139
H,-0.037518,0.099292,0.202142,0.044356,0.203232,0.014606,-0.028358,1.0,0.123974,-0.227114
I,-0.029024,0.072084,0.138925,-0.053461,0.283445,0.038609,-0.089755,0.123974,1.0,-0.320382
J,0.077597,-0.036248,-0.178639,0.059704,-0.330355,-0.074969,0.043139,-0.227114,-0.320382,1.0


# Eigenvectors

In [9]:
w, v = LA.eig([a["A"],a["B"],a["C"],a["D"],a["E"],a["F"],a["G"],a["H"],a["I"],a["J"]])
print("\nEigenvectors\n", v)


Eigenvectors
 [[ 1.05587063e-01  1.32007985e-01 -2.07432496e-01 -1.59468666e-01
   8.56263973e-03 -3.00327197e-01  5.35272846e-01 -6.53851266e-01
   2.39758898e-01  2.02914261e-01]
 [-7.00410241e-02  8.78468874e-02  1.96414553e-01 -5.70079571e-02
   7.69729998e-02  4.11633979e-01 -4.80996338e-01 -7.08778679e-01
  -8.85442983e-02 -1.69163928e-01]
 [-4.54601867e-01  6.89359134e-01 -4.84554218e-01  1.78341673e-01
  -4.53366853e-02 -1.03852787e-01 -1.72012851e-01  5.34295794e-02
  -1.27118268e-02 -7.79553462e-02]
 [ 3.15768243e-01  6.44373338e-01  6.45107729e-01 -7.25809217e-03
  -7.96617844e-02 -1.17108113e-01  1.05045891e-01  1.03494909e-01
   8.33372213e-02 -1.41793931e-01]
 [-4.65407975e-01  1.42334132e-02  1.40023091e-01 -7.96510238e-01
  -2.83093577e-01  1.29550907e-01  1.09953603e-01  1.19619960e-01
   6.91219275e-02 -3.24933909e-02]
 [-7.01494382e-02  8.08344065e-05  1.44434737e-01  4.62597222e-02
  -7.30380644e-02 -9.50114553e-02 -4.28564508e-01  4.02998469e-02
   5.72521142e-01 

# Eigenvalues

In [10]:
b = np.diag(w)
print("\nEigenvalues\n", b)


Eigenvalues
 [[2.05670533 0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.3901468  0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         1.36520262 0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.67620121 0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.6469815  0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.87173332
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.92994402 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         1.05253857 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.00849219 0.        ]
 [0.         0.

# Variance of factor score

In [11]:
c=w
d,e,f,g,h,i,j,k,l,m = c[0],c[1],c[2],c[3],c[4],c[5],c[6],c[7],c[8],c[9]
print("Variance of factor score for",'A',"is:",d)
print("Variance of factor score for",'B',"is:",e)
print("Variance of factor score for",'C',"is:",f)
print("Variance of factor score for",'D',"is:",g)
print("Variance of factor score for",'E',"is:",h)
print("Variance of factor score for",'F',"is:",i)
print("Variance of factor score for",'G',"is:",j)
print("Variance of factor score for",'H',"is:",k)
print("Variance of factor score for",'I',"is:",l)
print("Variance of factor score for",'J',"is:",m)

Variance of factor score for A is: 2.056705330822235
Variance of factor score for B is: 0.3901468011554924
Variance of factor score for C is: 1.3652026175825926
Variance of factor score for D is: 0.6762012141410673
Variance of factor score for E is: 0.6469815048754679
Variance of factor score for F is: 0.8717333246902719
Variance of factor score for G is: 0.9299440159847557
Variance of factor score for H is: 1.052538566876447
Variance of factor score for I is: 1.00849218910476
Variance of factor score for J is: 1.002054434766907


# Percent of Variance

In [12]:
n,o,p,q,r,s,t,u,v,w = d/len(c),e/len(c),f/len(c),g/len(c),h/len(c),i/len(c),j/len(c),k/len(c),l/len(c),m/len(c)
print('A', "alone accounts for",'{:.2%}'.format(n),"of the variance")
print('B',"alone accounts for",'{:.2%}'.format(o),"of the variance")
print('C', "alone accounts for",'{:.2%}'.format(p),"of the variance")
print('D', "alone accounts for",'{:.2%}'.format(q),"of the variance")
print('E', "alone accounts for",'{:.2%}'.format(r),"of the variance")
print('F',"alone accounts for",'{:.2%}'.format(s),"of the variance")
print('G', "alone accounts for",'{:.2%}'.format(t),"of the variance")
print('H', "alone accounts for",'{:.2%}'.format(u),"of the variance")
print('I',"alone accounts for",'{:.2%}'.format(v),"of the variance")
print('J', "alone accounts for",'{:.2%}'.format(w),"of the variance")

A alone accounts for 20.57% of the variance
B alone accounts for 3.90% of the variance
C alone accounts for 13.65% of the variance
D alone accounts for 6.76% of the variance
E alone accounts for 6.47% of the variance
F alone accounts for 8.72% of the variance
G alone accounts for 9.30% of the variance
H alone accounts for 10.53% of the variance
I alone accounts for 10.08% of the variance
J alone accounts for 10.02% of the variance


In [13]:
a1 = float(n)
a2 = float(n)+float(o)
a3 = float(n)+float(o)+float(p)
a4 = float(n)+float(o)+float(p)+float(q)
a5 = float(n)+float(o)+float(p)+float(q)+float(r)
a6 = float(n)+float(o)+float(p)+float(q)+float(r)+float(s)
a7 = float(n)+float(o)+float(p)+float(q)+float(r)+float(s)+float(t)
a8 = float(n)+float(o)+float(p)+float(q)+float(r)+float(s)+float(t)+float(u)
a9 = float(n)+float(o)+float(p)+float(q)+float(r)+float(s)+float(t)+float(u)+float(v)
a10 = float(n)+float(o)+float(p)+float(q)+float(r)+float(s)+float(t)+float(u)+float(v)+float(w)
print('A',"accounts for",'{:.2%}'.format(a1),"of the variance")
print('A',"and",'B',"together account for over",'{:.2%}'.format(a2),"of the variance")
print('A','B',"and",'C',"together account for over",'{:.2%}'.format(a3),"of the variance")
print('A','B','C',"and",'D',"together account for exactly",'{:.2%}'.format(a4),"of the variance")
print('A','B','C','D',"and",'E',"together account for exactly",'{:.2%}'.format(a5),"of the variance")
print('A','B','C','D','E',"and",'F',"together account for exactly",'{:.2%}'.format(a6),"of the variance")
print('A','B','C','D,','E','F',"and",'G',"together account for exactly",'{:.2%}'.format(a7),"of the variance")
print('A','B','C','D','E','F','G',"and",'H',"together account for exactly",'{:.2%}'.format(a8),"of the variance")
print('A','B','C','D','E','F','G','H',"and",'I',"together account for exactly",'{:.2%}'.format(a9),"of the variance")
print('A','B','C','D','E','F','G','H','I',"and",'J',"together account for exactly",'{:.2%}'.format(a10),"of the variance")

A accounts for 20.57% of the variance
A and B together account for over 24.47% of the variance
A B and C together account for over 38.12% of the variance
A B C and D together account for exactly 44.88% of the variance
A B C D and E together account for exactly 51.35% of the variance
A B C D E and F together account for exactly 60.07% of the variance
A B C D, E F and G together account for exactly 69.37% of the variance
A B C D E F G and H together account for exactly 79.89% of the variance
A B C D E F G H and I together account for exactly 89.98% of the variance
A B C D E F G H I and J together account for exactly 100.00% of the variance


# The fast way to do PCA

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X1 = scaler.fit_transform(raw)
X_scaled = pd.DataFrame(X1, columns = [raw.columns])
X_scaled.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0.0,1.0,0.0,1.0,1.0,0.008737,0.0,0.026738,1.0,0.000786
1,1.0,0.0,0.0,0.0,1.0,0.012769,0.0,0.002674,1.0,0.01914
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.024064,1.0,0.005987
3,0.0,0.0,1.0,1.0,0.333333,0.0,0.0,0.013369,1.0,0.999977
4,0.0,0.0,0.0,1.0,1.0,0.00672,0.0,0.010695,1.0,0.001086


In [15]:
pca = PCA(n_components=10)
pca.fit(X_scaled)
rc = raw.columns
rp = pca.explained_variance_ratio_
rdata = {'Feature':rc, 'Variance Ratio':rp} 
df = pd.DataFrame(rdata)
df

Unnamed: 0,Feature,Variance Ratio
0,A,0.362737
1,B,0.228788
2,C,0.128536
3,D,0.098772
4,E,0.070212
5,F,0.051495
6,G,0.03675
7,H,0.021557
8,I,0.000907
9,J,0.000246
