In [1]:
# loading packages

import os

import pandas as pd
import numpy as np
from numpy import linalg as LA

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs

# PCA algorithm from scikit-learn
from sklearn.decomposition import PCA

# Load raw data

In [2]:
# load raw data
DATA_FOLDER = './'
raw = pd.read_csv(os.path.join(DATA_FOLDER, 'countryriskdata.csv'))

# check the raw data
print("Size of the dataset (row, col): ", raw.shape)
print("\nFirst 5 rows\n", raw.head(n=5))

Size of the dataset (row, col):  (122, 6)

First 5 rows
      Country Abbrev  Corruption  Peace  Legal  GDP Growth
0    Albania     AL          39  1.867  3.822       3.403
1    Algeria     DZ          34  2.213  4.160       4.202
2  Argentina     AR          36  1.957  4.568      -2.298
3    Armenia     AM          33  2.218  4.126       0.208
4  Australia     AU          79  1.465  8.244       2.471


# Simple exploratory analysis

# Print summary statistics

In [3]:
# print summary statistics
print("\nSummary statistics\n", raw.describe())


Summary statistics
        Corruption       Peace       Legal  GDP Growth
count  122.000000  122.000000  122.000000  122.000000
mean    46.237705    2.003730    5.598861    2.372566
std     19.126397    0.447826    1.487328    3.241424
min     14.000000    1.192000    2.728000  -18.000000
25%     31.250000    1.684750    4.571750    1.432250
50%     40.000000    1.969000    5.274000    2.496000
75%     58.750000    2.280500    6.476750    4.080000
max     90.000000    3.399000    8.633000    7.958000


# Scaled Feature Values

In [4]:
X = raw[['Corruption','Peace', 'Legal', 'GDP Growth']]
X = (X - X.mean()) / X.std()
print(X.head(5))

   Corruption     Peace     Legal  GDP Growth
0   -0.378414 -0.305319 -1.194666    0.317896
1   -0.639833  0.467304 -0.967413    0.564392
2   -0.535266 -0.104348 -0.693096   -1.440899
3   -0.692117  0.478469 -0.990273   -0.667782
4    1.712936 -1.202990  1.778450    0.030368


# Correlation matrix

In [5]:
a = X.corr()
print("\nCorrelation matrix\n", a)


Correlation matrix
             Corruption     Peace     Legal  GDP Growth
Corruption    1.000000 -0.700477  0.923589    0.102513
Peace        -0.700477  1.000000 -0.651961   -0.199855
Legal         0.923589 -0.651961  1.000000    0.123440
GDP Growth    0.102513 -0.199855  0.123440    1.000000


# Eigenvectors

In [6]:
w, v = LA.eig([a["Corruption"],a["Peace"],a["Legal"],a["GDP Growth"]])
print("\nEigenvectors\n", v)


Eigenvectors
 [[ 0.59408948 -0.15518449 -0.73343125  0.29164029]
 [-0.53042352 -0.03987563 -0.08624644  0.84239087]
 [ 0.58502332 -0.13456211  0.6737212   0.43097622]
 [ 0.15315448  0.9778654  -0.02720102  0.13993949]]


# Eigenvalues

In [7]:
b = np.diag(w)
print("\nEigenvalues\n", b)


Eigenvalues
 [[2.56133208 0.         0.         0.        ]
 [0.         0.97489488 0.         0.        ]
 [0.         0.         0.07303268 0.        ]
 [0.         0.         0.         0.39074036]]


# Variance of factor score

In [8]:
c=w
d,e,f,g = c[0],c[1],c[2],c[3]
print("Variance of factor score for",'Corruption',"is:",d)
print("Variance of factor score for",'Peace',"is:",e)
print("Variance of factor score for",'Legal',"is:",f)
print("Variance of factor score for",'GDP',"is:",g)

Variance of factor score for Corruption is: 2.5613320842474656
Variance of factor score for Peace is: 0.9748948786370555
Variance of factor score for Legal is: 0.07303267581086802
Variance of factor score for GDP is: 0.3907403613046116


# Percent of Variance

In [9]:
h, i, j, k = d/len(c), e/len(c), f/len(c), g/len(c)
print('Corruption', "alone accounts for",'{:.2%}'.format(h),"of the variance")
print('Peace',"alone accounts for",'{:.2%}'.format(i),"of the variance")
print('Legal', "alone accounts for",'{:.2%}'.format(j),"of the variance")
print('GDP', "alone accounts for",'{:.2%}'.format(k),"of the variance")

Corruption alone accounts for 64.03% of the variance
Peace alone accounts for 24.37% of the variance
Legal alone accounts for 1.83% of the variance
GDP alone accounts for 9.77% of the variance


In [10]:
l = float(h)+float(i)
m = float(h)+float(i)+float(j)
n = float(h)+float(i)+float(j)+float(k)
print('Corruption',"accounts for",'{:.2%}'.format(h),"of the variance")
print('Corruption',"and",'Peace',"together account for over",'{:.2%}'.format(l),"of the variance")
print('Corruption,','Peace',"and",'Legal',"together account for over",'{:.2%}'.format(m),"of the variance")
print('Corruption,','Peace,','Legal',"and",'GDP',"together account for exactly",'{:.2%}'.format(n),"of the variance")

Corruption accounts for 64.03% of the variance
Corruption and Peace together account for over 88.41% of the variance
Corruption, Peace and Legal together account for over 90.23% of the variance
Corruption, Peace, Legal and GDP together account for exactly 100.00% of the variance


# The fast way to do PCA

In [11]:
pca = PCA(n_components=4)
pca.fit(X)
PCA(n_components=4)
print('Corruption','Peace', 'Legal', 'GDP Growth')
print(pca.explained_variance_ratio_)

Corruption Peace Legal GDP Growth
[0.64033302 0.24372372 0.09768509 0.01825817]
