Principal Component Alanysis (PCA)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# best regularization
# def mean_normalize(features):
#     return (features - features.mean(axis = 0)) / np.std(features, axis = 0) - 0.5
def mean_normalize(features):
    return (features - features.mean(axis = 0)) / np.std(features, axis = 0)

In [None]:
class PrincipalComponentAnalysis:

    def __init__(self, features, count_unit_vectors):
        self.features = mean_normalize(features)
        self.count_unit_vectors = count_unit_vectors

    def fit(self):
        n, m = features.shape
        sigma = np.cov(self.features)

        U, S, V = np.linalg.svd(sigma)

        self.unit_vectors = U[:, :self.count_unit_vectors].T @ self.features # z
        self.approximation = (self.unit_vectors @ self.features.T).T
        
        self.cost = 1 - np.sum(S[:self.count_unit_vectors], axis = None) / np.sum(S, axis = None)

    def get_unit_vectors(self):
        return self.unit_vectors
    
    def get_approximation(self):
        return self.approximation

    def get_cost(self):
        return self.cost

In [None]:
df = pd.read_csv('HW_Week8-PCA_REAL_data.csv')
pd.set_option('display.max_rows', None)

In [None]:
df = df[df['Population in thousands (2017)'] > 5000] # taking out the trash
df = df[df['country'] != 'Venezuela (Bolivarian Republic of)']
df = df[df['country'] != 'Sudan']
df = df.drop(columns = ['Net Official Development Assist. received (% of GNI)'])
df = df.reset_index(drop=True)
df

Unnamed: 0,country,Region,Surface area (km2),Population in thousands (2017),"Population density (per km2, 2017)","Sex ratio (m per 100 f, 2017)",GDP: Gross domestic product (million current US$),"GDP growth rate (annual %, const. 2005 prices)",GDP per capita (current US$),Economy: Agriculture (% of GVA),Economy: Industry (% of GVA),Economy: Services and other activity (% of GVA),Employment: Agriculture (% of employed),Employment: Industry (% of employed),Employment: Services (% of employed),Unemployment (% of labour force),Labour force participation (female/male pop. %),Agricultural production index (2004-2006=100),Food production index (2004-2006=100),International trade: Exports (million US$),International trade: Imports (million US$),International trade: Balance (million US$),"Balance of payments, current account (million US$)",Population growth rate (average annual %),Urban population (% of total population),Urban population growth rate (average annual %),"Fertility rate, total (live births per woman)","Life expectancy at birth (females/males, years)","Population age distribution (0-14 / 60+ years, %)",International migrant stock (000/% of total pop.),Refugees and others of concern to UNHCR (in thousands),Infant mortality rate (per 1000 live births,Health: Total expenditure (% of GDP),Health: Physicians (per 1000 pop.),Education: Government expenditure (% of GDP),Education: Primary gross enrol. ratio (f/m per 100 pop.),Education: Secondary gross enrol. ratio (f/m per 100 pop.),Education: Tertiary gross enrol. ratio (f/m per 100 pop.),Seats held by women in national parliaments %,Mobile-cellular subscriptions (per 100 inhabitants),Mobile-cellular subscriptions (per 100 inhabitants).1,Individuals using the Internet (per 100 inhabitants),Threatened species (number),Forested area (% of land area),CO2 emission estimates (million tons/tons per capita),"Energy production, primary (Petajoules)",Energy supply per capita (Gigajoules),"Pop. using improved drinking water (urban/rural, %)","Pop. using improved sanitation facilities (urban/rural, %)"
0,Afghanistan,SouthernAsia,652864,35530,54.4,106.3,20270,-2.4,623.2,23.3,23.3,53.3,61.6,10.0,28.5,8.6,19.3/83.6,125,125,1458,3568,-2110,-5121,3.2,26.7,4.0,5.3,63.5/61.0,43.2/4.1,382.4/1.2,1513.1,68.6,8.2,0.3,3.3,91.1/131.6,39.7/70.7,3.7/13.3,27.7,61.6,8.3,42,2.1,9.8/0.3,63,5,78.2/47.0,45.1/27.0,21.43
1,Algeria,NorthernAfrica,2381741,41318,17.3,102.0,164779,3.8,4154.1,12.2,37.3,50.5,10.8,34.5,54.7,11.4,17.0/70.7,160,161,29992,47091,-17099,-27229,2.0,70.7,2.8,3.0,76.5/74.1,29.3/9.4,242.4/0.6,99.8,27.7,7.2,...,...,112.7/119.5,101.7/98.1,45.1/28.9,31.6,113.0,38.2,135,0.8,145.4/3.7,5900,55,84.3/81.8,89.8/82.2,0.05
2,Angola,MiddleAfrica,1246700,29784,23.9,96.2,117955,3.0,4714.1,6.8,51.2,42.0,4.2,37.6,58.2,6.6,59.8/77.1,175,176,21011,8790,12221,-10273,3.5,44.1,5.0,6.0,63.0/57.4,46.8/4.0,106.8/0.4,45.7,65.4,3.3,...,...,100.4/156.9,22.7/35.1,8.2/10.4,38.2,60.8,12.4,146,46.5,34.8/1.4,3902,25,75.4/28.2,88.6/22.5,0.42
3,Argentina,SouthAmerica,2780400,44271,16.2,95.9,632343,2.4,14564.5,6.0,27.8,66.2,2.0,24.8,73.1,6.5,48.6/74.4,119,119,57733,55610,2124,-15944,1.0,91.8,1.0,2.3,79.8/72.2,24.9/15.4,2086.3/4.8,5.0,13.7,4.8,3.8,5.3,109.8/110.2,110.3/103.4,102.9/63.5,38.9,143.9,69.4,256,10.0,204.0/4.7,3167,85,99.0/100.0,96.2/98.3,0.01
4,Australia,Oceania,7692060,24451,3.2,99.3,1230859,2.4,51352.2,2.5,26.5,71.1,2.7,21.2,76.1,5.5,58.4/70.7,111,111,189630,189406,224,-57746,1.5,89.4,1.5,1.9,84.4/80.2,19.0/21.0,6763.7/28.2,58.2,3.9,9.4,3.4,5.2,102.1/102.3,133.6/141.3,106.3/75.4,28.7,132.8,84.6,948,16.2,361.3/15.3,15282,222,100.0/100.0,100.0/100.0,-99
5,Austria,WesternEurope,83871,8736,106.0,96.2,376967,1.0,44117.7,1.3,28.3,70.4,4.7,25.6,69.7,6.2,54.6/65.8,108,108,145503,149299,-3795,7020,0.6,66.0,0.4,1.4,83.5/78.4,14.1/25.1,1492.4/17.5,166.4,3.3,11.2,5.2,5.5,102.2/103.7,97.6/102.4,89.2/74.3,30.6,157.4,83.9,118,46.9,58.7/6.9,505,158,100.0/100.0,100.0/100.0,-99
6,Azerbaijan,WesternAsia,86600,9828,118.9,99.3,53049,0.7,5438.7,6.7,49.9,43.4,36.7,14.2,49.1,5.2,62.0/68.8,131,136,9143,8532,611,-222,1.3,54.6,1.6,2.1,74.6/68.6,23.3/10.1,264.2/2.7,623.3,31.4,6.0,3.4,2.6,105.6/107.4,-99,27.5/23.6,16.8,111.3,77.0,97,13.5,37.5/3.9,2459,61,94.7/77.8,91.6/86.6,0.14
7,Bangladesh,SouthernAsia,147570,164670,1265.0,101.7,194466,6.6,1207.9,15.5,28.1,56.3,40.6,19.1,40.3,4.0,43.2/81.1,141,140,36031,52624,-16593,2687,1.2,34.3,3.6,2.2,72.9/69.8,28.4/7.3,1422.8/0.9,233.0,33.3,2.8,0.4,1.9,125.1/116.0,67.4/59.8,11.4/15.4,20.3,83.4,14.4,151,11.0,73.2/0.5,1438,11,86.5/87.0,57.7/62.1,1.24
8,Belarus,EasternEurope,207600,9468,46.7,87.0,54609,-3.9,5750.8,7.5,38.9,53.6,9.6,32.0,58.4,0.5,54.0/67.8,122,122,23414,27464,-4050,-2037,~0.0,76.7,~0.0,1.6,77.7/66.5,16.7/21.3,1082.9/11.4,7.9,3.6,5.7,4.1,4.9,101.3/101.4,106.4/107.8,100.7/75.9,34.5,123.6,62.2,25,42.5,63.5/6.7,155,122,99.9/99.1,94.1/95.2,0.20
9,Belgium,WesternEurope,30528,11429,377.5,97.3,455107,1.5,40277.8,0.7,22.2,77.1,1.2,21.2,77.6,8.3,48.1/58.9,108,107,398033,372713,25321,1936,0.6,97.9,0.5,1.8,83.0/78.0,17.1/24.6,1387.9/12.3,63.8,3.5,10.6,3.0,6.6,104.2/104.2,177.7/156.4,85.4/65.0,38.0,115.7,85.1,37,22.6,93.4/8.3,520,196,100.0/100.0,99.5/99.4,-99


In [None]:
country_names = df['country'].to_numpy()
# print(country_names)

cols_to_remove = []

for col in df.columns:
    try:
        _ = df[col].astype(float)
    except ValueError:
        cols_to_remove.append(col)
        pass

df = df.drop(columns=cols_to_remove)

In [None]:
# choosing normal feautures
new_df = df[['Surface area (km2)', 'Population in thousands (2017)',
             'GDP: Gross domestic product (million current US$)'
             ]].copy()

In [None]:
for col in df.columns:
    df[col] = df[col].astype(float)

In [None]:
m, n = df.shape
print(m, n)
features = df.to_numpy()

116 28


In [None]:
PCA = PrincipalComponentAnalysis(features = features, count_unit_vectors = 4)
PCA.fit()
approx = PCA.get_approximation()

for i in range(len(country_names)):
    print(i, country_names[i], approx[i])

0 Afghanistan [-101.71459663    2.96787345  -26.48023474    3.85139627]
1 Algeria [ -9.96274391 -15.17770149  16.85260237  -4.07793429]
2 Angola [-84.03141866 -14.67415399  20.2723753  -14.45682512]
3 Argentina [ 62.19623165 -17.11850638  11.33812435  -5.82760772]
4 Australia [123.32521799  17.87398528  29.32059989 -12.89327177]
5 Austria [ 94.52078629 -28.44112797  12.6332571   -9.01943644]
6 Azerbaijan [-12.89146629 -16.57341451 -10.34334893 -10.3240356 ]
7 Bangladesh [-48.3702887   -1.83444813  -5.17281185  17.36319376]
8 Belarus [ 38.26535059 -33.21539286  12.93465698 -14.35810371]
9 Belgium [116.5948625  -16.54250417  10.23972196  -4.51288068]
10 Benin [-91.49296419  -9.09485127  -4.5597431   14.96260262]
11 Bolivia (Plurinational State of) [-30.01507152 -21.18798177  17.35287462 -13.80589384]
12 Brazil [55.93068838 25.15120649 50.17161254  1.66074114]
13 Bulgaria [ 45.27174318 -30.21865521   9.09743707   0.12913303]
14 Burkina Faso [-118.43409622    3.90534224  -25.46344974   13.

In [None]:
ind = np.argsort(approx[:, 0]) # by life quality
print(country_names[ind])

['Chad' 'Niger' 'Burundi' 'Somalia' 'Sierra Leone' 'Malawi' 'Ethiopia'
 'Burkina Faso' 'United Republic of Tanzania'
 "Lao People's Democratic Republic" 'Democratic Republic of the Congo'
 'Mozambique' 'Mali' 'Togo' 'Guinea' 'Uganda' 'Afghanistan' 'Rwanda'
 'Cameroon' 'Zambia' 'Eritrea' 'Benin' 'Madagascar' 'Papua New Guinea'
 'Tajikistan' 'Angola' 'Nepal' 'Cambodia' 'Haiti' 'Pakistan' 'Kenya'
 'Zimbabwe' "Democratic People's Republic of Korea" 'Senegal' 'Congo'
 'South Sudan' 'Yemen' 'Myanmar' 'Ghana' 'Bangladesh' 'Uzbekistan'
 'Nigeria' 'Guatemala' 'Iraq' 'India' 'Bolivia (Plurinational State of)'
 'Viet Nam' 'Honduras' 'Paraguay' 'Nicaragua' 'Sri Lanka' 'Kyrgyzstan'
 'Indonesia' 'Egypt' 'Philippines' 'Azerbaijan' 'Turkmenistan' 'Algeria'
 'Morocco' 'Ecuador' 'Syrian Arab Republic' 'Peru' 'Dominican Republic'
 'Libya' 'Thailand' 'El Salvador' 'Cuba' 'Tunisia' 'Romania' 'Colombia'
 'Serbia' 'Iran (Islamic Republic of)' 'Turkey' 'Ukraine' 'South Africa'
 'Jordan' 'Mexico' 'Belarus' 'Ma

In [None]:
ind = np.argsort(approx[:, 3]) # by country power (decreasing)
print(country_names[ind])

['South Sudan' 'United Arab Emirates' 'Saudi Arabia' 'Congo' 'Libya'
 'Norway' 'Finland' 'Canada' 'Sweden' 'Iraq' 'Turkmenistan' 'Ecuador'
 'Russian Federation' 'China' 'Mexico' 'Angola' 'Belarus'
 'Bolivia (Plurinational State of)' 'Australia' 'Republic of Korea' 'Cuba'
 'Democratic Republic of the Congo' 'Serbia' 'Czechia' 'Azerbaijan'
 'Malaysia' 'Germany' 'Slovakia' 'Austria' 'Spain' 'Indonesia' 'Nicaragua'
 'Colombia' 'South Africa' 'Zimbabwe' 'Iran (Islamic Republic of)'
 'Argentina' 'Poland' 'Denmark' 'Viet Nam' 'Switzerland' 'Romania' 'Peru'
 'Belgium' 'Algeria' 'Kazakhstan' 'Syrian Arab Republic' 'Portugal'
 'Italy' 'Chile' 'Tunisia' 'Papua New Guinea' 'Senegal' 'Philippines'
 'United States of America' 'Uganda' 'Netherlands' 'Honduras' 'Myanmar'
 'Bulgaria' 'France' 'Japan' 'Hungary' 'Mozambique' 'Rwanda'
 "Lao People's Democratic Republic" 'United Republic of Tanzania' 'Guinea'
 'Egypt' 'Brazil' 'Cameroon' 'Dominican Republic' 'Afghanistan' 'Paraguay'
 'Chad' 'Thailand' 'Gre

In [None]:
# plt.plot(approx[:, 0], approx[:, 1], 'r.')
# plt.show()

In [None]:
unit_vectors = PCA.get_unit_vectors()
print(unit_vectors)

[[ 2.43370657e+00  3.69244125e-01  3.15496397e+00  8.33040059e-01
   4.08813278e+00 -1.53603969e+00  8.42325457e+00 -2.97271961e-04
   8.22161953e+00 -8.72567158e+00  3.56373005e+00  7.95632379e+00
  -5.29074517e+00 -5.37338488e+00  5.46048914e+00  5.69783903e+00
  -9.58009764e-01  9.00958713e+00 -8.75683718e+00 -9.33165035e+00
   3.12788383e-01 -8.88879525e-01  7.17242013e+00  9.79475879e+00
   3.04650251e-01 -1.01243194e+00  3.04552648e+00  8.37643005e+00]
 [ 5.94686585e+00  7.16821287e+00 -1.20288761e-01 -2.01531831e-01
   8.26289425e+00  5.87147763e-01 -4.50908247e-01  1.75593527e-01
  -1.27461000e+00  8.63462534e-02 -3.79725896e+00 -3.49618548e+00
  -2.36875083e+00 -2.33992264e+00  7.02584686e+00  7.42272386e+00
  -1.50584926e+00 -2.20547220e+00  1.47270419e+00  1.94043912e+00
  -1.31480594e+00 -1.69919473e+00 -2.81599333e+00 -1.69026207e+00
   4.31948697e+00 -2.30442119e+00  8.39122343e+00 -1.81474761e-01]
 [ 3.20547200e+00  2.58773203e+00 -3.85682032e+00 -3.58750211e+00
   3.932

In [None]:
print(PCA.get_cost())

0.39915985381310437


In [None]:
x = np.array([[-2, 2], [-1, 1], [0, 0], [1, -1], [2, -2]])
PCA = PrincipalComponentAnalysis(x, 1)
PCA.fit()
print(PCA.get_unit_vectors())
print(PCA.get_approximation())


[[ 2.23606798 -2.23606798]]
[[-6.32455532]
 [-3.16227766]
 [ 0.        ]
 [ 3.16227766]
 [ 6.32455532]]
