# PCA
More information about principal component analysis (PCA) and its use for building composite indices can be found in the following references:

https://www.sartorius.com/en/knowledge/science-snippets/what-is-principal-component-analysis-pca-and-how-it-is-used-507186

https://unstats.un.org/unsd/hhsurveys/finalpublication/ch18fin3.pdf

https://www.researchgate.net/publication/321824664_Principal_Component_Analysis_PCA_based_Indexing

https://www.researchgate.net/publication/323357069_Extent_of_Vulnerability_in_Wheat_Producing_Agro-Ecologies_of_India_Tracking_from_Indicators_of_Cross-Section_and_Multi-Dimension_Data

https://www.scielo.br/j/rbrh/a/DbyBxXCZkZyp3RcQxbNTZ5t/?lang=en#

https://academic.oup.com/heapol/article/21/6/459/612115





In [1]:
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
# Load the indicator table from the working directory.
# NOTE(review): the file name suggests the indicators were already scaled to [0, 1] upstream — confirm.
indicators_df = pd.read_csv("indicators_normalized.csv")

In [3]:
# Preview the first five rows to check that the file loaded as expected.
indicators_df.head()

Unnamed: 0.1,Unnamed: 0,persons_04_65,females,persons_km2,non_australian,arrived_within_5,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,hospital_beds_1000,copd_hosp,asthma_hosp,inf_pan_hosp,protected_areas,district
0,0,0.573529,0.318102,0.386804,0.494318,0.489373,0.415423,0.360042,0.375,0.575472,0.497881,0.584763,0.376071,0.655172,0.437005,0.225,0.274464,0.084922,Bayside
1,1,0.463235,0.351494,0.167463,0.295455,0.267269,0.587065,0.202393,0.555556,0.578616,0.351695,0.059802,0.79276,0.206897,1.0,0.737857,0.707992,0.137899,Blacktown
2,2,1.0,0.599297,0.003754,0.028409,0.022848,0.599502,0.001041,0.194444,0.496855,0.050847,0.015991,0.500414,0.089655,0.387876,0.738571,0.584016,1.0,Blue Mountains
3,3,0.448529,0.502636,0.617657,0.761364,0.856004,0.211443,0.651405,0.597222,0.742138,0.572034,0.554342,0.327162,0.813793,0.200872,0.123571,0.165302,0.0,Burwood
4,4,0.433824,0.504394,0.05264,0.042614,0.044633,0.783582,0.042144,0.111111,0.342767,0.101695,0.0,0.644101,0.068966,0.625594,0.459286,0.661209,0.022479,Camden


In [4]:
# Add inverted copies (1 - value) of the two indicators below as new "neg_" columns.
# NOTE(review): presumably done so that a higher value points in the same direction
# as the other indicators — confirm. The inversion relies on the inputs lying in [0, 1].
indicators_df = indicators_df.assign(
    neg_hospital_beds=lambda frame: 1 - frame["hospital_beds_1000"],
    neg_protected_areas=lambda frame: 1 - frame["protected_areas"],
)


In [5]:
# Preview again to check the two new neg_* columns at the end of the table.
indicators_df.head()

Unnamed: 0.1,Unnamed: 0,persons_04_65,females,persons_km2,non_australian,arrived_within_5,year_11_under,non_proficient,unemployment_rate,below_poverty_line,...,multi_unit_housing,premature_deaths,hospital_beds_1000,copd_hosp,asthma_hosp,inf_pan_hosp,protected_areas,district,neg_hospital_beds,neg_protected_areas
0,0,0.573529,0.318102,0.386804,0.494318,0.489373,0.415423,0.360042,0.375,0.575472,...,0.584763,0.376071,0.655172,0.437005,0.225,0.274464,0.084922,Bayside,0.344828,0.915078
1,1,0.463235,0.351494,0.167463,0.295455,0.267269,0.587065,0.202393,0.555556,0.578616,...,0.059802,0.79276,0.206897,1.0,0.737857,0.707992,0.137899,Blacktown,0.793103,0.862101
2,2,1.0,0.599297,0.003754,0.028409,0.022848,0.599502,0.001041,0.194444,0.496855,...,0.015991,0.500414,0.089655,0.387876,0.738571,0.584016,1.0,Blue Mountains,0.910345,0.0
3,3,0.448529,0.502636,0.617657,0.761364,0.856004,0.211443,0.651405,0.597222,0.742138,...,0.554342,0.327162,0.813793,0.200872,0.123571,0.165302,0.0,Burwood,0.186207,1.0
4,4,0.433824,0.504394,0.05264,0.042614,0.044633,0.783582,0.042144,0.111111,0.342767,...,0.0,0.644101,0.068966,0.625594,0.459286,0.661209,0.022479,Camden,0.931034,0.977521


In [6]:
# Remove the columns that must not enter the PCA.
indicators_df = indicators_df.drop(
    columns=[
        "Unnamed: 0",          # presumably an index column left over from an earlier CSV export
        "arrived_within_5",    # excluded from the analysis (reason not recorded in this notebook)
        "hospital_beds_1000",  # superseded by neg_hospital_beds
        "protected_areas",     # superseded by neg_protected_areas
    ]
)

In [7]:
# List the column names to verify which indicators remain after the drop.
indicators_df.columns

Index(['persons_04_65', 'females', 'persons_km2', 'non_australian',
       'year_11_under', 'non_proficient', 'unemployment_rate',
       'below_poverty_line', 'renters', 'multi_unit_housing',
       'premature_deaths', 'copd_hosp', 'asthma_hosp', 'inf_pan_hosp',
       'district', 'neg_hospital_beds', 'neg_protected_areas'],
      dtype='object')

In [8]:
# Preview the cleaned table that feeds the PCA.
indicators_df.head()

Unnamed: 0,persons_04_65,females,persons_km2,non_australian,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,district,neg_hospital_beds,neg_protected_areas
0,0.573529,0.318102,0.386804,0.494318,0.415423,0.360042,0.375,0.575472,0.497881,0.584763,0.376071,0.437005,0.225,0.274464,Bayside,0.344828,0.915078
1,0.463235,0.351494,0.167463,0.295455,0.587065,0.202393,0.555556,0.578616,0.351695,0.059802,0.79276,1.0,0.737857,0.707992,Blacktown,0.793103,0.862101
2,1.0,0.599297,0.003754,0.028409,0.599502,0.001041,0.194444,0.496855,0.050847,0.015991,0.500414,0.387876,0.738571,0.584016,Blue Mountains,0.910345,0.0
3,0.448529,0.502636,0.617657,0.761364,0.211443,0.651405,0.597222,0.742138,0.572034,0.554342,0.327162,0.200872,0.123571,0.165302,Burwood,0.186207,1.0
4,0.433824,0.504394,0.05264,0.042614,0.783582,0.042144,0.111111,0.342767,0.101695,0.0,0.644101,0.625594,0.459286,0.661209,Camden,0.931034,0.977521


In [9]:
# PCA API reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

# "district" is a text label, not an indicator, so it is excluded from the PCA input.
data = indicators_df.drop(columns=["district"])

# Keep one component per indicator so the full variance is accounted for.
# The count is derived from the data instead of being hard-coded, so it stays
# correct if indicators are added or removed upstream.
pca = PCA(n_components=data.shape[1])
pca.fit(X=data)

PCA(n_components=16)

In [10]:
# components_ holds one row per principal component and one column per input indicator.
weights = pca.components_

In [11]:
# Label every loading with the indicator it belongs to. The names are taken from
# `data` (the frame the PCA was fitted on) so labels and weights cannot drift apart.
weights_df = pd.DataFrame(weights, columns=data.columns)

In [12]:
# Loadings table: each row is one principal component (row 0 = PC1, row 1 = PC2, etc.)
# and each column is the weight of that indicator within the component.
weights_df

Unnamed: 0,persons_04_65,females,persons_km2,non_australian,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,neg_hospital_beds,neg_protected_areas
0,0.052699,-0.064037,-0.31624,-0.244323,0.383809,-0.037466,0.079417,0.13345,-0.218181,-0.391929,0.168997,0.320244,0.287105,0.267054,0.369551,-0.194808
1,-0.229273,-0.250234,0.174966,0.277466,0.077867,0.344928,0.428667,0.305478,0.287263,0.077098,0.262723,0.262031,0.177567,0.085717,-0.048562,0.32821
2,-0.268035,0.003622,0.251697,0.070982,0.084311,-0.445454,-0.305578,-0.327459,0.233682,0.197875,0.45661,0.314716,-0.068456,0.061057,0.205055,-0.042973
3,0.218433,0.425637,-0.010231,-0.162706,-0.234306,-0.125178,0.022532,-0.205332,0.022293,-0.011281,0.05423,0.000838,0.486324,0.243782,-0.049559,0.573882
4,-0.431438,-0.072792,-0.112595,-0.058816,-0.030067,0.066134,-0.068377,-0.076758,-0.281439,-0.268106,0.037981,-0.195619,-0.359141,-0.115514,0.355121,0.563334
5,-0.037669,-0.515979,-0.091727,-0.14381,0.090649,-0.202835,-0.254129,-0.060476,-0.020546,-0.103085,-0.306018,0.304753,-0.047771,0.21411,-0.517932,0.267775
6,-0.177702,-0.511459,-0.131406,0.162048,-0.465779,-0.110532,-0.000276,-0.123578,-0.059415,0.028413,0.047098,-0.365685,0.417675,0.193983,0.17406,-0.190278
7,-0.360654,0.368964,-0.252354,0.275007,-0.425521,-0.029493,0.168553,-0.028576,-0.040185,-0.19926,-0.14059,0.361737,-0.205245,0.287462,-0.149242,-0.210586
8,0.317387,-0.085258,-0.380326,0.364275,-0.172901,-0.37812,-0.006514,0.228391,0.019753,-0.088493,0.112884,0.210619,0.028417,-0.541426,0.055357,0.165043
9,-0.119729,0.090904,-0.666675,0.124825,0.387531,-0.018003,0.078731,-0.163755,0.185059,0.320707,0.171313,-0.311237,-0.065305,0.147318,-0.204399,0.05276


In [13]:
# Weights (loadings) of the first principal component, PC1 (row 0).
weights_df.head(1)

Unnamed: 0,persons_04_65,females,persons_km2,non_australian,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,neg_hospital_beds,neg_protected_areas
0,0.052699,-0.064037,-0.31624,-0.244323,0.383809,-0.037466,0.079417,0.13345,-0.218181,-0.391929,0.168997,0.320244,0.287105,0.267054,0.369551,-0.194808


In [14]:
# Sanity check: with every component retained, the ratios should add up to ~1
# (tiny deviations are floating-point rounding).
print(sum(pca.explained_variance_ratio_))

1.0000000000000002


In [15]:
# explained_variance_ratio_ is an ndarray of shape (n_components,): the fraction of the
# total variance explained by each retained component, PC1 first.
# All components are kept here, so the ratios sum to 1.0.
print(pca.explained_variance_ratio_)

[4.73824256e-01 2.63433768e-01 9.32389914e-02 4.82346890e-02
 3.55996789e-02 2.77017208e-02 2.47625973e-02 1.14086252e-02
 6.79537415e-03 5.74321574e-03 3.02134250e-03 2.66844038e-03
 1.74741196e-03 1.30981837e-03 3.17335012e-04 1.92734928e-04]


In [16]:
# Explained-variance ratios as a one-column table: row 0 = PC1, row 1 = PC2, etc.
ratio_df = pd.DataFrame(pca.explained_variance_ratio_)
ratio_df

Unnamed: 0,0
0,0.473824
1,0.263434
2,0.093239
3,0.048235
4,0.0356
5,0.027702
6,0.024763
7,0.011409
8,0.006795
9,0.005743


In [17]:
# PCA by category: split the indicators into thematic groups, one frame per group.

# Socio-Demographics
demo_df = indicators_df.loc[:, ['persons_04_65', 'females', 'persons_km2']]

# Socio-Educational
edu_df = indicators_df.loc[:, ['non_australian', 'year_11_under', 'non_proficient']]

# Economic
eco_df = indicators_df.loc[:, ['unemployment_rate', 'below_poverty_line']]

# Health
health_df = indicators_df.loc[
    :,
    ['premature_deaths', 'copd_hosp', 'asthma_hosp', 'inf_pan_hosp', 'neg_hospital_beds'],
]

# Built environment
env_df = indicators_df.loc[:, ['renters', 'multi_unit_housing', 'neg_protected_areas']]

In [18]:
# Socio-Demographics: reduce the category to its first principal component.
pca_demo = PCA(n_components=1).fit(demo_df)
# Single row of loadings, one value per column of demo_df.
weights_demo = pca_demo.components_

In [19]:
# Raw PC1 loadings of the socio-demographic indicators, in the column order of demo_df.
weights_demo

array([[-0.50509671, -0.26529065,  0.82127838]])

In [20]:
# Label the loadings with the columns the PCA was fitted on, so the names
# cannot drift out of sync with demo_df.
weights_demo_df = pd.DataFrame(weights_demo, columns=demo_df.columns)

In [21]:
# Socio-demographic PC1 loadings, labelled by indicator.
weights_demo_df

Unnamed: 0,persons_04_65,females,persons_km2
0,-0.505097,-0.265291,0.821278


In [22]:
# Share of the socio-demographic variance captured by the single retained component (row 0 = PC1).
ratio_demo_df = pd.DataFrame(pca_demo.explained_variance_ratio_)
ratio_demo_df

Unnamed: 0,0
0,0.556302


In [23]:
# Socio-Educational: first principal component of the category.
pca_edu = PCA(n_components=1)
# Only the fitted model is needed here; the projected scores were never used,
# so fit() replaces the previous fit_transform() call.
pca_edu.fit(X=edu_df)
weights_edu = pca_edu.components_
# Column labels come from edu_df so they always match the fitted features.
weights_edu_df = pd.DataFrame(weights_edu, columns=edu_df.columns)
weights_edu_df

Unnamed: 0,non_australian,year_11_under,non_proficient
0,-0.600776,0.75188,-0.27156


In [24]:
# Share of the socio-educational variance captured by the single retained component (row 0 = PC1).
ratio_edu_df = pd.DataFrame(pca_edu.explained_variance_ratio_)
ratio_edu_df

Unnamed: 0,0
0,0.600288


In [25]:
# Economic/Financial: first principal component of the category.
pca_eco = PCA(n_components=1)
# Only the fitted model is needed here; the projected scores were never used,
# so fit() replaces the previous fit_transform() call.
pca_eco.fit(X=eco_df)
weights_eco = pca_eco.components_
# Column labels come from eco_df so they always match the fitted features.
weights_eco_df = pd.DataFrame(weights_eco, columns=eco_df.columns)
weights_eco_df

Unnamed: 0,unemployment_rate,below_poverty_line
0,0.763536,0.645765


In [26]:
# Share of the economic variance captured by the single retained component (row 0 = PC1).
ratio_eco_df = pd.DataFrame(pca_eco.explained_variance_ratio_)
ratio_eco_df

Unnamed: 0,0
0,0.950346


In [27]:
# Health: first principal component of the category.
pca_health = PCA(n_components=1)
# Only the fitted model is needed here; the projected scores were never used,
# so fit() replaces the previous fit_transform() call.
pca_health.fit(X=health_df)
weights_health = pca_health.components_
# Column labels come from health_df so they always match the fitted features.
weights_health_df = pd.DataFrame(weights_health, columns=health_df.columns)
weights_health_df

Unnamed: 0,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,neg_hospital_beds
0,0.380248,0.532715,0.446276,0.385843,0.472852


In [28]:
# Share of the health variance captured by the single retained component (row 0 = PC1).
ratio_health_df = pd.DataFrame(pca_health.explained_variance_ratio_)
ratio_health_df

Unnamed: 0,0
0,0.729075


In [29]:
# Built Environment: first principal component of the category.
pca_env = PCA(n_components=1)
# Only the fitted model is needed here; the projected scores were never used,
# so fit() replaces the previous fit_transform() call.
pca_env.fit(X=env_df)
weights_env = pca_env.components_
# Column labels come from env_df so they always match the fitted features.
# NOTE(review): in the recorded run all three loadings are negative. The sign of a
# principal component is arbitrary, so check the orientation before using these
# values as index weights.
weights_env_df = pd.DataFrame(weights_env, columns=env_df.columns)
weights_env_df

Unnamed: 0,renters,multi_unit_housing,neg_protected_areas
0,-0.533682,-0.644132,-0.547976


In [30]:
# Share of the built-environment variance captured by the single retained component (row 0 = PC1).
ratio_env_df = pd.DataFrame(pca_env.explained_variance_ratio_)
ratio_env_df

Unnamed: 0,0
0,0.740665
