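# PCA
This notebook applies principal component analysis (PCA) to a table of normalized district-level indicators, first across all indicators together and then separately within each thematic category, and inspects the resulting component weights.

More information can be found in the following papers and articles: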

https://www.sartorius.com/en/knowledge/science-snippets/what-is-principal-component-analysis-pca-and-how-it-is-used-507186

https://unstats.un.org/unsd/hhsurveys/finalpublication/ch18fin3.pdf

https://www.researchgate.net/publication/321824664_Principal_Component_Analysis_PCA_based_Indexing

https://www.researchgate.net/publication/323357069_Extent_of_Vulnerability_in_Wheat_Producing_Agro-Ecologies_of_India_Tracking_from_Indicators_of_Cross-Section_and_Multi-Dimension_Data

https://www.scielo.br/j/rbrh/a/DbyBxXCZkZyp3RcQxbNTZ5t/?lang=en#

https://academic.oup.com/heapol/article/21/6/459/612115





In [1]:
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
# Load the normalized indicator table from the working directory.
# The file carries a saved index column ("Unnamed: 0") that is dropped further down.
indicators_df = pd.read_csv(filepath_or_buffer="indicators_normalized.csv")

In [62]:
# Preview the first five rows.
# NOTE(review): this cell has execution count 62 — its saved output shows the
# frame after the later column-engineering and drop cells had already run,
# not the freshly loaded CSV. A fresh Run All will show the raw columns here.
indicators_df.head(n=5)

Unnamed: 0,persons_04_65,females,persons_km2,non_australian,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,district,neg_hospital_beds,neg_protected_areas
0,0.573529,0.318102,0.386804,0.494318,0.415423,0.360042,0.375,0.575472,0.497881,0.584763,0.376071,0.437005,0.225,0.274464,Bayside,0.344828,0.915078
1,0.463235,0.351494,0.167463,0.295455,0.587065,0.202393,0.555556,0.578616,0.351695,0.059802,0.79276,1.0,0.737857,0.707992,Blacktown,0.793103,0.862101
2,1.0,0.599297,0.003754,0.028409,0.599502,0.001041,0.194444,0.496855,0.050847,0.015991,0.500414,0.387876,0.738571,0.584016,Blue Mountains,0.910345,0.0
3,0.448529,0.502636,0.617657,0.761364,0.211443,0.651405,0.597222,0.742138,0.572034,0.554342,0.327162,0.200872,0.123571,0.165302,Burwood,0.186207,1.0
4,0.433824,0.504394,0.05264,0.042614,0.783582,0.042144,0.111111,0.342767,0.101695,0.0,0.644101,0.625594,0.459286,0.661209,Camden,0.931034,0.977521


In [4]:
# Flip the direction of the two indicators by storing 1 - value in a new
# "neg_*" column.
# Assumes both source columns are scaled to [0, 1] (the file is named
# "indicators_normalized") — TODO confirm.
# NOTE(review): not re-runnable once the source columns have been dropped below.
for source_col, inverted_col in (
    ("hospital_beds_1000", "neg_hospital_beds"),
    ("protected_areas", "neg_protected_areas"),
):
    indicators_df[inverted_col] = 1 - indicators_df[source_col]


In [5]:
# Check that the two inverted "neg_*" columns were appended.
indicators_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,persons_04_65,females,persons_km2,non_australian,arrived_within_5,year_11_under,non_proficient,unemployment_rate,below_poverty_line,...,multi_unit_housing,premature_deaths,hospital_beds_1000,copd_hosp,asthma_hosp,inf_pan_hosp,protected_areas,district,neg_hospital_beds,neg_protected_areas
0,0,0.573529,0.318102,0.386804,0.494318,0.489373,0.415423,0.360042,0.375,0.575472,...,0.584763,0.376071,0.655172,0.437005,0.225,0.274464,0.084922,Bayside,0.344828,0.915078
1,1,0.463235,0.351494,0.167463,0.295455,0.267269,0.587065,0.202393,0.555556,0.578616,...,0.059802,0.79276,0.206897,1.0,0.737857,0.707992,0.137899,Blacktown,0.793103,0.862101
2,2,1.0,0.599297,0.003754,0.028409,0.022848,0.599502,0.001041,0.194444,0.496855,...,0.015991,0.500414,0.089655,0.387876,0.738571,0.584016,1.0,Blue Mountains,0.910345,0.0
3,3,0.448529,0.502636,0.617657,0.761364,0.856004,0.211443,0.651405,0.597222,0.742138,...,0.554342,0.327162,0.813793,0.200872,0.123571,0.165302,0.0,Burwood,0.186207,1.0
4,4,0.433824,0.504394,0.05264,0.042614,0.044633,0.783582,0.042144,0.111111,0.342767,...,0.0,0.644101,0.068966,0.625594,0.459286,0.661209,0.022479,Camden,0.931034,0.977521


In [6]:
# Remove columns that should not enter the PCA:
#   - "Unnamed: 0": appears to be the row index saved with the CSV
#   - "hospital_beds_1000", "protected_areas": replaced by their "neg_*" versions
#   - "arrived_within_5": excluded from the analysis
# NOTE(review): re-running this cell raises KeyError because the columns are already gone.
columns_to_remove = [
    "Unnamed: 0",
    "arrived_within_5",
    "hospital_beds_1000",
    "protected_areas",
]
indicators_df = indicators_df.drop(columns=columns_to_remove)

In [7]:
# List the remaining columns: the indicator features plus the "district" label.
indicators_df.columns

Index(['persons_04_65', 'females', 'persons_km2', 'non_australian',
       'year_11_under', 'non_proficient', 'unemployment_rate',
       'below_poverty_line', 'renters', 'multi_unit_housing',
       'premature_deaths', 'copd_hosp', 'asthma_hosp', 'inf_pan_hosp',
       'district', 'neg_hospital_beds', 'neg_protected_areas'],
      dtype='object')

In [63]:
# Preview the cleaned frame that feeds the PCA cells below.
indicators_df.head(n=5)

Unnamed: 0,persons_04_65,females,persons_km2,non_australian,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,district,neg_hospital_beds,neg_protected_areas
0,0.573529,0.318102,0.386804,0.494318,0.415423,0.360042,0.375,0.575472,0.497881,0.584763,0.376071,0.437005,0.225,0.274464,Bayside,0.344828,0.915078
1,0.463235,0.351494,0.167463,0.295455,0.587065,0.202393,0.555556,0.578616,0.351695,0.059802,0.79276,1.0,0.737857,0.707992,Blacktown,0.793103,0.862101
2,1.0,0.599297,0.003754,0.028409,0.599502,0.001041,0.194444,0.496855,0.050847,0.015991,0.500414,0.387876,0.738571,0.584016,Blue Mountains,0.910345,0.0
3,0.448529,0.502636,0.617657,0.761364,0.211443,0.651405,0.597222,0.742138,0.572034,0.554342,0.327162,0.200872,0.123571,0.165302,Burwood,0.186207,1.0
4,0.433824,0.504394,0.05264,0.042614,0.783582,0.042144,0.111111,0.342767,0.101695,0.0,0.644101,0.625594,0.459286,0.661209,Camden,0.931034,0.977521


In [48]:
# Information PCA https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

# "district" is a text label, not a feature, so it is excluded from the fit.
data = indicators_df.drop(columns=["district"])

# Full PCA: no n_components given, so every component is retained.
pca = PCA()

# Keep the projected rows (one row per observation, one column per principal
# component) instead of discarding them. Note these columns are components,
# not the original indicators, so they must not be labeled with feature names.
pca_scores = pca.fit_transform(X=data)

In [49]:
# Loadings of the fitted PCA: one row per principal component, one column per
# input feature (in the column order of the frame passed to the fit).
weights = pca.components_

In [51]:
# Label each loading with the feature it belongs to. The labels are taken from
# the frame the PCA was fitted on, so they cannot drift out of sync with the
# actual feature order. Row i of the table is principal component i (0-based).
weights_df = pd.DataFrame(weights, columns=data.columns)

In [83]:
# Loadings of the first principal component only.
weights_df.head(n=1)

Unnamed: 0,persons_04_65,females,persons_km2,non_australian,year_11_under,non_proficient,unemployment_rate,below_poverty_line,renters,multi_unit_housing,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,neg_hospital_beds,neg_protected_areas
0,0.052699,-0.064037,-0.31624,-0.244323,0.383809,-0.037466,0.079417,0.13345,-0.218181,-0.391929,0.168997,0.320244,0.287105,0.267054,0.369551,-0.194808


In [55]:
# Sanity check: with every component retained, the explained-variance ratios
# should add up to 1 (up to floating-point rounding).
total_explained_ratio = sum(pca.explained_variance_ratio_)
print(total_explained_ratio)

1.0000000000000002


In [56]:
# Share of the total variance captured by each component, in component order.
# In the saved output the first two components together account for roughly
# 74% of the variance (0.474 + 0.263).
print(pca.explained_variance_ratio_)

[4.73824256e-01 2.63433768e-01 9.32389914e-02 4.82346890e-02
 3.55996789e-02 2.77017208e-02 2.47625973e-02 1.14086252e-02
 6.79537415e-03 5.74321574e-03 3.02134250e-03 2.66844038e-03
 1.74741196e-03 1.30981837e-03 3.17335012e-04 1.92734928e-04]


In [64]:
# PCA by category: split the indicators into thematic groups so that a
# separate PCA can be fitted on each group.

# Socio-Demographics
demo_columns = ['persons_04_65', 'females', 'persons_km2']
demo_df = indicators_df.loc[:, demo_columns]

# Socio Educational
edu_columns = ['non_australian', 'year_11_under', 'non_proficient']
edu_df = indicators_df.loc[:, edu_columns]

# Economic
eco_columns = ['unemployment_rate', 'below_poverty_line']
eco_df = indicators_df.loc[:, eco_columns]

# Health
health_columns = ['premature_deaths', 'copd_hosp', 'asthma_hosp', 'inf_pan_hosp', 'neg_hospital_beds']
health_df = indicators_df.loc[:, health_columns]

# Built environment
env_columns = ['renters', 'multi_unit_housing', 'neg_protected_areas']
env_df = indicators_df.loc[:, env_columns]

In [72]:
# Socio-Demographics
# Fit a one-component PCA on the demographic indicators. Only the fitted
# loadings are needed here, so the data is fitted without also computing
# (and discarding) the projected scores.
pca_demo = PCA(n_components=1)
pca_demo.fit(demo_df)

# Single row: the loadings of the first principal component.
weights_demo = pca_demo.components_

In [73]:
# Raw loadings array for the demographic PCA (one row, one value per feature).
weights_demo

array([[-0.50509671, -0.26529065,  0.82127838]])

In [74]:
# Label the demographic loadings with the columns of the frame the PCA was
# fitted on, so the labels always match the actual feature order.
weights_demo_df = pd.DataFrame(weights_demo, columns=demo_df.columns)

In [75]:
# Display the labeled demographic loadings.
weights_demo_df

Unnamed: 0,persons_04_65,females,persons_km2
0,-0.505097,-0.265291,0.821278


In [76]:
# Socio-Educational
# Fit a one-component PCA; only the loadings are used, so the projected
# scores are not computed.
pca_edu = PCA(n_components=1)
pca_edu.fit(edu_df)
weights_edu = pca_edu.components_

# Column labels come from the fitted frame so they always match the feature order.
weights_edu_df = pd.DataFrame(weights_edu, columns=edu_df.columns)
weights_edu_df

Unnamed: 0,non_australian,year_11_under,non_proficient
0,-0.600776,0.75188,-0.27156


In [78]:
# Economic/Financial
# Fit a one-component PCA; only the loadings are used, so the projected
# scores are not computed.
pca_eco = PCA(n_components=1)
pca_eco.fit(eco_df)
weights_eco = pca_eco.components_

# Column labels come from the fitted frame so they always match the feature order.
weights_eco_df = pd.DataFrame(weights_eco, columns=eco_df.columns)
weights_eco_df

Unnamed: 0,unemployment_rate,below_poverty_line
0,0.763536,0.645765


In [79]:
# Health
# Fit a one-component PCA; only the loadings are used, so the projected
# scores are not computed.
pca_health = PCA(n_components=1)
pca_health.fit(health_df)
weights_health = pca_health.components_

# Column labels come from the fitted frame so they always match the feature order.
weights_health_df = pd.DataFrame(weights_health, columns=health_df.columns)
weights_health_df

Unnamed: 0,premature_deaths,copd_hosp,asthma_hosp,inf_pan_hosp,neg_hospital_beds
0,0.380248,0.532715,0.446276,0.385843,0.472852


In [84]:
# Built Environment
# Fit a one-component PCA; only the loadings are used, so the projected
# scores are not computed.
pca_env = PCA(n_components=1)
pca_env.fit(env_df)
weights_env = pca_env.components_

# Column labels come from the fitted frame so they always match the feature order.
# NOTE(review): in the saved output all three loadings are negative. The sign
# of a principal component is arbitrary, so if these loadings are used as
# index weights the orientation should be checked (and flipped if needed)
# before combining them with the other categories — confirm with the
# downstream index construction.
weights_env_df = pd.DataFrame(weights_env, columns=env_df.columns)
weights_env_df

Unnamed: 0,renters,multi_unit_housing,neg_protected_areas
0,-0.533682,-0.644132,-0.547976
