### This notebook performs a factor analysis / PCA with varimax rotation (R-verified)

In [1]:
# Load the rpy2 IPython extension; it supplies the %Rpush and %%R magics used by the R cells further down.
%load_ext rpy2.ipython
# Render inline figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'

import pandas as pd
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris  # NOTE(review): not referenced by any cell visible in this notebook
import matplotlib.pyplot as plt  # NOTE(review): not referenced by any cell visible in this notebook
import numpy as np

import warnings
# Suppress every warning category for the rest of the session (library warnings are hidden too).
warnings.filterwarnings('ignore')

#### set directory and import dataset

In [2]:
# Input and output locations currently point at the same folder. The trailing
# slash matters: file names are appended to `datadir` by plain string concatenation.
datadir = outdir = '/Volumes/Maxtor/phdresearch/som_new/'

In [3]:
# Read the CSV with explicit column names: twelve SOM node labels written as
# "row,col" (4 rows x 3 columns) followed by five *_AVG columns.
data = pd.read_csv(
    datadir + 'SAfr_700hpa_era_newAB.csv',
    names=[
        '0,0', '0,1', '0,2',
        '1,0', '1,1', '1,2',
        '2,0', '2,1', '2,2',
        '3,0', '3,1', '3,2',
        'MEI_AVG', 'AAO_AVG', 'ITCZInt_AVG', 'MJO_AVG', 'DMI_AVG',
    ],
)

#### select specific columns in dataset

In [4]:
# Keep the first 15 named columns (the twelve node columns plus MEI_AVG,
# AAO_AVG and ITCZInt_AVG); MJO_AVG and DMI_AVG are left out.
df = data.iloc[:, :15]

In [5]:
# Preview the first rows of the selected columns. (The former `type(df);`
# prefix evaluated to a value that was immediately discarded, so it is dropped.)
df.head()

Unnamed: 0,"0,0","0,1","0,2","1,0","1,1","1,2","2,0","2,1","2,2","3,0","3,1","3,2",MEI_AVG,AAO_AVG,ITCZInt_AVG
JFM80,0,0,6,0,5,9,8,5,13,12,20,13,0.652,-0.95,6.093
FMA80,0,2,6,4,6,13,7,5,9,11,16,11,0.729,-1.491,6.427
MAM80,8,9,10,7,3,12,6,6,8,8,8,7,0.853,-1.324,6.113
AMJ80,15,15,9,9,6,13,6,6,5,6,0,1,0.925,-0.754,5.903
MJJ80,15,15,20,6,7,16,3,5,5,0,0,0,0.872,-0.712,6.273


#### extract values at each node of the SOM frequency dataset

In [6]:
# Columns fed into the analysis: the twelve SOM node labels followed by the
# three *_AVG columns that were kept in `df`.
nodes = [
    '0,0', '0,1', '0,2',
    '1,0', '1,1', '1,2',
    '2,0', '2,1', '2,2',
    '3,0', '3,1', '3,2',
    'MEI_AVG', 'AAO_AVG', 'ITCZInt_AVG',
]

# Pull the selected columns out as a plain array and standardize each column
# in a single step.
x = StandardScaler().fit_transform(df[nodes].to_numpy())

In [7]:
# x is a NumPy array; show its (rows, columns) shape. The discarded
# `type (x);` expression had no effect and is removed.
x.shape

(406, 15)

#### perform factor analysis with varimax rotation

In [8]:
# Number of factors/components to keep. Reused by the scikit-learn fit, by the
# NumPy eigen-decomposition check, and pushed to the R session.
n_comps = 3

In [66]:
# Build the varimax-rotated factor model first, then fit it on the
# standardized data (fit returns the estimator itself).
fan = FactorAnalysis(rotation='varimax', n_components=n_comps)
fan = fan.fit(x)

In [67]:
# Ask the fitted model for its covariance estimate. The trailing ';' suppresses
# the cell output and the value is not stored, so this cell has no visible effect.
fan.get_covariance();

In [68]:
# Show the fitted loadings transposed, so each row is an input column and each
# column is one of the n_comps factors.
fan.components_.T
-0.81229198 -0.05672762  0.01706003 

array([[-0.81502447, -0.03480992,  0.02812297],
       [-0.82913934,  0.11197453,  0.19759255],
       [-0.68419625,  0.36594673,  0.29815912],
       [-0.6794119 ,  0.05111988, -0.28662233],
       [-0.30974335,  0.4438227 , -0.29076044],
       [-0.00837923,  0.72039307,  0.03095742],
       [-0.11678019, -0.09102721, -0.68182675],
       [ 0.4544653 ,  0.09565417, -0.4286571 ],
       [ 0.66459206,  0.2858315 ,  0.19576088],
       [ 0.54842232, -0.55911666, -0.10681822],
       [ 0.73261859, -0.48129461,  0.19716769],
       [ 0.74314022, -0.2576395 ,  0.29560811],
       [-0.23096913,  0.54929702,  0.0923048 ],
       [ 0.06120481, -0.06960255,  0.23025851],
       [ 0.09859103, -0.19750385, -0.0816199 ]])

In [69]:
# Factor scores: the standardized data transformed by the fitted model
# (one row per sample, one column per factor).
fan_scores = fan.transform(x)

In [70]:
# Display the raw score array (NumPy abbreviates the middle rows).
fan_scores

array([[ 1.1561915 , -0.06021044,  0.42288537],
       [ 0.84491527,  0.17611087,  0.12229212],
       [-0.00716542,  0.13258153,  0.31981211],
       ...,
       [-1.52939638, -0.68713567, -0.45890216],
       [-0.86959885, -0.47687711, -0.38361965],
       [ 0.24000241,  0.28792126, -0.3996681 ]])

In [71]:
# Wrap the factor scores in a DataFrame with one labelled column per factor
# (RC1, RC2, ...). The labels are derived from the width of the score matrix,
# so the cell keeps working when the number of factors is changed instead of
# failing on a column-count mismatch.
df_scores = pd.DataFrame(
    fan_scores,
    columns=[f'RC{k}' for k in range(1, fan_scores.shape[1] + 1)],
)
df_scores

Unnamed: 0,RC1,RC2,RC3
0,1.156191,-0.060210,0.422885
1,0.844915,0.176111,0.122292
2,-0.007165,0.132582,0.319812
3,-0.766558,0.268395,0.047018
4,-0.901574,0.825561,0.736015
5,-0.679149,1.189052,0.916992
6,-0.630750,0.894283,0.483377
7,-1.006724,0.040748,0.280071
8,-0.740854,-0.348403,-1.077929
9,-0.009384,-0.601458,-0.970929


#### sanity check --- perform pca from first principles using numpy

In [15]:
# The covariance matrix is symmetric, so use eigh: unlike the general-purpose
# eig it always returns real values, and it returns eigenvalues in ascending
# order rather than in no guaranteed order.
eigvals, eigvecs = np.linalg.eigh(np.cov(x.T))

# Reorder to descending eigenvalue so that the first n_comps columns really
# are the leading principal components.
order = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

# Loadings = eigenvector scaled by sqrt(eigenvalue). Only the retained
# components are scaled, so tiny negative trailing eigenvalues never reach sqrt.
# NOTE: the sign of each column is arbitrary and may differ between solvers.
factors = eigvecs[:, :n_comps] * np.sqrt(eigvals[:n_comps])
print(factors)

[[-0.77793312  0.30714889  0.25604422]
 [-0.82045337  0.07283559  0.35008235]
 [-0.77649269 -0.28713084  0.30414943]
 [-0.70880852  0.37084436 -0.0736359 ]
 [-0.50491111 -0.09636458 -0.52420846]
 [-0.28842885 -0.70739083 -0.33173586]
 [-0.11407354  0.55018214 -0.60418794]
 [ 0.4170387   0.02530657 -0.65685081]
 [ 0.53831704 -0.62420616 -0.09162505]
 [ 0.75550438  0.35455099  0.06331131]
 [ 0.89577883  0.06770724  0.24102826]
 [ 0.81625093 -0.17773774  0.21855766]
 [-0.46026671 -0.52954658 -0.19856932]
 [ 0.14600456 -0.11260309  0.4030715 ]
 [ 0.20441316  0.31022392 -0.13708812]]


In [72]:
# Import the rotation helper explicitly so this cell runs on a fresh kernel
# (no `fa` alias is defined anywhere in the notebook).
# NOTE(review): _ortho_rotation is a private scikit-learn function and may move
# or change between releases — confirm against the installed version.
from sklearn.decomposition._factor_analysis import _ortho_rotation

# Varimax-rotate the unrotated loadings. The result is transposed back so that
# rows are input columns and columns are rotated components.
rotated = _ortho_rotation(factors[:, :n_comps], method='varimax').T
rotated

array([[-0.87342606, -0.01131561,  0.04558394],
       [-0.84409682, -0.17208279,  0.24268181],
       [-0.65391979, -0.45054467,  0.38378419],
       [-0.75328167, -0.0719827 , -0.26970821],
       [-0.27719737, -0.5339026 , -0.42086281],
       [ 0.10277552, -0.82451238,  0.05708062],
       [-0.16124463,  0.11346991, -0.80117475],
       [ 0.52630313, -0.07048493, -0.56925528],
       [ 0.74232505, -0.27439412,  0.24794173],
       [ 0.51601767,  0.65136374, -0.09976291],
       [ 0.70649641,  0.57043296,  0.20141416],
       [ 0.73688283,  0.3337475 ,  0.30205965],
       [-0.15222862, -0.70875896,  0.07862562],
       [ 0.07086393,  0.15382973,  0.4096073 ],
       [ 0.09562399,  0.27590965, -0.26748213]])

#### create a dataframe from rotated principal components

In [17]:
# Tabulate the rotated loadings with one labelled column per component
# (RC1, RC2, ...). Labels follow the width of `rotated`, so a different number
# of components does not raise a column-count mismatch.
principalDf = pd.DataFrame(
    data=rotated,
    columns=[f'RC{k}' for k in range(1, rotated.shape[1] + 1)],
)

In [18]:
# Replace the positional row labels 0..14 with the name of the input column
# each loading row belongs to (modifies principalDf in place).
principalDf.rename(
    index=dict(enumerate([
        '0,0', '0,1', '0,2',
        '1,0', '1,1', '1,2',
        '2,0', '2,1', '2,2',
        '3,0', '3,1', '3,2',
        'MEI_AVG', 'AAO_AVG', 'ITCZInt_AVG',
    ])),
    inplace=True,
)


In [19]:
# Display the rotated loadings table.
principalDf

Unnamed: 0,RC1,RC2,RC3
00,-0.873426,-0.011316,0.045584
01,-0.844097,-0.172083,0.242682
02,-0.65392,-0.450545,0.383784
10,-0.753282,-0.071983,-0.269708
11,-0.277197,-0.533903,-0.420863
12,0.102776,-0.824512,0.057081
20,-0.161245,0.11347,-0.801175
21,0.526303,-0.070485,-0.569255
22,0.742325,-0.274394,0.247942
30,0.516018,0.651364,-0.099763


#### sanity check----verifying PCA with R

In [74]:
%Rpush factors
%Rpush x
%Rpush n_comps

In [75]:
%%R
# Rotate the loadings pushed from Python with R's varimax.
# NOTE(review): normalize = TRUE applies Kaiser normalization in R; the Python
# rotation above may not normalize, which could explain the small differences
# between the two loading tables — confirm before treating this as an exact check.
a <- varimax(factors, normalize = TRUE, eps = 1e-6)

In [78]:
%%R
# Strip the class attribute from the rotated loadings so they print as a plain numeric matrix.
unclass(a$loadings)

             [,1]        [,2]        [,3]
 [1,] -0.86984524 -0.08660903  0.03077592
 [2,] -0.82815218 -0.27418523  0.19998206
 [3,] -0.61670302 -0.55601030  0.29735806
 [4,] -0.74480301 -0.08624015 -0.28842404
 [5,] -0.23336861 -0.47928070 -0.50482173
 [6,]  0.16774412 -0.81239927 -0.07423613
 [7,] -0.16855012  0.22785739 -0.77487296
 [8,]  0.53107547  0.06322529 -0.56566430
 [9,]  0.76137772 -0.25191756  0.21131783
[10,]  0.46289237  0.69717920  0.01316891
[11,]  0.65874325  0.58399639  0.30017353
[12,]  0.70765234  0.33731170  0.36203966
[13,] -0.09566693 -0.72186866 -0.03805631
[14,]  0.05783814  0.09103560  0.42991940
[15,]  0.07384082  0.32192952 -0.21846211


In [None]:
#help(FactorAnalysis);

In [2]:
#%%R

#print(psych::principal(X, rotate="varimax", nfactors=3, scores=TRUE,
#                       covar=TRUE)$loadings)

In [1]:
#%%R
#install.packages('psych',dep =T)