In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
# Load the observations table used by every cell below (presumably hourly
# weather-station readings, judging by the file and column names).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
d = pd.read_pickle("hourlies.pickle")

In [3]:
# Feature matrix for the pooled PCA: three station descriptors followed by
# seven hourly readings, in the order their loadings are printed later.
pca_columns = [
    'ELEVATION',
    'LATITUDE',
    'LONGITUDE',
    'HOURLYDRYBULBTEMPF',
    'HOURLYDewPointTempF',
    'HOURLYRelativeHumidity',
    'HOURLYWindSpeed',
    'HOURLYStationPressure',
    'HOURLYAltimeterSetting',
    'HOURLYPrecip',
]
X = d.loc[:, pca_columns]

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
# Standardize each column (centered, unit variance) so that PCA is not
# dominated by whichever feature happens to have the largest numeric range.
# The fitted scaler is kept so the same transform can be re-applied later.
scaler = StandardScaler(with_mean=True)
scaled_X = scaler.fit_transform(X)

In [6]:
from sklearn.decomposition import PCA
# Fit a PCA that keeps 9 components of the standardized matrix (X has 10
# columns, so only the last component is discarded).
pca = PCA(n_components=9)
# Left as the cell's final expression so the notebook displays the fitted
# estimator's repr.
pca.fit(scaled_X)

PCA(copy=True, iterated_power='auto', n_components=9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [7]:
# Loadings of the retained components: one row per component, one value per
# input column, in the column order used to build X.
print(pca.components_)

[[ 0.49878342  0.21179254 -0.3729226  -0.26168828 -0.41597062 -0.25307404
   0.05677935 -0.49564996  0.11093477 -0.0230144 ]
 [-0.23329209  0.41716736  0.24718263 -0.61469122 -0.39859759  0.34430634
  -0.01068428  0.23083    -0.04099534 -0.00987356]
 [-0.06079436  0.26289867  0.00872853  0.12232656  0.01405906 -0.17949453
   0.67349066 -0.01077611 -0.65125172  0.04356395]
 [ 0.17354078  0.10957605 -0.18442262 -0.01518162  0.23710192  0.47484784
  -0.26910927 -0.20884843 -0.33486055  0.64324774]
 [-0.11482138 -0.14682929  0.14968165 -0.0318383  -0.21521386 -0.34502466
   0.24484057  0.15140442  0.33948782  0.76049835]
 [-0.23452899  0.67299916 -0.28756621  0.26987288  0.06397235 -0.36617746
  -0.37442781  0.22536694  0.05611964  0.06502557]
 [-0.1260936   0.08058605 -0.59796206 -0.0303189   0.20041793  0.37937338
   0.47957947  0.16234285  0.42141649 -0.0330665 ]
 [ 0.28927295  0.46778849  0.55021288  0.19163223  0.3030902   0.14069877
   0.20101701 -0.24104858  0.38450026 -0.00504466]


In [8]:
# Variance captured by each retained component (largest first).
print(pca.explained_variance_)

[3.08786486 1.83719475 1.31978578 1.02060214 0.98498906 0.71679867
 0.61486815 0.40400376 0.01353633]


In [9]:
# The same per-component variance expressed as a fraction of the total variance.
print(pca.explained_variance_ratio_)


[0.30878618 0.18371929 0.13197845 0.10206011 0.09849881 0.0716798
 0.06148675 0.04040034 0.00135363]


In [10]:
# Running total of the ratios: entry k is the share of variance explained by
# the first k + 1 components. Displayed as the cell's output.
np.cumsum(pca.explained_variance_ratio_)

array([0.30878618, 0.49250547, 0.62448391, 0.72654403, 0.82504283,
       0.89672263, 0.95820938, 0.99860972, 0.99996335])

These results are not very useful for dimensionality reduction: the first 6 principal components explain only about 90% (89.7%) of the variance, and 7 are needed to exceed 95%. Let's repeat the process separately for each location.

In [11]:
# Per-location PCA: repeat the standardize -> PCA -> cumulative-variance
# pipeline on each location's rows and collect one result per location.
#
# NOTE(review): ELEVATION / LATITUDE / LONGITUDE are presumably constant for a
# given index label. If so, they carry zero variance inside a single location
# and the per-location PCA effectively runs on the remaining seven columns.
# They are kept here so the result shape is unchanged -- confirm whether they
# should be dropped.
FEATURE_COLUMNS = ['ELEVATION', 'LATITUDE', 'LONGITUDE', 'HOURLYDRYBULBTEMPF', 'HOURLYDewPointTempF',
                   'HOURLYRelativeHumidity', 'HOURLYWindSpeed', 'HOURLYStationPressure',
                   'HOURLYAltimeterSetting', 'HOURLYPrecip']
N_COMPONENTS = 9


def cumulative_explained_variance(features, n_components=N_COMPONENTS):
    """Return the cumulative explained-variance ratios of a PCA on ``features``.

    The columns are standardized before the PCA is fitted.

    Parameters
    ----------
    features : pandas.DataFrame or 2-D array
        Rows are observations, columns are the variables to decompose.
    n_components : int, default N_COMPONENTS
        Maximum number of components to keep. It is capped at
        ``min(n_rows, n_columns)`` because PCA cannot extract more than that.

    Returns
    -------
    numpy.ndarray
        Cumulative sum of the explained-variance ratios. When ``features``
        has fewer than two rows the variance is undefined, so an array of
        ``n_components`` NaNs is returned instead of raising.
    """
    n_rows, n_cols = features.shape
    if n_rows < 2:
        return np.full(n_components, np.nan)

    # Local estimators only: the notebook-level `scaler` / `pca` fitted on the
    # pooled data must not be overwritten by the last location processed.
    standardized = StandardScaler(with_mean=True).fit_transform(features)
    model = PCA(n_components=min(n_components, n_rows, n_cols))
    model.fit(standardized)
    return np.cumsum(model.explained_variance_ratio_)


location = np.unique(d.index)

# `d.loc[[label], ...]` (list indexer) always yields a 2-D frame, even when a
# location has a single row; `d.loc[label, ...]` would collapse it to a Series.
# sum_store[k] corresponds to location[k].
sum_store = [
    cumulative_explained_variance(d.loc[[label], FEATURE_COLUMNS])
    for label in location
]

In [12]:
# Dump every cumulative explained-variance array; they appear in the same
# order as the entries of `location`.
print(sum_store)

[array([0.35353532, 0.61143308, 0.77686478, 0.91935715, 0.99970785,
       0.99998348, 1.        , 1.        , 1.        ]), array([0.34916364, 0.6372425 , 0.78312255, 0.92264098, 0.99723078,
       0.999977  , 1.        , 1.        , 1.        ]), array([0.31499306, 0.53995992, 0.72913083, 0.87157379, 0.99472329,
       0.99996488, 1.        , 1.        , 1.        ]), array([0.3489858 , 0.61188946, 0.75863317, 0.90110419, 0.99625897,
       0.99997865, 1.        , 1.        , 1.        ]), array([0.35380837, 0.59057915, 0.7759226 , 0.91840989, 0.99963806,
       0.99998621, 1.        , 1.        , 1.        ]), array([0.3263422 , 0.60727135, 0.77277095, 0.91542743, 0.99973578,
       0.99999052, 1.        , 1.        , 1.        ]), array([0.38849234, 0.59531337, 0.76580338, 0.90814349, 0.99944259,
       0.99999908, 1.        , 1.        , 1.        ]), array([0.36006419, 0.60765401, 0.78117937, 0.92357445, 0.99969323,
       0.99998858, 1.        , 1.        , 1.        ]), array([

## Conclusion

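Although 4 principal components were still required to explain around 90% of the variance within each location, this is a slight improvement over the 6 components needed when all locations are pooled. Note that the comparison is not strictly like-for-like: the per-location cumulative ratios reach 1.0 at the 7th component for every location shown, which is what one would expect if `ELEVATION`, `LATITUDE` and `LONGITUDE` are constant within a location and therefore contribute no variance to the per-location PCA.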