In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [2]:
 
def preprocess(filename):
    """Preprocess NCDC weather data"""

    fields = ['STN', 'WBAN', 'YEARMODA', 'TEMP', 'TEMP_count', 'DEWP', 'DEWP_count', 'SLP', 'SLP_count', 'STP', 'STP_count', 'VISIB', 'VISIB_count', 'WDSP', 'WDSP_count', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP', 'FRSHTT']
     
    df = pd.read_csv(filename, 
                  sep=r'\s+', 
                  names=fields, 
                  header=0, 
                  parse_dates=['YEARMODA'], 
                  na_values={'TEMP':[9999.9], 
                             'DEWP':[9999.9], 
                             'SLP':[9999.9], 
                             'STP':[9999.9], 
                             'VISIB':[999.9], 
                             'WDSP':[999.9], 
                             'MXSPD':[999.9], 
                             'GUST':[999.9], 
                             'MAX':['9999.9'], # doesn't matter whether float or str
                             'MIN':['9999.9'], 
                             'PRCP':['99.99'],
                             'SNDP':[999.9]}
                 )
    flagged = df.copy()
 
    def strip_flag(x):
        if type(x) is float:
            return x
        elif type(x) is str:
            return float(x[:-1]) if '*' in x else float(x)
    def extract_flag(x):
        if type(x) is float:
            return False
        elif type(x) is str:
            return True if '*' in x else False
     
    flagged['MAX'] = df['MAX'].map(strip_flag)
    flagged['MAX_flag'] = df['MAX'].map(extract_flag)
    flagged['MIN'] = df['MIN'].map(strip_flag)
    flagged['MIN_flag'] = df['MIN'].map(extract_flag)
     
    flagged['PRCP'] = df['PRCP'].map(lambda x: float(x[:-1]) if type(x) is str else x)
    PRCP_flag = df['PRCP'].map(lambda x: x[-1] if type(x) is str else x)
    PRCP_dummies = pd.get_dummies(PRCP_flag).add_prefix('PRCP_')
    preprocessed = flagged.join(PRCP_dummies)
     
    return preprocessed


In [3]:
print "processing 1..."
df1 = preprocess('CA_1981-1985.txt')
print "processing 2..."
df2 = preprocess('CA_1985-1989.txt')
print "processing 3..."
df3 = preprocess('CA_1989-1993.txt')
print "processing 4..."
df4 = preprocess('CA_1993-1997.txt')
print "processing 5..."
df5 = preprocess('CA_1997-2001.txt')
print "processing 6..."
df6 = preprocess('CA_2001-2005.txt')
print "processing 7..."
df7 = preprocess('CA_2005-2009.txt')
print "processing 8..."
df8 = preprocess('CA_2009-2015.txt')

processing 1...
processing 2...
processing 3...
processing 4...
processing 5...
processing 6...
processing 7...
processing 8...


In [36]:
frames = [ df1, df2, df3, df4, df5, df6, df7, df8]
df = pd.concat(frames)
df_processed = df[['YEARMODA','TEMP', 'DEWP', 'PRCP']].dropna(axis=0)
data = df_processed['TEMP','DEWP','PRCP'].values
data_normed = data / np.linalg.norm(data)
data

KeyError: ('TEMP', 'DEWP', 'PRCP')

In [14]:
pca = PCA(n_components=3) # Want to keep 2 components for 3 dimentional data
pca.fit(data_normed)

PCA(copy=True, n_components=3, whiten=False)

In [23]:
first_pc = pca.components_[0]
second_pc = pca.components_[1]
third_pc = pca.components_[2]
# the eigenvalues
#print pca.explained_variance_ratio_
#print first_pc
#print second_pc


In [16]:
transformed_data = pca.transform(data_normed)
print transformed_data

[[ -1.60515851e-04   1.14305849e-04  -7.64763877e-07]
 [ -1.05941171e-04   1.32325184e-04   1.76727790e-07]
 [ -3.35125308e-05   1.09922443e-04  -6.42281069e-07]
 ..., 
 [ -5.11247304e-05  -2.90571667e-04   3.85158621e-07]
 [ -3.05217066e-05  -2.39942057e-04   2.71342814e-07]
 [  2.54569431e-05  -2.41830564e-04   3.25215590e-07]]


In [33]:
for ii, jj in zip(transformed_data[:20],data[:20]):
    plt.scatter(first_pc[0]*ii[0], first_pc[1]*ii[0], third_pc[2]*ii[0], color="r")
    plt.scatter(second_pc[0]*ii[1], second_pc[1]*ii[1],third_pc[2]*ii[1], color="c")
    plt.scatter(third_pc[0]*ii[2],  third_pc[1]*ii[2], third_pc[2]*ii[2], color="g")
    plt.scatter( jj[0], jj[1], color="b")

TypeError: unbound method scatter() must be called with Axes3D instance as first argument (got float64 instance instead)