In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
df_train = pd.read_csv('data/train.csv')
df_train = df_train[['id', 'reading score', 'writing score', 'math score']]

In [13]:
ip_mat = df_train.values[:, 1:]
ip_mat.dtype = np.float
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 4 columns):
id               750 non-null int64
reading score    750 non-null int64
writing score    750 non-null int64
math score       750 non-null int64
dtypes: int64(4)
memory usage: 23.5 KB


In [4]:

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def step_fuc(x):
    return 1 if x > 0 else 0 if x == 0 else -1

In [41]:
class PCA(object):
    
    
    def fit(self, X, n_components):
        n_samples, n_features = X.shape
        
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_
        
        #X = U*S*V^T
        #注意下方S是上方S主对角线上元素
        U, S, V = np.linalg.svd(X, full_matrices=False)
        
        explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = S.copy()
        
        components_ = V
        
        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, V
    
    
    def transform(self, X):
        
        X -= self.mean_
        
        print('transform: X.shape is {0} \ncomponents shape is {1}'.format(X.shape, self.components_.shape))
        red_x = np.dot(X, self.components_.T)
        
        return red_x
    
    def fit_transform(self, X, n_components):
        
        U, S, V = self.fit(X, n_components)
        U = U[:, :self.n_components_]
        print('fit_transform U S V shape is {0}, {1}, {2}'.format(U.shape, S.shape, V.shape))
        
        #red_x = X * V = U*S*V^T  * V = U*S
        red_x = U * S[:n_components]
        
        return red_x
    
    
    def decomposition_cov(self, X, n_components):
        
        x_mean_ = np.mean(X, axis=0)
        X -= x_mean_
        
        x_cov = np.cov(X, rowvar=0)
        
        #对比array和mat的乘法
        eig_vals, eig_vects = np.linalg.eig(np.mat(x_cov))
        
        eig_vals_idx = np.argsort(eig_vals)
        eig_vals_idx = eig_vals_idx[:-(n_components+1):-1]
        
        self.eig_vals  = eig_vals[eig_vals_idx]
        self.eig_vects = eig_vects[:, eig_vals_idx]
        
        red_x = X * self.eig_vects
        
        return red_x
        
        
        

In [42]:
#cmp
from sklearn.decomposition import PCA as skl_PCA


pca = PCA()
skl_pca = skl_PCA()

In [43]:
test_components = 2

x_1 = pca.fit_transform(ip_mat, test_components)

pca.fit(ip_mat, test_components)
x_2 = pca.transform(ip_mat)

x_3 = pca.decomposition_cov(ip_mat, test_components)

x_4 = skl_pca.fit_transform(ip_mat, test_components)

skl_pca.fit(ip_mat, test_components)
x_5 = skl_pca.transform(ip_mat)

fit_transform U S V shape is (750, 2), (3,), (3, 3)
transform: X.shape is (750, 3) 
components shape is (2, 3)


  app.launch_new_instance()
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var


In [44]:
x_1, x_2, x_3, x_4, x_5

(array([[-9.9e-323,  2.0e-323],
        [-4.4e-323, -1.5e-323],
        [ 5.9e-323, -1.5e-323],
        ...,
        [ 4.4e-323,  3.0e-323],
        [ 9.4e-323,  4.0e-323],
        [ 3.0e-323,  9.9e-324]]), array([[-9.9e-323,  2.0e-323],
        [-4.4e-323, -1.5e-323],
        [ 5.4e-323, -1.5e-323],
        ...,
        [ 4.4e-323,  2.5e-323],
        [ 8.9e-323,  4.0e-323],
        [ 3.0e-323,  9.9e-324]]), matrix([[-4.0e-323, -7.4e-323],
         [-4.0e-323, -3.0e-323],
         [ 2.0e-323,  4.9e-323],
         ...,
         [ 4.9e-323,  2.0e-323],
         [ 8.4e-323,  2.5e-323],
         [ 2.5e-323,  0.0e+000]]), array([[ 9.9e-323,  2.0e-323, -9.9e-324],
        [ 4.4e-323, -1.5e-323, -1.5e-323],
        [-5.9e-323, -1.5e-323,  9.9e-324],
        ...,
        [-4.4e-323,  3.0e-323,  9.9e-324],
        [-9.4e-323,  4.0e-323, -2.0e-323],
        [-3.0e-323,  9.9e-324, -1.5e-323]]), array([[ 9.9e-323,  2.0e-323, -4.9e-324],
        [ 4.4e-323, -1.5e-323, -1.5e-323],
        [-5.4e-32