In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [5]:
def pca(data, component_num = 3, center = True):
  """Return the leading principal components of ``data``.

  The components are the eigenvectors of the scatter matrix ``Xc.T @ Xc``,
  where ``Xc`` is ``data`` with the per-feature (column) mean removed.
  Without that centering step the decomposition is dominated by the mean
  of the data instead of its variance, so the result is not a PCA.

  Args:
    data: 2-D array-like with one row per sample and one column per feature.
    component_num: how many leading components to return.
    center: subtract the column means before the decomposition (default).
      Pass ``False`` to decompose the raw second-moment matrix
      ``data.T @ data`` instead, which is what this function did before
      centering was added.

  Returns:
    A tuple ``(top_eigen_vals, top_eigen_vecs, explain_ratios)``:
      * ``top_eigen_vals`` - list of the ``component_num`` largest
        eigenvalues, in descending order.
      * ``top_eigen_vecs`` - list of the matching eigenvectors, each a 1-D
        array with one entry per feature.
      * ``explain_ratios`` - 1-D array with the cumulative share of the
        eigenvalue total, over ALL components (not only the returned ones).
        All zeros when every eigenvalue is zero.

  Raises:
    ValueError: if ``data`` is not two-dimensional.
  """
  data = np.asarray(data, dtype=float)
  if data.ndim != 2:
    raise ValueError("pca expects a 2-D array of shape (samples, features), "
                     "got %d dimension(s)" % data.ndim)

  # Skip centering for zero-row input: the mean of no rows is undefined.
  if center and data.shape[0] > 0:
    data = data - data.mean(axis=0)

  scatter = np.dot(data.T, data)
  eigen_vals, eigen_vecs = np.linalg.eigh(scatter)
  # The scatter matrix is positive semi-definite, so any negative eigenvalue
  # is floating-point noise around zero; fold it back to a non-negative value.
  eigen_vals = np.abs(eigen_vals)

  # Order components by descending eigenvalue; the stable sort keeps tied
  # eigenvalues in the order eigh produced them.
  order = np.argsort(-eigen_vals, kind='stable')
  sorted_eigen_vals = eigen_vals[order]
  sorted_eigen_vecs = eigen_vecs[:, order].T  # one eigenvector per row

  # Cumulative explained ratio over all components.
  total_eigen_val = sorted_eigen_vals.sum()
  if total_eigen_val > 0:
    explain_ratios = np.cumsum(sorted_eigen_vals) / total_eigen_val
  else:
    # Every eigenvalue is zero, so there is nothing to explain.
    explain_ratios = np.zeros_like(sorted_eigen_vals)

  top_eigen_vals = list(sorted_eigen_vals[:component_num])
  top_eigen_vecs = list(sorted_eigen_vecs[:component_num])

  return top_eigen_vals, top_eigen_vecs, explain_ratios

In [6]:
# Number of principal components kept as model features.
PCA_comps = 1

# Load the breast_cancer dataset once and reuse the returned Bunch for both
# the feature matrix and the labels (previously the dataset was loaded twice).
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target
# X = pd.DataFrame(data=X, columns=init_data['feature_names'])
# y = pd.DataFrame(data=y, columns=['label'])

# Split X into training and testing sets (30 % held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the PCA basis on the training split only, then project both splits
# with that same basis so no test information reaches the model.
eigen_vals, eigen_vecs, exp_rs = pca(X_train, PCA_comps)
projection = np.array(eigen_vecs).T  # shape: (n_features, PCA_comps)
X_train_pca = np.dot(X_train, projection)
X_test_pca = np.dot(X_test, projection)

# Train a RandomForestClassifier as model
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)

forest.fit(X_train_pca, y_train)

# Score once and reuse the value for both reports.
y_pred = forest.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy)
print('Accuracy per feature: %.2f' % (accuracy / PCA_comps))

Accuracy: 0.85
Accuracy per feature: 0.85
