In [1]:
# Importing modules
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from statistics import mean, stdev

<div class="alert alert-block alert-success">
<b>Part one</b> </div>

In [3]:
# Row labels of the report table, one per evaluated feature set
labels = [
    'Full set',
    '95% variance set',
    '25 PCA set',
    '16 PCA set',
    '9 PCA set',
    '4 PCA set',
]

In [5]:
def TransformDataForTable(accuracies):
    """Summarise every score sequence as a (mean, sample standard deviation) row.

    Parameters
    ----------
    accuracies : iterable of sequences of numbers
        One sequence of scores per table row. ``statistics.stdev`` requires
        at least two values in each sequence.

    Returns
    -------
    numpy.ndarray
        One row per input sequence; column 0 holds the mean and column 1 the
        sample standard deviation, both rounded to two decimals.
    """
    # Single pass over the input so mean/stdev are evaluated sequence by sequence
    summary_rows = [
        (round(mean(scores), 2), round(stdev(scores), 2))
        for scores in accuracies
    ]

    averages = [average for average, _ in summary_rows]
    deviations = [deviation for _, deviation in summary_rows]

    # Stack as [averages, deviations] and flip so each sequence becomes a row
    return np.transpose([averages, deviations])

In [6]:
def ReportConstructor(accuracies):
    """Print a table with the average accuracy and standard deviation per feature set.

    Parameters
    ----------
    accuracies : iterable of sequences of numbers
        One sequence of accuracies per entry of the module-level ``labels``
        list, in the same order; the labels become the row index.
    """
    summary = TransformDataForTable(accuracies)
    report = pd.DataFrame(
        summary,
        index=pd.Index(labels),
        columns=['Average Accuracy', 'Standard Deviation'],
    )
    print(report)

In [7]:
# One (initially empty) list of per-run accuracies for each feature set
accuracy_full, accuracy_95p, accuracy_25, accuracy_16, accuracy_9, accuracy_4 = (
    [] for _ in range(6)
)

In [8]:
# Read the data and split it into two variables:
# x - the remaining feature columns, y - the 'Class' label column.
# The 'CLID' column is dropped and not used as a feature.
dataset = pd.read_csv('breast_cancer_genomic.csv')
del dataset['CLID']

# pop() removes 'Class' from the frame and hands back the column in one step
y = dataset.pop('Class').values
x = dataset.values

NameError: name 'pd' is not defined

In [8]:
# Initialize the PCA reducers needed in the next steps: one configured with
# n_components=0.95 (the '95% variance set') and four with a fixed component count.
pca_95p = PCA(n_components=0.95)
pca_25, pca_16, pca_9, pca_4 = (
    PCA(n_components=component_count) for component_count in (25, 16, 9, 4)
)

In [9]:
# Initializing the RandomForest classifier and the GridSearchCV that tunes it.
#
# Both hyper-parameters live in ONE grid so that every combination of
# n_estimators and criterion is evaluated. Putting them in two separate dicts
# makes GridSearchCV explore the grids independently, i.e. 'criterion' would
# only ever be tried with the default number of trees (and vice versa).
rf_clf_params = [{
    'n_estimators': [10, 50, 100, 200, 400, 800, 1000],
    'criterion': ['gini', 'entropy'],
}]

rf_clf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_clf_params, scoring='accuracy')

In [10]:
# In this part of the program we execute the runs needed to estimate the model
# accuracy: every run draws a fresh stratified 80/20 split and records, for each
# feature set, the test accuracy of the random forest chosen by the grid search.
N_RUNS = 10

# (reducer, accuracy list) pairs; None means "use the unreduced features"
feature_sets = [
    (None, accuracy_full),
    (pca_95p, accuracy_95p),
    (pca_25, accuracy_25),
    (pca_16, accuracy_16),
    (pca_9, accuracy_9),
    (pca_4, accuracy_4),
]

# Clear the result lists first so re-running this cell does not pile new
# accuracies on top of the ones from a previous execution
for _, run_accuracies in feature_sets:
    run_accuracies.clear()

for _ in range(N_RUNS):
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2)

    for reducer, run_accuracies in feature_sets:
        if reducer is None:
            features_train, features_test = x_train, x_test
        else:
            # The reduction is fitted on the training split only and then
            # applied to the test split, so the test data never influences
            # the projection
            features_train = reducer.fit_transform(x_train)
            features_test = reducer.transform(x_test)

        # With the default refit=True, GridSearchCV retrains the best parameter
        # setting on the whole training split, so best_estimator_ is ready to
        # be scored without another fit
        grid_search.fit(features_train, y_train)
        run_accuracies.append(grid_search.best_estimator_.score(features_test, y_test))

In [11]:
# Gather the per-set accuracy lists, ordered like the rows of the report
# (full set first, then the PCA sets from most to fewest components)
accuracies = [
    accuracy_full,
    accuracy_95p,
    accuracy_25,
    accuracy_16,
    accuracy_9,
    accuracy_4,
]

In [12]:
# Print the average accuracy and standard deviation of the runs for each feature set
ReportConstructor(accuracies)

                  Average Accuracy  Standard Deviation
Full set                      0.81                0.05
95% variance set              0.81                0.04
25 PCA set                    0.87                0.05
16 PCA set                    0.90                0.03
9 PCA set                     0.90                0.02
4 PCA set                     0.90                0.05


In [27]:
## PART TWO ##

In [17]:
# First we need to transform the data: class 4 becomes 1, every other class becomes 0.
# The labels are overwritten in place, so this cell must run only once per
# loaded data set - on a second run no 4s are left and every label turns into 0.
y[:] = np.where(y == 4, 1, 0)