In [547]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from datetime import datetime
from sklearn.datasets import load_breast_cancer
main_start = datetime.now()
%matplotlib inline

In [548]:
raw_data = load_breast_cancer()

In [549]:
data = np.c_[raw_data.data, raw_data.target]
columns = np.append(raw_data.feature_names, ["target"])
df = pd.DataFrame(data, columns=columns)

df.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0.0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,0.0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0.0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0.0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0.0


In [550]:
df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [551]:
# Benign = 1.0, Malignant = 0.0
df['target'].value_counts()

1.0    357
0.0    212
Name: target, dtype: int64

In [552]:
X = df.drop(['target'], 1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(y_train.value_counts())
print('\n')
print(y_test.value_counts())


1.0    236
0.0    145
Name: target, dtype: int64


1.0    121
0.0     67
Name: target, dtype: int64


In [553]:
#Set up our first X using PCA
pca = PCA(n_components=6)
X_std_pca = pca.fit_transform(X_train)
X1 = pd.DataFrame(X_std_pca)

#Now for the Test
pca = PCA(n_components=6)
X_std_pca = pca.fit_transform(X_test)
X1_test = pd.DataFrame(X_std_pca)


In [554]:
start = datetime.now()
#Use Lasso/RFE with Cross Validation to get the best features for our second X
lr = LogisticRegression(C=1, penalty='l1')

#Set up our X,y
X = X_train
y = y_train

#Set up our Recursive Feature Elimination
rfe = RFECV(lr,cv=5) 
fit = rfe.fit(X,y)
result_RFE = pd.DataFrame(list(zip(X.head(0), rfe.ranking_, rfe.support_)),
                          columns=['Features','Ranking','Support'] ) 
print(result_RFE.sort_values('Ranking'))
print('It took the following time to complete this task:', datetime.now() - start)

                   Features  Ranking  Support
0               mean radius        1     True
21            worst texture        1     True
11            texture error        1     True
13               area error        1     True
26          worst concavity        1     True
1              mean texture        1     True
2            mean perimeter        1     True
22          worst perimeter        2    False
3                 mean area        3    False
23               worst area        4    False
25        worst compactness        5    False
18           symmetry error        6    False
17     concave points error        7    False
16          concavity error        8    False
7       mean concave points        9    False
6            mean concavity       10    False
5          mean compactness       11    False
4           mean smoothness       12    False
19  fractal dimension error       13    False
8             mean symmetry       14    False
20             worst radius       

In [555]:
true_values = result_RFE[result_RFE.Support == True]
feature_list = []
for g in true_values['Features']:
    feature_list.append(g)
X2 = X_train[feature_list]
X2_test = X_test[feature_list]

In [556]:
# Let's use Random Forest to select features down to 30.
start = datetime.now()
X = X_train
y = y_train
clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
df2 = X.columns.get_values()
feat_labels = df2.tolist()

# Train the classifier
clf.fit(X, y)
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
for feature in zip(X, clf.feature_importances_):
    print(feature)

print('It took the following time to complete this task:', datetime.now() - start)

('mean radius', 0.037276174461743777)
('mean texture', 0.013247063371622285)
('mean perimeter', 0.048953902560322587)
('mean area', 0.036048174413680072)
('mean smoothness', 0.0055517841731483651)
('mean compactness', 0.014192584400808787)
('mean concavity', 0.067550378387953885)
('mean concave points', 0.12839577373170524)
('mean symmetry', 0.0037155100099138597)
('mean fractal dimension', 0.0047218697364518952)
('radius error', 0.017646293451140868)
('texture error', 0.0043445402294177982)
('perimeter error', 0.012159365770607826)
('area error', 0.027468926014620119)
('smoothness error', 0.0052989711277984484)
('compactness error', 0.0047002931921151583)
('concavity error', 0.0065176802329209981)
('concave points error', 0.0041956683448790702)
('symmetry error', 0.0049424005406347431)
('fractal dimension error', 0.0056106882339846467)
('worst radius', 0.091631584002856564)
('worst texture', 0.017915987085504213)
('worst perimeter', 0.11349576554941122)
('worst area', 0.09155775956712

In [557]:
start = datetime.now()
sfm = SelectFromModel(clf, threshold=0.05)
rfcfeature_list =[]
# Train the selector
sfm.fit(X, y)
for feature_list_index in sfm.get_support(indices=True):
    rfcfeature_list.append(feat_labels[feature_list_index])
    print(feat_labels[feature_list_index])
X3 = X_train[rfcfeature_list]
X3_test = X_test[rfcfeature_list]
    
print('Number of features in this list: {}'.format(len(rfcfeature_list)))
print('It took the following time to complete this task:', datetime.now() - start)

mean concavity
mean concave points
worst radius
worst perimeter
worst area
worst concave points
Number of features in this list: 6
It took the following time to complete this task: 0:00:02.016806


In [558]:
#Make a list to store our results for a comparison at the end
results = pd.DataFrame(index=range(21))
results['Model'] = ['Logistic Regression',
                    'Logistic Regression',
                    'Logistic Regression',
                    'Lasso Regression',
                    'Lasso Regression',
                    'Lasso Regression',
                    'Ridge Regression',
                    'Ridge Regression',
                    'Ridge Regression',
                    'KNN', 'KNN', 'KNN',
                    'SVC', 'SVC', 'SVC',
                    'Random Forest',
                    'Random Forest',
                    'Random Forest',
                    'Gradient Booster',
                    'Gradient Booster',
                    'Gradient Booster',
                    ]
results['Feature Selection'] = ['PCA', 'RFECV', 'Random Forest',
                               'PCA', 'RFECV', 'Random Forest',
                               'PCA', 'RFECV', 'Random Forest',
                               'PCA', 'RFECV', 'Random Forest',
                               'PCA', 'RFECV', 'Random Forest',
                               'PCA', 'RFECV', 'Random Forest',
                               'PCA', 'RFECV', 'Random Forest']


Train_Score = []
Test_Score = []
Cross_Validation = []
fold1 = []
fold2 = []
fold3 = []
fold4 = []
fold5 = []

In [559]:
start = datetime.now()
#Let's compare our features in various models starting with Logistic Regression
lr = LogisticRegression(C=9e9)
#Fit the model
lr.fit(X1, y)
score1 = lr.score(X1, y)
Train_Score.append(score1)
lr.fit(X1_test, y_test)
score2 = lr.score(X1_test, y_test)
Test_Score.append(score2)
lr.fit(X2, y)
score3 = lr.score(X2, y)
Train_Score.append(score3)
lr.fit(X2_test, y_test)
score4 = lr.score(X2_test, y_test)
Test_Score.append(score4)
lr.fit(X3, y)
score5 = lr.score(X3, y)
Train_Score.append(score5)
lr.fit(X3_test, y_test)
score6 = lr.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(lr, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(lr, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(lr, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
0.94750656168

R-squared for the test set with PCA:
0.994680851064

Cross Validation Score with 5 folds with PCA:
[ 0.93506494  0.93421053  0.97368421  0.92105263  0.92105263]

R-squared for the training set with RFECV generated features:
0.971128608924

R-squared for the test set with RFECV generated features:
0.968085106383

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.97402597  0.96052632  0.96052632  0.97368421  0.92105263]

R-squared for the training set with Random Forest generated features:
0.944881889764

R-squared for the test set with Random Forest generated features:
0.984042553191

Cross Validation Score with 5 folds with PCA:
[ 0.94805195  0.93421053  0.93421053  0.93421053  0.92105263]
It took the following time to complete this task: 0:00:00.083768


In [560]:
start = datetime.now()
#Let's pop those into the model
lr = LogisticRegression(C=1, penalty='l1')
#Fit the model
lr.fit(X1, y)
score1 = lr.score(X1, y)
Train_Score.append(score1)
lr.fit(X1_test, y_test)
score2 = lr.score(X1_test, y_test)
Test_Score.append(score2)
lr.fit(X2, y)
score3 = lr.score(X2, y)
Train_Score.append(score3)
lr.fit(X2_test, y_test)
score4 = lr.score(X2_test, y_test)
Test_Score.append(score4)
lr.fit(X3, y)
score5 = lr.score(X3, y)
Train_Score.append(score5)
lr.fit(X3_test, y_test)
score6 = lr.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(lr, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(lr, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(lr, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
0.94750656168

R-squared for the test set with PCA:
0.978723404255

Cross Validation Score with 5 folds with PCA:
[ 0.94805195  0.93421053  0.97368421  0.93421053  0.90789474]

R-squared for the training set with RFECV generated features:
0.96062992126

R-squared for the test set with RFECV generated features:
0.962765957447

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.93506494  0.97368421  0.98684211  0.97368421  0.88157895]

R-squared for the training set with Random Forest generated features:
0.926509186352

R-squared for the test set with Random Forest generated features:
0.952127659574

Cross Validation Score with 5 folds with PCA:
[ 0.90909091  0.90789474  0.92105263  0.88157895  0.88157895]
It took the following time to complete this task: 0:00:00.783735


In [561]:
start = datetime.now()
#Let's pop those into the model
lr = LogisticRegression(C=1, penalty='l2')
#Fit the model
lr.fit(X1, y)
score1 = lr.score(X1, y)
Train_Score.append(score1)
lr.fit(X1_test, y_test)
score2 = lr.score(X1_test, y_test)
Test_Score.append(score2)
lr.fit(X2, y)
score3 = lr.score(X2, y)
Train_Score.append(score3)
lr.fit(X2_test, y_test)
score4 = lr.score(X2_test, y_test)
Test_Score.append(score4)
lr.fit(X3, y)
score5 = lr.score(X3, y)
Train_Score.append(score5)
lr.fit(X3_test, y_test)
score6 = lr.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(lr, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(lr, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(lr, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
0.950131233596

R-squared for the test set with PCA:
0.984042553191

Cross Validation Score with 5 folds with PCA:
[ 0.93506494  0.93421053  0.97368421  0.92105263  0.92105263]

R-squared for the training set with RFECV generated features:
0.944881889764

R-squared for the test set with RFECV generated features:
0.946808510638

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.90909091  0.97368421  0.98684211  0.94736842  0.90789474]

R-squared for the training set with Random Forest generated features:
0.910761154856

R-squared for the test set with Random Forest generated features:
0.952127659574

Cross Validation Score with 5 folds with PCA:
[ 0.90909091  0.92105263  0.90789474  0.88157895  0.89473684]
It took the following time to complete this task: 0:00:00.063022


In [562]:
start = datetime.now()
neighbors = KNeighborsClassifier(n_neighbors=5)
#Fit the model
neighbors.fit(X1, y)
score1 = neighbors.score(X1, y)
Train_Score.append(score1)
neighbors.fit(X1_test, y_test)
score2 = neighbors.score(X1_test, y_test)
Test_Score.append(score2)
neighbors.fit(X2, y)
score3 = neighbors.score(X2, y)
Train_Score.append(score3)
neighbors.fit(X2_test, y_test)
score4 = neighbors.score(X2_test, y_test)
Test_Score.append(score4)
neighbors.fit(X3, y)
score5 = neighbors.score(X3, y)
Train_Score.append(score5)
neighbors.fit(X3_test, y_test)
score6 = neighbors.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(neighbors, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(neighbors, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(neighbors, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with Random Forest generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
0.92125984252

R-squared for the test set with PCA:
0.973404255319

Cross Validation Score with 5 folds with PCA:
[ 0.92207792  0.92105263  0.93421053  0.89473684  0.92105263]

R-squared for the training set with RFECV generated features:
0.926509186352

R-squared for the test set with RFECV generated features:
0.93085106383

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.8961039   0.89473684  0.97368421  0.88157895  0.88157895]

R-squared for the training set with Random Forest generated features:
0.913385826772

R-squared for the test set with Random Forest generated features:
0.952127659574

Cross Validation Score with 5 folds with Random Forest generated features:
[ 0.92207792  0.90789474  0.89473684  0.89473684  0.88157895]
It took the following time to complete this task: 0:00:00.076210


In [563]:
start = datetime.now()
#Now let's model with SVM
svc = SVC()
#Fit the model
svc.fit(X1, y)
score1 = svc.score(X1, y)
Train_Score.append(score1)
svc.fit(X1_test, y_test)
score2 = svc.score(X1_test, y_test)
Test_Score.append(score2)
svc.fit(X2, y)
score3 = svc.score(X2, y)
Train_Score.append(score3)
svc.fit(X2_test, y_test)
score4 = svc.score(X2_test, y_test)
Test_Score.append(score4)
svc.fit(X3, y)
score5 = svc.score(X3, y)
Train_Score.append(score5)
svc.fit(X3_test, y_test)
score6 = svc.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(svc, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(svc, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(svc, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with Random Forest generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
1.0

R-squared for the test set with PCA:
1.0

Cross Validation Score with 5 folds with PCA:
[ 0.62337662  0.61842105  0.61842105  0.61842105  0.61842105]

R-squared for the training set with RFECV generated features:
0.994750656168

R-squared for the test set with RFECV generated features:
1.0

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.63636364  0.61842105  0.64473684  0.64473684  0.64473684]

R-squared for the training set with Random Forest generated features:
0.992125984252

R-squared for the test set with Random Forest generated features:
0.994680851064

Cross Validation Score with 5 folds with Random Forest generated features:
[ 0.64935065  0.64473684  0.67105263  0.64473684  0.64473684]
It took the following time to complete this task: 0:00:00.163864


In [564]:
start = datetime.now()
#Now let's model with Random Forest
rfc = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, 
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', 
                                      max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, 
                                      bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, 
                                      warm_start=False, class_weight=None)

#Fit the model
rfc.fit(X1, y)
score1 = rfc.score(X1, y)
Train_Score.append(score1)
rfc.fit(X1_test, y_test)
score2 = rfc.score(X1_test, y_test)
Test_Score.append(score2)
rfc.fit(X2, y)
score3 = rfc.score(X2, y)
Train_Score.append(score3)
rfc.fit(X2_test, y_test)
score4 = rfc.score(X2_test, y_test)
Test_Score.append(score4)
rfc.fit(X3, y)
score5 = rfc.score(X3, y)
Train_Score.append(score5)
rfc.fit(X3_test, y_test)
score6 = rfc.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(rfc, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(rfc, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(rfc, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with Random Forest generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
1.0

R-squared for the test set with PCA:
1.0

Cross Validation Score with 5 folds with PCA:
[ 0.93506494  0.90789474  0.90789474  0.92105263  0.92105263]

R-squared for the training set with RFECV generated features:
0.994750656168

R-squared for the test set with RFECV generated features:
1.0

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.96103896  0.93421053  0.96052632  0.92105263  0.94736842]

R-squared for the training set with Random Forest generated features:
0.992125984252

R-squared for the test set with Random Forest generated features:
1.0

Cross Validation Score with 5 folds with Random Forest generated features:
[ 0.94805195  0.94736842  0.92105263  0.93421053  0.94736842]
It took the following time to complete this task: 0:00:00.373847


In [565]:
start = datetime.now()
#Now let's model with Gradient Boosting
clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.1, n_estimators=1000, subsample=0.75, 
                                          criterion='friedman_mse', min_samples_split=4, min_samples_leaf=1, 
                                          min_weight_fraction_leaf=0.0, max_depth=5, min_impurity_decrease=0.0, 
                                          min_impurity_split=None, init=None, random_state=None, max_features=None, 
                                          verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')

#Fit the model
clf.fit(X1, y)
score1 = clf.score(X1, y)
Train_Score.append(score1)
clf.fit(X1_test, y_test)
score2 = clf.score(X1_test, y_test)
Test_Score.append(score2)
clf.fit(X2, y)
score3 = clf.score(X2, y)
Train_Score.append(score3)
clf.fit(X2_test, y_test)
score4 = clf.score(X2_test, y_test)
Test_Score.append(score4)
clf.fit(X3, y)
score5 = clf.score(X3, y)
Train_Score.append(score5)
clf.fit(X3_test, y_test)
score6 = clf.score(X3_test, y_test)
Test_Score.append(score6)

# Inspect the results.
print('\nR-squared for the training set with PCA:')
print(score1)
print('\nR-squared for the test set with PCA:')
print(score2)
Cross_Validation0 = cross_val_score(clf, X1, y, cv=5)
print('\nCross Validation Score with 5 folds with PCA:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with RFECV generated features:')
print(score3)
print('\nR-squared for the test set with RFECV generated features:')
print(score4)
Cross_Validation0 = cross_val_score(clf, X2, y, cv=5)
print('\nCross Validation Score with 5 folds with RFECV generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('\nR-squared for the training set with Random Forest generated features:')
print(score5)
print('\nR-squared for the test set with Random Forest generated features:')
print(score6)
Cross_Validation0 = cross_val_score(clf, X3, y, cv=5)
print('\nCross Validation Score with 5 folds with Random Forest generated features:\n{}'.format(Cross_Validation0))
Cross_Validation.append(Cross_Validation0)
fold1.append(Cross_Validation0[0])
fold2.append(Cross_Validation0[1])
fold3.append(Cross_Validation0[2])
fold4.append(Cross_Validation0[3])
fold5.append(Cross_Validation0[4])
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the training set with PCA:
1.0

R-squared for the test set with PCA:
1.0

Cross Validation Score with 5 folds with PCA:
[ 0.98701299  0.93421053  0.94736842  0.94736842  0.89473684]

R-squared for the training set with RFECV generated features:
1.0

R-squared for the test set with RFECV generated features:
1.0

Cross Validation Score with 5 folds with RFECV generated features:
[ 0.94805195  0.93421053  0.94736842  0.96052632  0.92105263]

R-squared for the training set with Random Forest generated features:
1.0

R-squared for the test set with Random Forest generated features:
1.0

Cross Validation Score with 5 folds with Random Forest generated features:
[ 0.94805195  0.92105263  0.92105263  0.93421053  0.96052632]
It took the following time to complete this task: 0:00:09.417496


In [566]:
results['Train Score'] = Train_Score
results['Test Score'] = Test_Score
results['CV Fold 1'] = fold1
results['CV Fold 2'] = fold2
results['CV Fold 3'] = fold3
results['CV Fold 4'] = fold4
results['CV Fold 5'] = fold5
results.head(21)

Unnamed: 0,Model,Feature Selection,Train Score,Test Score,CV Fold 1,CV Fold 2,CV Fold 3,CV Fold 4,CV Fold 5
0,Logistic Regression,PCA,0.947507,0.994681,0.935065,0.934211,0.973684,0.921053,0.921053
1,Logistic Regression,RFECV,0.971129,0.968085,0.974026,0.960526,0.960526,0.973684,0.921053
2,Logistic Regression,Random Forest,0.944882,0.984043,0.948052,0.934211,0.934211,0.934211,0.921053
3,Lasso Regression,PCA,0.947507,0.978723,0.948052,0.934211,0.973684,0.934211,0.907895
4,Lasso Regression,RFECV,0.96063,0.962766,0.935065,0.973684,0.986842,0.973684,0.881579
5,Lasso Regression,Random Forest,0.926509,0.952128,0.909091,0.907895,0.921053,0.881579,0.881579
6,Ridge Regression,PCA,0.950131,0.984043,0.935065,0.934211,0.973684,0.921053,0.921053
7,Ridge Regression,RFECV,0.944882,0.946809,0.909091,0.973684,0.986842,0.947368,0.907895
8,Ridge Regression,Random Forest,0.910761,0.952128,0.909091,0.921053,0.907895,0.881579,0.894737
9,KNN,PCA,0.92126,0.973404,0.922078,0.921053,0.934211,0.894737,0.921053


In [567]:
results.describe()

Unnamed: 0,Train Score,Test Score,CV Fold 1,CV Fold 2,CV Fold 3,CV Fold 4,CV Fold 5
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,0.963755,0.979737,0.894249,0.888471,0.902882,0.884085,0.873434
std,0.032799,0.022354,0.110144,0.11123,0.111608,0.10743,0.101876
min,0.910761,0.930851,0.623377,0.618421,0.618421,0.618421,0.618421
25%,0.944882,0.962766,0.909091,0.907895,0.907895,0.881579,0.881579
50%,0.96063,0.984043,0.935065,0.934211,0.934211,0.921053,0.907895
75%,0.994751,1.0,0.948052,0.934211,0.973684,0.934211,0.921053
max,1.0,1.0,0.987013,0.973684,0.986842,0.973684,0.960526


SVC, Random Forest, and Gradient Booster all overfit the model, with SVC the worst offender with the lowest Cross Validation Scores across the board. KNN also showed signs of overfitting. 

Logistic Regression Models (including Lasso and Ridge) showed the most consistent results with PCA. Of these, Lasso performed slightly better than the other two models.