In [1]:
# from pandas import read_csv
import pandas as pd
# Column labels are supplied explicitly to read_csv: eight predictors followed
# by the column that the later cells use as the target (index 8).
names = [
    'timesPreg',
    'plasmaConcenTest',
    'bloodPres',
    'skinFoldThickness',
    '2HSerumInsulin',
    'BMI',
    'diabetesPedigree',
    'age',
    'onsetOfDiabetesbtw5yrs',
]
filename = '../Week 1/pima-indians-diabetes.data.csv'

df = pd.read_csv(filename, names=names)

# Preview the first two rows to confirm the columns line up with `names`.
df.head(2)

Unnamed: 0,timesPreg,plasmaConcenTest,bloodPres,skinFoldThickness,2HSerumInsulin,BMI,diabetesPedigree,age,onsetOfDiabetesbtw5yrs
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [2]:
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np

# Univariate feature selection with SelectKBest, scored by chi-squared, k=4.

# Split the frame's values into predictors (first 8 columns) and target (column 8).
array = df.values
X, Y = array[:, :8], array[:, 8]

# `fit` is the fitted selector; the following cells query it via get_support().
selector = SelectKBest(score_func=chi2, k=4)
fit = selector.fit(X, Y)

# Global numpy print setting: every array printed from here on shows 3 decimals.
np.set_printoptions(precision=3)

# Reduce X to the selected columns and show the first five rows.
features = fit.transform(X)
print(features[:5, :])


[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [3]:
# Compare the number of feature columns before and after the selection step.
n_features_before = X.shape[1]
n_features_after = features.shape[1]
print("Shape before", n_features_before)
print("Shape after", n_features_after)

Shape before 8
Shape after 4


In [4]:
# Column labels in file order; positions 0-7 line up with the columns of X.
names = [
    'timesPreg',
    'plasmaConcenTest',
    'bloodPres',
    'skinFoldThickness',
    '2HSerumInsulin',
    'BMI',
    'diabetesPedigree',
    'age',
    'onsetOfDiabetesbtw5yrs',
]

# Integer positions of the columns the fitted selector kept, mapped back to labels.
selected_indices = fit.get_support(indices=True)
selected_feature_names = [names[idx] for idx in selected_indices]
print("Selected feature names:", selected_feature_names)

Selected feature names: ['plasmaConcenTest', '2HSerumInsulin', 'BMI', 'age']


In [5]:
# Labels of the first eight dataframe columns (the predictors passed to the selector).
X_names = df.columns[:8]

# One row per predictor, flagged with whether the fitted selector kept it.
kept_features = pd.DataFrame({
    'columns': X_names,
    'Kept': fit.get_support(),
})
kept_features

Unnamed: 0,columns,Kept
0,timesPreg,False
1,plasmaConcenTest,True
2,bloodPres,False
3,skinFoldThickness,False
4,2HSerumInsulin,True
5,BMI,True
6,diabetesPedigree,False
7,age,True


In [6]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Feature selection using RFE (Recursive Feature Elimination) around a
# liblinear logistic regression, keeping 3 features.

# Predictors are the first 8 columns, the target is column 8.
array = df.values
X, Y = array[:, :8], array[:, 8]

model = LogisticRegression(solver='liblinear')
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# Report how many features were kept, the boolean keep-mask and the ranking.
for template, value in (
    ("Num Features: %d", fit.n_features_),
    ("Selected Features: %s", fit.support_),
    ("Feature Ranking: %s", fit.ranking_),
):
    print(template % value)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


In [7]:
# Labels of the first eight dataframe columns (the predictors passed to RFE).
X_names = df.columns[:8]

# One row per predictor: whether RFE kept it and the rank RFE assigned to it.
kept_features = pd.DataFrame({
    'columns': X_names,
    'Kept': fit.support_,
    'Feature rank': fit.ranking_,
})
kept_features

Unnamed: 0,columns,Kept,Feature rank
0,timesPreg,True,1
1,plasmaConcenTest,False,2
2,bloodPres,False,3
3,skinFoldThickness,False,5
4,2HSerumInsulin,False,6
5,BMI,True,1
6,diabetesPedigree,True,1
7,age,False,4


In [8]:
# Data Reduction using PCA
from sklearn.decomposition import PCA

# Predictors are the first 8 columns; Y is not used by PCA but is still
# assigned so the cell leaves the same variables defined.
array = df.values
X, Y = array[:, :8], array[:, 8]

# Fit a 3-component PCA on the predictors.
pca = PCA(n_components=3)
fit = pca.fit(X)

# NOTE(review): X is passed in unscaled; in the recorded output the first
# component loads almost entirely on column index 4, which suggests the
# result is driven by that column's scale - consider standardising first.
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


In [9]:
# Feature Importance
from sklearn.ensemble import ExtraTreesClassifier

# Predictors are the first 8 columns, the target is column 8.
array = df.values
X = array[:, 0:8]
Y = array[:, 8]

# ExtraTreesClassifier is randomised; without a fixed random_state the
# importances differ on every run. Use the same seed value as the
# evaluation cells so a Restart & Run All reproduces the printed numbers.
seed = 7
model = ExtraTreesClassifier(random_state=seed).fit(X, Y)

# One importance score per predictor, in column order.
print(model.feature_importances_)

[0.112 0.235 0.098 0.079 0.075 0.14  0.117 0.144]


In [10]:
# Sanity check: add up the per-feature importances of the fitted model
# (the recorded output for this cell is 1.0).
sum(model.feature_importances_)

1.0

In [11]:
# Train and test sets

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Predictors are the first 8 columns, the target is column 8.
array = df.values
X = array[:, 0:8]
Y = array[:, 8]

test_size = 0.33
seed = 7

# Hold out `test_size` of the rows for evaluation; the seed fixes the split.
# The held-out predictors are named X_test (previously misspelled X_text)
# so they pair visibly with Y_test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# Fit on the training rows only and score on the held-out rows.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

Accuracy: 75.591%


In [12]:
# K-fold Cross Validation

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Predictors are the first 8 columns, the target is column 8.
array = df.values
X, Y = array[:, :8], array[:, 8]

# Shuffled 10-fold split; the fixed seed keeps the fold assignment repeatable.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=kfold)

# Mean and standard deviation of the per-fold scores, as percentages.
mean_pct = results.mean() * 100.0
std_pct = results.std() * 100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Accuracy: 77.086% (5.091%)


In [13]:
# Leave one out cross validation

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Predictors are the first 8 columns, the target is column 8.
array = df.values
X, Y = array[:, :8], array[:, 8]

loocv = LeaveOneOut()
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=loocv)

# Each leave-one-out fold is scored on a single row, so every entry of
# `results` is 0 or 1; the large std printed below follows from that and
# is not a measure of how stable the model is.
mean_pct = results.mean() * 100.0
std_pct = results.std() * 100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Accuracy: 76.823% (42.196%)


In [14]:
# Repeated Random test train splits

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Build X and Y in this cell, like every other evaluation cell, instead of
# relying on whatever values an earlier cell happened to leave behind.
array = df.values
X = array[:, 0:8]
Y = array[:, 8]

n_splits = 10
test_size = 0.33
seed = 7

# Ten independent random train/test splits, each holding out `test_size`
# of the rows. Named for what it is: a ShuffleSplit, not a KFold.
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=shuffle_split)

# Mean and standard deviation of the per-split scores, as percentages.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 76.496% (1.698%)
