<a href="https://colab.research.google.com/github/Andrei-WongE/ML_sandbox/blob/origin/Class_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Mount Drive


In [None]:
# Import the drive helper from the google.colab package
from google.colab import drive

# Mount the drive at /content/drive; the project paths in the next cell
# are built from this same location
drive.mount('/content/drive')

Mounted at /content/drive


# Change working directory

In [None]:
# Standard-library os module, used below to create the project directory
import os

# Drive mount point (same path passed to drive.mount) and the project
# folder located beneath it
mount = '/content/drive'
drive_root = mount + "/My Drive/Colab Notebooks/Applied ML"

# Create drive_root if it doesn't exist
# (the flag is hard-coded to True; set it to False to skip the makedirs call)
create_drive_root = True
if create_drive_root:
  print("\nColab: making sure ", drive_root, " exists.")
  # exist_ok=True: do not raise if the directory is already present
  os.makedirs(drive_root, exist_ok=True)

# Change to the directory
# %cd is an IPython magic; $drive_root expands the Python variable set above
print("\nColab: Changing directory to ", drive_root)
%cd $drive_root


Colab: making sure  /content/drive/My Drive/Colab Notebooks/Applied ML  exists.

Colab: Changing directory to  /content/drive/My Drive/Colab Notebooks/Applied ML
/content/drive/My Drive/Colab Notebooks/Applied ML


In [None]:
# Check wd: run the shell's pwd to confirm the working directory
# after the %cd in the previous cell
!pwd

/content/drive/My Drive/Colab Notebooks/Applied ML


# Feature selection

Three benefits of performing feature selection before
modelling your data are: <br>
• Reduces Overfitting: Less redundant data means less opportunity to make decisions based
on noise.<br>
• Improves Accuracy: Less misleading data means modelling accuracy improves.<br>
• Reduces Training Time: Less data means that algorithms train faster

## Univariate Selection

In [None]:
# Univariate feature selection: score every input column with a chi-squared
# test against the class label and keep the four highest-scoring columns.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.feature_selection import SelectKBest, chi2

# Read the CSV, supplying the column names explicitly
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

# Columns 0-7 are the inputs (X); column 8 is the target (Y).
# X and Y are reused by the cells further down.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Fit the selector: chi2 scoring, keep the best 4 features
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# Per-feature scores, printed with two decimals
# (set_printoptions changes numpy's print precision for later cells too)
set_printoptions(precision=2)
print(fit.scores_)

# Reduce X to the selected columns and preview the first five rows
features = fit.transform(X)
print(features[:5, :])

[ 111.52 1411.89   17.61   53.11 2175.57  127.67    5.39  181.3 ]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 ...
 [121.  112.   26.2  30. ]
 [126.    0.   30.1  47. ]
 [ 93.    0.   30.4  23. ]]


## Recursive Feature Elimination

In [None]:
# Recursive Feature Elimination (RFE) wrapped around logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base estimator, then the eliminator configured to end with 3 features,
# using step=1
model = LogisticRegression(solver='liblinear')
rfe = RFE(estimator=model, n_features_to_select=3, step=1)
fit = rfe.fit(X, Y)

# Report how many features were kept, the boolean selection mask,
# and the ranking assigned to every feature
for template, value in (
    ("Num Features: %d", fit.n_features_),
    ("Selected Features: %s", fit.support_),
    ("Feature Ranking: %s", fit.ranking_),
):
    print(template % value)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


## Data Reduction using Principal Component Analysis

In [None]:
# Principal Component Analysis: fit a 3-component PCA on the inputs X
from sklearn.decomposition import PCA

# Fit the decomposition (X comes from the data-loading cell)
pca = PCA(n_components=3)
fit = pca.fit(X)

# Share of variance explained by each component, followed by the
# component vectors themselves
variance_ratio = fit.explained_variance_ratio_
print("Explained Variance: %s" % variance_ratio)
print(fit.components_)

[[-2.02e-03  9.78e-02  1.61e-02  6.08e-02  9.93e-01  1.40e-02  5.37e-04
  -3.56e-03]
 [-2.26e-02 -9.72e-01 -1.42e-01  5.79e-02  9.46e-02 -4.70e-02 -8.17e-04
  -1.40e-01]
 [-2.25e-02  1.43e-01 -9.22e-01 -3.07e-01  2.10e-02 -1.32e-01 -6.40e-04
  -1.25e-01]]


## Feature Importance

In [None]:
# Feature Importance with Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier

# Extra Trees is a randomized ensemble: without a fixed random_state the
# importances printed below differ on every run, so the saved output could
# never be reproduced with Restart & Run All. Pin the seed, and state the
# forest size explicitly instead of relying on the library default (which
# has changed between scikit-learn releases).
model = ExtraTreesClassifier(n_estimators=100, random_state=7)

# X and Y come from the data-loading cell
model.fit(X, Y)

# One importance score per input column, in the same order as the columns of X
print(model.feature_importances_)

[0.11 0.24 0.1  0.08 0.07 0.14 0.12 0.14]


# Resampling

# Split into Train and Test Sets
The size of the split can depend on the size and specifics of
your dataset, although it is common to use 67% of the data for training and the remaining 33% for
testing.

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out configuration: 33% of the rows go to the test set, fixed seed
test_size = 0.33
seed = 876
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Fit logistic regression on the training portion
# (LogisticRegression is imported in the RFE cell above)
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)

# Score on the held-out portion and report it as a percentage
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

Accuracy: 79.528%


# K-fold Cross-Validation
The algorithm is trained on k − 1 folds with one held back and tested on the held back fold. This is repeated so that each fold of the dataset is given a chance to be the held back test set. After running cross-validation you end up with k different performance scores that you can summarise using a mean and a standard deviation.

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Configuration: 10 shuffled folds with a fixed seed
num_folds = 10
seed = 978

# Splitter and estimator
# (LogisticRegression is imported in the RFE cell above)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression(solver='liblinear')

# One score per fold; report mean and standard deviation as percentages
results = cross_val_score(model, X, Y, cv=kfold)
mean_pct, std_pct = results.mean()*100.0, results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Accuracy: 76.951% (4.841%)


# Leave One Out Cross-Validation
Note: LeaveOneOut() is equivalent to KFold(n_splits=n) and LeavePOut(p=1) where n is the number of samples.

Due to the high number of test sets (which is the same as the number of samples) this cross-validation method can be very costly. For large datasets one should favor KFold, ShuffleSplit or StratifiedKFold.

In [None]:
from sklearn.model_selection import LeaveOneOut

# Splitter and estimator
# (LogisticRegression and cross_val_score are imported in earlier cells)
loocv = LeaveOneOut()
model = LogisticRegression(solver='liblinear')

# One score per split; report mean and standard deviation as percentages
results = cross_val_score(model, X, Y, cv=loocv)
mean_pct, std_pct = results.mean()*100.0, results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Accuracy: 76.823% (42.196%)


# Repeated Random Test-Train Splits

In [None]:
from sklearn.model_selection import ShuffleSplit

# Configuration: 10 random splits, each holding out 33% of the rows
n_splits = 10
test_size = 0.33
seed = 7

# Splitter and estimator. The splitter keeps the name `kfold` used by the
# original cell even though it is a ShuffleSplit, not a KFold.
# (LogisticRegression and cross_val_score are imported in earlier cells)
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression(solver='liblinear')

# One score per split; report mean and standard deviation as percentages
results = cross_val_score(model, X, Y, cv=kfold)
mean_pct, std_pct = results.mean()*100.0, results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Accuracy: 76.535% (1.691%)
