# Chapter 14

# Automate Machine Learning Workflows with Pipelines

## Data Preparation and Modeling Pipeline

In [1]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Load the dataset and split the underlying array into inputs and target.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values
X, Y = array[:, 0:8], array[:, 8]  # columns 0-7 are the inputs, column 8 is the target

# Cross-validation settings.
num_folds = 10
seed = 7

# Pipeline steps, applied in order: standardize the inputs, then fit the LDA model.
estimator = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimator)

# Evaluate the whole pipeline with shuffled k-fold cross-validation.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())  # mean accuracy across the folds

0.7669685577580315


## Feature Extraction and Modeling Pipeline

In [2]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
# Load the dataset and split the underlying array into inputs and target.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 columns (inputs)
Y = array[:,8] # column 8 (target)
num_folds = 10
seed = 7
# Feature extraction: two transformers whose outputs are combined by FeatureUnion.
features = []
features.append(('pca', PCA(n_components=3))) # 3 principal components
features.append(('select_best', SelectKBest(k=6))) # 6 best-scoring input columns
feature_union = FeatureUnion(features) # combine the features
# Pipeline steps, applied in order: feature extraction, then the classifier.
estimators = []
estimators.append(('feature_union', feature_union)) # feature extraction step (no standardization here)
estimators.append(('logistic', LogisticRegression(solver='liblinear'))) # create the model
model = Pipeline(estimators) # create the pipeline
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True) # create k-fold cross-validation
results = cross_val_score(model, X, Y, cv=kfold) # evaluate the model
print(results.mean()) # print the mean accuracy

0.7721633629528366
