# Chapter 14

# Automate Machine Learning
# Workflows with Pipelines

14.2 Data Preparation and Modeling Pipeline

The example below demonstrates this important data preparation and model evaluation work flow on thePima Indians onset of diabetes dataset. The pipeline is de ned with two steps:

1.Standardize the data

2.Learn a Linear Discriminant Analysis model

The pipeline is then evaluated using 10-fold cross validation

In [59]:
import warnings
warnings.filterwarnings("ignore")

In [60]:
 # Create a pipeline that standardizes the data then creates a model
 from pandas import read_csv
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 # load data
 filename = 'pima-indians-diabetes.csv'
 names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
 dataframe = read_csv(filename, names=names)
 array = dataframe.values
 X = array[:,0:8]
 Y = array[:,8]
 # create pipeline
 estimators = []
 estimators.append(('standardize', StandardScaler()))
 estimators.append(('lda', LinearDiscriminantAnalysis()))
 model = Pipeline(estimators)
 # evaluate pipeline
 kfold = KFold(n_splits=10,random_state=7,shuffle=True)
 results = cross_val_score(model, X, Y, cv=kfold)
 print(results.mean()*100)

76.69685577580316


In [61]:
 from pandas import read_csv
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 # load data
 filename = 'pima-indians-diabetes.csv'
 names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
 dataframe = read_csv(filename, names=names)
 array = dataframe.values
 X = array[:,0:8]
 Y = array[:,8]
 # create pipeline
 estimators = []
 estimators.append(('standardize', StandardScaler()))
 estimators.append(('lda', LinearDiscriminantAnalysis()))
 model = Pipeline(estimators)
 # evaluate pipeline
 kfold = KFold(n_splits=10)
 results = cross_val_score(model, X, Y, cv=kfold)
 print(results.mean()*100)

77.3462064251538


In [62]:
 # Create a pipeline that extracts features from the data then creates a model
 from pandas import read_csv
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import FeatureUnion
 from sklearn.linear_model import LogisticRegression
 from sklearn.decomposition import PCA
 from sklearn.feature_selection import SelectKBest
 import warnings
 warnings.filterwarnings("ignore")
 # load data
 filename = 'pima-indians-diabetes.csv'
 names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
 dataframe = read_csv(filename, names=names)
 array = dataframe.values
 X = array[:,0:8]
 Y = array[:,8]
 # create feature union
 features = []
 features.append(('pca', PCA(n_components=2)))
 features.append(('select_best', SelectKBest(k=7)))
 feature_union = FeatureUnion(features)
 # create pipeline
 estimators = []
 estimators.append(('feature_union', feature_union))
 estimators.append(('logistic', LogisticRegression()))
 model = Pipeline(estimators)
 # evaluate pipeline
 kfold = KFold(n_splits=10)
 results = cross_val_score(model, X, Y, cv=kfold)
 print(results.mean()*100)

77.3444976076555


**Using Regression Algorithm**

In [63]:
import pandas as pd

In [64]:
df=pd.read_csv('HousingData (Boston housing dataset).csv')

In [65]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [66]:
df.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [67]:
columns=['CRIM','ZN','INDUS','CHAS','AGE','LSTAT']

In [69]:
for i in columns:
    df[i].fillna(df[i].mean(),inplace=True)

In [73]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [70]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.200000,4.0900,1,296,15.3,396.90,4.980000,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.900000,4.9671,2,242,17.8,396.90,9.140000,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.100000,4.9671,2,242,17.8,392.83,4.030000,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.800000,6.0622,3,222,18.7,394.63,2.940000,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.200000,6.0622,3,222,18.7,396.90,12.715432,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.100000,2.4786,1,273,21.0,391.99,12.715432,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.700000,2.2875,1,273,21.0,396.90,9.080000,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.000000,2.1675,1,273,21.0,396.90,5.640000,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.300000,2.3889,1,273,21.0,393.45,6.480000,22.0


In [71]:
 # Create a pipeline that standardizes the data then creates a model
 from pandas import read_csv
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import LinearRegression
 # load data
 #filename = 'housing.csv'
 #names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX']
 #dataframe = read_csv(filename, delim_whitespace=True, names=names)
 array = df.values
 X = array[:,1:]
 Y = array[:,0]
 # create pipeline
 estimators = []
 estimators.append(('standardize', StandardScaler()))
 estimators.append(('LR', LinearRegression()))
 model = Pipeline(estimators)
 # evaluate pipeline
 kfold = KFold(n_splits=10)
 scoring = 'neg_mean_squared_error'
 results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
 print(results.mean())

-55.801217713907725


# 14.3 Feature Extraction and Modeling Pipeline

1. Feature Extraction with Principal Component Analysis (3 features).

2. Feature Extraction with Statistical Selection (6 features).

3. Feature Union.

  4.Learn a Logistic Regression Model.

In [72]:
 # Create a pipeline that extracts features from the data then creates a model
 from pandas import read_csv
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import FeatureUnion
 from sklearn.linear_model import LogisticRegression
 from sklearn.decomposition import PCA
 from sklearn.feature_selection import SelectKBest
 # load data
 filename = 'pima-indians-diabetes.csv'
 names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
 dataframe = read_csv(filename, names=names)
 array = dataframe.values
 X = array[:,0:8]
 Y = array[:,8]
 # create feature union
 features = []
 features.append(('pca', PCA(n_components=3)))
 features.append(('select_best', SelectKBest(k=6)))
 feature_union = FeatureUnion(features)
 # create pipeline
 estimators = []
 estimators.append(('feature_union', feature_union))
 estimators.append(('logistic', LogisticRegression()))
 model = Pipeline(estimators)
 # evaluate pipeline
 kfold = KFold(n_splits=10, random_state=7,shuffle=True)
 results = cross_val_score(model, X, Y, cv=kfold)
 print(results.mean()*100)

77.08646616541353


**Using Regression Algorithm**

In [74]:
 # Create a pipeline that extracts features from the data then creates a model
 from pandas import read_csv
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import FeatureUnion
 from sklearn.linear_model import LinearRegression
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.decomposition import PCA
 from sklearn.feature_selection import SelectKBest
 # load data
 array = df.values
 X = array[:,1:]
 Y = array[:,0]
 # create feature union
 features = []
 features.append(('pca', PCA(n_components=3)))
 features.append(('select_best', SelectKBest(k=6)))
 feature_union = FeatureUnion(features)
 # create pipeline
 estimators = []
 estimators.append(('feature_union', feature_union))
 estimators.append(('LR', KNeighborsRegressor()))
 model = Pipeline(estimators)
 # evaluate pipeline
 scoring = 'neg_mean_squared_error'
 results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
 print(results.mean())

-50.837219290340336
