## Benefits of Scikit-learn pipelines

- They make your workflow much easier to read and understand.
- They enforce the implementation and order of steps in your project.
- These in turn make your work much more reproducible.

In [28]:
# import packages
import numpy as np 
import pandas as pd 
import sklearn 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score 
import warnings 
warnings.filterwarnings("ignore")

## Load Loan Dataset

In [29]:
# load train dataset 
train = pd.read_csv("data/loan/train.txt", sep=",")

# load test dataset 
test = pd.read_csv("data/loan/test.txt", sep=",")

In [30]:
# drop loan_ID column
train = train.drop('Loan_ID', axis=1) 

In [31]:
# show column dtypes
train.dtypes 

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In the dataset we have both categorical and numeric variables.One hot enconding transformation and some sort of scaler will be applied.

In [32]:
#split the data intro train and test set 
X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status'] 

X_train, X_test ,y_train, y_test = train_test_split(X,y , test_size = 0.2, random_state=42)

## Create Transformers 

In [33]:
# create numeric transformer 

numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# create categorical stransformer 

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', 
                              fill_value='missing')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])



## Apply Transformation by using ColumnTransformer  

In [34]:
# store a list of numeric columns 

numeric_features = train.select_dtypes(include=['int64','float64']).columns


# store a list of categorical columns 
categorical_features = train.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns


preprocessor = ColumnTransformer(
transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat',categorical_transformer, categorical_features)
])

## Fitting the Classifier 

In [35]:
# create a pipeline which combines the preprocessor created with a classifier 

pipe = Pipeline(steps=[('preprocessor',preprocessor),
                      ('classifier',RandomForestClassifier())])

In [36]:
# call the fit method 
pipe.fit(X_train,y_train) 

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [37]:
# predict on new data 
y_preds = pipe.predict(X_test) 

In [38]:
#print accurac score 
print("Accuracy schore: {:.3f}".format(accuracy_score(y_test,y_preds)))

Accuracy schore: 0.764


## Model Selection 

In [39]:
# import classifiers 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [40]:
# call classifiers 

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]

In [41]:
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score:{:.3f}".format(pipe.score(X_test, y_test)))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
model score:0.748
SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
model score:0.650
NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
   kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=None,
   shrinking=True, tol=0.001, verbose=False)
model score:0.797
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False,