# Pipeline

In [3]:
import pandas as pd
data = pd.read_csv("../../data/processed/credit.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [25]:
x = data.iloc[:, 0:-1]
x

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,00202,0
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750


In [26]:
y = data.iloc[:, -1]
y = y.map({'-': 0,
       '+': 1})
y

0      1
1      1
2      1
3      1
4      1
      ..
685    0
686    0
687    0
688    0
689    0
Name: A16, Length: 690, dtype: int64

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', DecisionTreeClassifier())
])

In [28]:
from sklearn.model_selection import train_test_split
test_size = 0.23
random_state = 6
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=test_size, 
                                                    random_state=random_state)

In [29]:
model_pipeline.fit(x_train, y_train)

In [30]:
y_pred_model_pipeline = model_pipeline.predict(x_test)
y_pred_model_pipeline

array([0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1], dtype=int64)

In [31]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

print("Model score")
print("accuracy: ", accuracy_score(y_test, y_pred_model_pipeline))
print("recall: ", recall_score(y_test, y_pred_model_pipeline))
print("precision: ", precision_score(y_test, y_pred_model_pipeline))


Model score
accuracy:  0.8427672955974843
recall:  0.8028169014084507
precision:  0.8382352941176471
