In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.linear_model import LogisticRegression

# File system management
import os

import joblib
import requests
from time import time

In [2]:
# Show which files are present in the input directory
input_files = os.listdir("./input/")
print(input_files)

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv']


In [3]:
# Load the application train and test tables from the input directory
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/application_train.csv', './input/application_test.csv')
)
print('Training data shape: ', app_train.shape)
# Keep the preview as the cell's last expression so it is rendered as a table
app_train.head()

Training data shape:  (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Pull the labels out before aligning: the inner join on columns keeps only
# the columns both frames share, so app_train comes out one column narrower
# (122 -> 121 in the recorded output) and no longer carries 'TARGET'.
# Guarding on the column's presence lets this cell be re-run after app_train
# has already been aligned, instead of failing with KeyError: 'TARGET'.
if 'TARGET' in app_train.columns:
    train_labels = app_train['TARGET']
elif 'train_labels' not in globals():
    # No labels available at all: fail as loudly as the direct lookup would
    raise KeyError('TARGET')

# Keep only the columns common to both frames (axis = 1 aligns on columns)
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (307511, 121)
Testing Features shape:  (48744, 121)


In [5]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numeric. Column order is preserved.
# NOTE(review): SK_ID_CURR ends up in numeric_columns (it appears in the fitted
# pipeline's repr) and looks like a row identifier — confirm it is meant to be
# used as a model feature.
numeric_columns = []
category_columns = []
for col, col_dtype in app_train.dtypes.items():
    bucket = category_columns if col_dtype == 'object' else numeric_columns
    bucket.append(col)

In [6]:
# Numeric features: fill missing values with the column median, then apply
# StandardScaler
numeric_steps = [
    ('impute', Imputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the constant 'missing', then
# one-hot encode with handle_unknown='ignore'
category_steps = [
    ('impute', Imputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
category_transformer = Pipeline(steps=category_steps)

# Route each list of columns to its matching transformer
preprocesser = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_columns),
    ('category', category_transformer, category_columns),
])


In [7]:
# Train a model with sklearn Pipelines: the preprocessing step followed by a
# logistic regression

log_reg = LogisticRegression(C = 0.0001)

model_steps = [
    ('preprocesser', preprocesser),
    ('log_reg', log_reg),
]
pipeline = Pipeline(steps=model_steps)
# fit() is left as the last expression so the fitted pipeline is displayed
pipeline.fit(app_train, train_labels)

Pipeline(steps=[('preprocesser',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['SK_ID_CURR', 'CNT_CHILDREN',
                                                   'AMT_INCOME_TOTAL',
                                                   'AMT_CREDIT', 'AMT_ANNUITY',
                                                   'AMT_GOODS_PRICE',
                                                   'REGION_POPULATION_RELATIVE',
                                                   'DAYS_BIRTH',
                                                   'DAYS_EMPLOYED',
                                              

In [8]:
# Persist the fitted pipeline into the ./docker/ directory, which the next
# cell uses as the Docker build context
joblib.dump(value=pipeline, filename='./docker/sklearn_model.joblib')

['./docker/sklearn_model.joblib']

In [9]:
# Build a Docker image tagged `project` using ./docker/ as the build context
!docker build --tag project ./docker/

#2 [internal] load build definition from Dockerfile
#2 transferring dockerfile: 32B done
#2 DONE 0.0s

#1 [internal] load .dockerignore
#1 transferring context: 2B done
#1 DONE 0.0s

#3 [internal] load metadata for docker.io/library/python:3.7-slim
#3 DONE 3.5s

#4 [1/6] FROM docker.io/library/python:3.7-slim@sha256:65de7b645c38c512c3e2...
#4 DONE 0.0s

#7 [internal] load build context
#7 transferring context: 24.56kB done
#7 DONE 0.1s

#6 [3/6] RUN pip install -i http://pypi.douban.com/simple/ --trusted-host=p...
#6 CACHED

#8 [4/6] COPY app.py .
#8 CACHED

#9 [5/6] COPY sklearn_model.pickle .
#9 CACHED

#5 [2/6] WORKDIR /usr/src/app
#5 CACHED

#10 [6/6] COPY sklearn_model.joblib .
#10 CACHED

#11 exporting to image
#11 exporting layers done
#11 writing image sha256:7285921336b2fa83edb2bee2203e21d750bc7c4e2c802d239038be82257f63a3 done
#11 naming to docker.io/library/project done
#11 DONE 0.0s


In [10]:
# Start a detached container named `project_test` from the `project` image,
# publishing container port 8080 on host port 8080
# NOTE(review): the fixed --name will presumably conflict with an existing
# container if this cell is run a second time — confirm, and remove the old
# container first if so.
!docker run --name project_test -p 8080:8080 -d project

9fb9af54f1aae56c8bbb157c63581f4211c1a1d5fe48c38100696052fdb04eeb


In [12]:
# Performance test: time how long the service at http://localhost:8080 takes to
# answer one POST per row for the first 1000 test rows.

# Per-request limit in seconds, so an unresponsive service raises instead of
# hanging the cell forever (requests applies no timeout by default)
REQUEST_TIMEOUT_S = 10

start = time()
app_test = app_test[:1000]
# One Session for the whole loop reuses the underlying connection, so the
# timing is not dominated by opening a new connection for every row
with requests.Session() as session:
    for _, row in app_test.iterrows():
        item = row.to_dict()
        # NOTE(review): rows with missing values put NaN into the payload, which
        # is not valid standard JSON — confirm the installed requests version
        # and the service both accept it.
        # `json=` takes a dict and sends it as the JSON request body
        res = session.post('http://localhost:8080', json = item, timeout = REQUEST_TIMEOUT_S)
        print(res.text)
end = time()
# Last expression: displayed as a tuple with the elapsed wall-clock seconds
'Used', end-start, 'seconds'

{"predict":0.943288588760659}

{"predict":0.849130012207588}

{"predict":0.9587926927194563}

{"predict":0.9565269003869907}

{"predict":0.8618659445891443}

{"predict":0.9557973205602237}

{"predict":0.9575213771704587}

{"predict":0.9043963531094006}

{"predict":0.9728694348879142}

{"predict":0.8465245919723765}

{"predict":0.9312044524238939}

{"predict":0.9078466600404103}

{"predict":0.8651853084394927}

{"predict":0.8725078629213989}

{"predict":0.9087002582073126}

{"predict":0.8980610342898866}

{"predict":0.9149288283127897}

{"predict":0.9609107727455217}

{"predict":0.9075313268921259}

{"predict":0.94733270303698}

{"predict":0.9643455270143779}

{"predict":0.9771563460348374}

{"predict":0.9662766424091817}

{"predict":0.9372181091880096}

{"predict":0.9515028073058066}

{"predict":0.8752548789353427}

{"predict":0.9047274969679375}

{"predict":0.9348449009836186}

{"predict":0.9474752976536803}

{"predict":0.9058484218716427}

{"predict":0.9672326338206169}

{"predict":0

{"predict":0.965294756423321}

{"predict":0.9272180630955329}

{"predict":0.9618582501932241}

{"predict":0.9810181065917198}

{"predict":0.8946016722630908}

{"predict":0.9048992341220032}

{"predict":0.9800110706274484}

{"predict":0.8609083449227882}

{"predict":0.8896015798249867}

{"predict":0.9613186603085153}

{"predict":0.96521096432686}

{"predict":0.9155772824540835}

{"predict":0.9664735997929373}

{"predict":0.9010813975536165}

{"predict":0.7481187650699274}

{"predict":0.9459643328626435}

{"predict":0.943188702777191}

{"predict":0.9574839867416047}

{"predict":0.8448623621450808}

{"predict":0.9450066375916505}

{"predict":0.9486523504000168}

{"predict":0.8659053782556867}

{"predict":0.9100058634721183}

{"predict":0.9505900061976997}

{"predict":0.7481754660406988}

{"predict":0.9751795233160904}

{"predict":0.8597684150942446}

{"predict":0.9769470538968962}

{"predict":0.8466603664261044}

{"predict":0.9768369655642299}

{"predict":0.9641544342206475}

{"predict":0

{"predict":0.9619149659929298}

{"predict":0.9369426133269232}

{"predict":0.9504382392436113}

{"predict":0.9274819435274702}

{"predict":0.9193348987916362}

{"predict":0.9446794440880667}

{"predict":0.9710624753441405}

{"predict":0.9333184472719177}

{"predict":0.8756260072862534}

{"predict":0.9717181629161336}

{"predict":0.9297084241613732}

{"predict":0.8462895342313903}

{"predict":0.977087601294427}

{"predict":0.9308780182475048}

{"predict":0.9222539264803836}

{"predict":0.9723514184537055}

{"predict":0.9243857732129463}

{"predict":0.8843482901929973}

{"predict":0.9311137546519735}

{"predict":0.7563867325578781}

{"predict":0.952573828034229}

{"predict":0.9679674292967835}

{"predict":0.7817524818330956}

{"predict":0.9707672219717102}

{"predict":0.8917053040612871}

{"predict":0.8578692358342025}

{"predict":0.9223763063620741}

{"predict":0.9790459357609841}

{"predict":0.8109639080706766}

{"predict":0.8169331714442732}

{"predict":0.9773319730135788}

{"predict"

{"predict":0.8468590808918675}

{"predict":0.9057110635013312}

{"predict":0.8973447932455028}

{"predict":0.9539681904107633}

{"predict":0.9565202306418036}

{"predict":0.9148682595608231}

{"predict":0.9173599861046347}

{"predict":0.8884300532987462}

{"predict":0.9313314782630521}

{"predict":0.977529846132442}

{"predict":0.7980851439898894}

{"predict":0.9531781341229536}

{"predict":0.9234622184940284}

{"predict":0.917879410678432}

{"predict":0.9798317734044452}

{"predict":0.9588077758713529}

{"predict":0.9808062350145088}

{"predict":0.7581202111523608}

{"predict":0.9243872162095577}

{"predict":0.9018059234227283}

{"predict":0.9555485993542461}

{"predict":0.9403699097901975}

{"predict":0.9257162290796035}

{"predict":0.8949452974850792}

{"predict":0.9704856324447488}

{"predict":0.9680445189994286}

{"predict":0.9636145278471802}

{"predict":0.8772429643361711}

{"predict":0.8834299344665912}

{"predict":0.9184449083225488}

{"predict":0.9592203422183928}

{"predict"

('Used', 19.39782190322876, 'seconds')