In [1]:
import pandas as pd
import os
import joblib
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
!certifai --version
import sklearn as scikit
print(f'sklearn version is {scikit.__version__}')

Certifai version: 1.3.3
Scanner build: 1.3.3-260-g24ae833d
sklearn version is 0.23.2


In [3]:
# Reproducibility guard: require minimum library versions for this notebook.
# Versions are compared as integer tuples because plain string comparison is
# lexicographic and mis-orders releases (as strings, '1.9.0' >= '1.16.2' is
# True even though 1.9.0 is the older release).
def _version_tuple(version, width=3):
    """Return the first `width` numeric components of a version string as ints.

    Only the leading ASCII digits of each dot-separated component are used, so
    suffixes such as 'rc1' are ignored; missing or non-numeric components
    count as 0 (e.g. '1.0rc1' -> (1, 0, 0)).
    """
    numbers = []
    for component in str(version).split('.')[:width]:
        digits = ''
        for char in component:
            if char not in '0123456789':
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    # Pad short versions such as '1.2' so tuples always compare at equal length.
    numbers.extend([0] * (width - len(numbers)))
    return tuple(numbers)

assert _version_tuple(scikit.__version__) >= (0, 23, 1), 'scikit-learn version mismatch, `pip install scikit-learn>=0.23.1` to install right sklearn version for this notebook'
assert _version_tuple(np.__version__)     >= (1, 16, 2), 'numpy version mismatch, `pip install numpy>=1.16.2` to install right numpy version for this notebook'

In [4]:
# Data preparation: read the encoded dataset, separate the label column from
# the features, and hold out a test split for evaluation.
base_path = '../..'
all_data_file = f"{base_path}/datasets/german_credit_eval_multiclass_encoded.csv"
df = pd.read_csv(all_data_file)

# Presumably the categorical feature columns (going by the name); the list is
# only defined here and is not used by the split below.
cat_columns = [
    'checkingstatus', 'history', 'purpose', 'savings', 'employ',
    'status', 'others', 'property', 'age', 'otherplans',
    'housing', 'job', 'telephone', 'foreign',
]

label_column = 'outcome'

# Features are every column except the label; the label is the target.
X = df.drop(columns=[label_column])
y = df[label_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Model training: standardize the features, then grid-search the
# regularization strength C of a logistic regression with 5-fold CV.
parameters = {'C': (0.5, 1.0, 2.0), 'solver': ['lbfgs'], 'max_iter': [1000]}
transformer_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
# NOTE(review): the scaler step runs before GridSearchCV, so it is fitted on
# the whole training set and the CV folds share its scaling statistics.
full_pipeline_m = Pipeline(steps=[('full_pipeline', transformer_pipeline),
                                  ('model', GridSearchCV(LogisticRegression(), parameters, cv=5))])
full_pipeline_m.fit(X_train, y_train)

# Predict once, passing the same DataFrame type that was used for fitting
# (rather than a bare ndarray), and derive the accuracy from those predictions
# instead of running a second prediction pass through `.score()`.
y_pred = full_pipeline_m.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.655

In [6]:
# Model persistence: dump the fitted pipeline (scaler plus tuned classifier)
# to disk as a single joblib artifact.
model = full_pipeline_m
model_artifact_key = 'german_credit_multiclass'  # base name of the artifact file
model_dir = 'models'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(model_dir, exist_ok=True)
model_path = f'{model_dir}/{model_artifact_key}.joblib'
# Last expression of the cell, so the list of written file names is displayed.
joblib.dump(model, model_path)

['models/german_credit_multiclass.joblib']