In [1]:
import pandas as pd
import os
import joblib
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Training-data preparation: read the encoded dataset, separate the label
# from the features, and carve out a hold-out set.
base_path = '../..'
all_data_file = f"{base_path}/datasets/german_credit_eval_multiclass_encoded.csv"

df = pd.read_csv(all_data_file)

# Feature columns designated as categorical (per the variable name —
# TODO confirm against the dataset schema).
cat_columns = [
    'checkingstatus', 'history', 'purpose', 'savings', 'employ',
    'status', 'others', 'property', 'age', 'otherplans',
    'housing', 'job', 'telephone', 'foreign',
]

label_column = 'outcome'

# Feature matrix is everything except the label; the label becomes the target.
X = df.drop(columns=[label_column])
y = df[label_column]

# Keep 20% of the rows aside for evaluation; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
%%writefile model_encoder.py

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from certifai.common.utils.encoding import CatEncoder
import numpy as np

class TransformerCustom(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that delegates encoding to a ``CatEncoder``.

    Every constructor argument is stored, unmodified, on an attribute with
    the same name. ``BaseEstimator.get_params`` looks parameters up by that
    name, so this is what makes ``get_params()``, ``set_params()``,
    ``sklearn.base.clone`` and the estimator ``repr`` work.

    Parameters
    ----------
    cat_columns
        Forwarded to ``CatEncoder`` as its first positional argument.
    normalize, cat_column_value_names, data_encoded, string_equivalence, numeric
        Forwarded positionally, in this order, to ``CatEncoder`` after the
        training data. Their meaning is defined by ``CatEncoder``.

    Attributes
    ----------
    encoder
        ``None`` until ``fit`` is called; afterwards the ``CatEncoder``
        instance built from the training data.
    """

    def __init__(self,
                 cat_columns,
                 normalize=True,
                 cat_column_value_names=None,
                 data_encoded=False,
                 string_equivalence=True,
                 numeric=None):
        # Store each argument under its own name and do no validation here:
        # get_params()/clone() read these attributes back by parameter name.
        self.cat_columns = cat_columns
        self.normalize = normalize
        self.cat_column_value_names = cat_column_value_names
        self.data_encoded = data_encoded
        self.string_equivalence = string_equivalence
        self.numeric = numeric

        # Populated by fit().
        self.encoder = None

    def fit(self, X, y=None):
        """Build the ``CatEncoder`` from ``X`` and the stored parameters.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        self.encoder = CatEncoder(self.cat_columns,
                                  X,
                                  self.normalize,
                                  self.cat_column_value_names,
                                  self.data_encoded,
                                  self.string_equivalence,
                                  self.numeric)

        return self

    def transform(self, X):
        """Apply the fitted encoder to ``X`` and return its result.

        A NumPy array is passed to the encoder as-is; any other input is
        expected to expose ``.values`` (e.g. a pandas DataFrame), and that
        array is passed instead.
        """
        if isinstance(X, np.ndarray):
            return self.encoder(X)
        return self.encoder(X.values)


Writing model_encoder.py


In [4]:
from model_encoder import TransformerCustom

# Hyper-parameter grid searched for the logistic-regression classifier.
parameters = {
    'C': (0.5, 1.0, 2.0),
    'solver': ['lbfgs'],
    'max_iter': [1000],
}

# Encoding stage: a single-step pipeline around the custom transformer,
# constructed with an empty categorical-column list.
transformer_pipeline = Pipeline(
    steps=[('transformer', TransformerCustom(cat_columns=[]))]
)

# Complete model: the encoding stage followed by a 3-fold grid search
# over `parameters`.
full_pipeline_m = Pipeline(
    steps=[
        ('full_pipeline', transformer_pipeline),
        ('model', GridSearchCV(estimator=LogisticRegression(),
                               param_grid=parameters,
                               cv=3)),
    ]
)

full_pipeline_m.fit(X_train, y_train)

# Predict on the hold-out features (passed as a raw array), then record the
# pipeline's score on the hold-out set and display it.
y_pred = full_pipeline_m.predict(X_test.values)
accuracy = full_pipeline_m.score(X_test, y_test)
accuracy

0.69

In [5]:
# Model persistence: dump the trained pipeline (encoder step + model step)
# to disk as a single joblib artifact.
# The artifact key determines the file name of the persisted model.
model = full_pipeline_m
model_artifact_key = 'german_credit_multiclass'
model_dir = 'models'
# exist_ok=True creates the directory only when it is missing, avoiding the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(model_dir, exist_ok=True)
model_path = f'{model_dir}/{model_artifact_key}.joblib'
# NOTE(review): the pipeline contains a TransformerCustom instance, so the
# model_encoder module presumably has to be importable wherever this
# artifact is loaded — confirm in the consuming environment.
joblib.dump(model, model_path)

['models/german_credit_multiclass.joblib']