<h2>Creating model for <i>risk default</i> probability assessment

In [1]:
import pandas as pd

df_data = pd.read_csv('/project_data/data_asset/german_credit_data_complete.csv')
df_data.head()

Unnamed: 0,CheckingStatus,LoanDuration,CreditHistory,LoanPurpose,LoanAmount,ExistingSavings,EmploymentDuration,InstallmentPercent,Sex,OthersOnLoan,...,OwnsProperty,Age,InstallmentPlans,Housing,ExistingCreditsCount,Job,Dependents,Telephone,ForeignWorker,Risk
0,0_to_200,31,credits_paid_to_date,other,1889,100_to_500,less_1,3,female,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,No Risk
1,less_0,18,credits_paid_to_date,car_new,462,less_100,1_to_4,2,female,none,...,savings_insurance,37,stores,own,2,skilled,1,none,yes,No Risk
2,less_0,15,prior_payments_delayed,furniture,250,less_100,1_to_4,2,male,none,...,real_estate,28,none,own,2,skilled,1,yes,no,No Risk
3,0_to_200,28,credits_paid_to_date,retraining,3693,less_100,greater_7,3,male,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,No Risk
4,no_checking,28,prior_payments_delayed,education,6235,500_to_1000,greater_7,3,male,none,...,unknown,57,none,own,2,skilled,1,none,yes,Risk


<h4>Explore data</h4>

In [2]:
print("Number of records: " + str((df_data.shape[0])))

Number of records: 5000


<h4> Create a model </h4>

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
df_data["CheckingStatus"] = LabelEncoder().fit_transform(df_data["CheckingStatus"])
df_data["CreditHistory"] = LabelEncoder().fit_transform(df_data["CreditHistory"])
df_data["LoanPurpose"] = LabelEncoder().fit_transform(df_data["LoanPurpose"])
df_data["ExistingSavings"] = LabelEncoder().fit_transform(df_data["ExistingSavings"])
df_data["EmploymentDuration"] = LabelEncoder().fit_transform(df_data["EmploymentDuration"])
df_data["Sex"] = LabelEncoder().fit_transform(df_data["Sex"])
df_data["OthersOnLoan"] = LabelEncoder().fit_transform(df_data["OthersOnLoan"])
df_data["OwnsProperty"] = LabelEncoder().fit_transform(df_data["OwnsProperty"])
df_data["InstallmentPlans"] = LabelEncoder().fit_transform(df_data["InstallmentPlans"])
df_data["Housing"] = LabelEncoder().fit_transform(df_data["Housing"])
df_data["Job"] = LabelEncoder().fit_transform(df_data["Job"])
df_data["Telephone"] = LabelEncoder().fit_transform(df_data["Telephone"])
df_data["ForeignWorker"] = LabelEncoder().fit_transform(df_data["ForeignWorker"])
df_data["Risk"] = LabelEncoder().fit_transform(df_data["Risk"])

In [5]:
(train_data, test_data) = train_test_split(df_data, test_size = 0.2, stratify = df_data["Risk"])

print("Number of records for training: " + str(train_data.shape[0]))
print("Number of records for evaluation: " + str(test_data.shape[0]))

Number of records for training: 4000
Number of records for evaluation: 1000


In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(train_data.loc[:, train_data.columns != "Risk"], train_data["Risk"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
from sklearn.metrics import roc_auc_score

predictions = rf.predict(test_data.loc[:,test_data.columns != "Risk"])
area_under_curve = roc_auc_score(test_data["Risk"], predictions)

print("areaUnderROC = %g" % area_under_curve)

areaUnderROC = 0.727424


<h4>Publish the model as asset</h4>

In [9]:
!pip install watson-machine-learning-client-V4



In [41]:
import sys,os,os.path
token = os.environ['USER_ACCESS_TOKEN']

wml_credentials = {
    "token": token,
    "instance_id" : "openshift",
    "url": os.environ['RUNTIME_ENV_APSX_URL'],
    "version": "2.5.0"
}

In [42]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

wml_client = WatsonMachineLearningAPIClient( wml_credentials )

In [43]:
MODEL_NAME = "[DEMO] Credit Risk RF"

In [44]:
def guid_from_space_name(client, space_name):
    instance_details = client.service_instance.get_details()
    space = client.spaces.get_details()
    return(next(item for item in space['resources'] if item['entity']["name"] == space_name)['metadata']['guid'])

In [54]:
dep_spacename = 'wsl_jupyterlab_demo'
space_uid = guid_from_space_name(wml_client, dep_spacename)
print("Space UID for '" + dep_spacename + "' is: " + space_uid)

Space UID for 'wsl_jupyterlab_demo' is: c83c7529-c9d0-439e-99aa-01e6ca33e6ee


In [51]:
meta_props={
    wml_client.repository.ModelMetaNames.NAME: "Credit Risk Scikit",
    wml_client.repository.ModelMetaNames.RUNTIME_UID: "scikit-learn_0.20-py3.6",
    wml_client.repository.ModelMetaNames.TYPE: "scikit-learn_0.20",     
    wml_client.repository.ModelMetaNames.SPACE_UID: space_uid
}

In [55]:
wml_client.set.default_space(space_uid)

'SUCCESS'

In [56]:
model_artifact = wml_client.repository.store_model(rf,
                                               meta_props=meta_props,
                                               training_data=train_data.loc[:, train_data.columns != "Risk"],
                                               training_target=train_data["Risk"])
model_uid = wml_client.repository.get_model_uid(model_artifact)
print("Model UID = " + model_uid)

Model UID = 7882f4ea-b6a7-4fe2-b362-8a1c8950469e


In [58]:
model_details = wml_client.repository.get_details(model_uid)
from pprint import pprint
pprint(model_details)

{'entity': {'content_status': {'state': 'persisted'},
            'label_column': 'Risk',
            'name': 'Credit Risk Scikit',
            'runtime': {'href': '/v4/runtimes/scikit-learn_0.20-py3.6'},
            'space': {'href': '/v4/spaces/c83c7529-c9d0-439e-99aa-01e6ca33e6ee'},
            'training_data_references': [{'connection': {'access_key_id': 'not_applicable',
                                                         'endpoint_url': 'not_applicable',
                                                         'secret_access_key': 'not_applicable'},
                                          'location': {'bucket': 'not_applicable'},
                                          'schema': {'fields': [{'name': 'CheckingStatus',
                                                                 'type': 'int64'},
                                                                {'name': 'LoanDuration',
                                                                 'type': 'int64'},
   