In [1]:
import os, pandas as pd
# Add asset from file system
df = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/TON_PREV_NEW.csv')
df.head()

Unnamed: 0,BASKET_SIZE,EXTN_COMPOSITION,CARRIER_SERVICE_CODE_OL,CATEGORY,COUNTRY_OF_ORIGIN_OI,DAY_OF_MONTH,DAY_OF_WEEK,DAY_OF_YEAR,EXTN_BRAND,EXTN_DISCOUNT_ID,...,OTHER_CHARGES,OTHER_CHARGES_OL,REQ_DELIVERY_DATE,TOTAL_AMOUNT_USD,WEEKEND,ZIP_CODE,MTS_CTS,HOUR_OF_DAY,LOCKID,RETURN_FLAG
0,1,,STANDARD,Slip,CN,14,Saturday,287,XYZAA,,...,0.0,0.0,0,0.0,1,Zipcode_261,1,16,26,0
1,1,,STANDARD,Slip,CN,17,Tuesday,290,XYZAA,,...,0.0,0.0,0,0.0,0,Zipcode_165,2,16,36,0
2,1,"85% Polyamide, 15% Elastane",PREMIER_EVENING,Slip,CN,19,Thursday,292,XYZAA,,...,25.0,0.0,0,40.0,0,Zipcode_599,11,17,215,1
3,1,"54% Polyamide, 46% Polyester",STANDARD,Slip,CN,24,Tuesday,297,XYZAA,,...,0.0,0.0,0,0.0,0,Zipcode_261,1,15,25,0
4,2,"93% Cotton, 7% Elastane",STANDARD,Maniche Lunghe,PT,30,Monday,303,XYZAB,,...,13.0,0.0,0,251.192578,0,Zipcode_228,12,13,179,0


## Lets fill all NA(s) and empty values with 0

In [2]:
df=df.fillna(0)

In [3]:
df.dtypes

BASKET_SIZE                  int64
EXTN_COMPOSITION            object
CARRIER_SERVICE_CODE_OL     object
CATEGORY                    object
COUNTRY_OF_ORIGIN_OI        object
DAY_OF_MONTH                 int64
DAY_OF_WEEK                 object
DAY_OF_YEAR                  int64
EXTN_BRAND                  object
EXTN_DISCOUNT_ID            object
EXTN_IS_GIFT                object
EXTN_IS_PREORDER            object
EXTN_SHIP_TO_CITY           object
EXTN_SHIP_TO_COUNTRY        object
EXTN_SEASON                 object
LIST_PRICE                   int64
MONTH_OF_YEAR                int64
OTHER_CHARGES              float64
OTHER_CHARGES_OL           float64
REQ_DELIVERY_DATE            int64
TOTAL_AMOUNT_USD           float64
WEEKEND                      int64
ZIP_CODE                    object
MTS_CTS                      int64
HOUR_OF_DAY                  int64
LOCKID                       int64
RETURN_FLAG                  int64
dtype: object

## We can see that we have object columns which need to be converted to category codes to be fed into the model

In [4]:
qual = list( df.loc[:,df.dtypes == 'object'].columns.values )
for col in qual:
     df[col] = df[col].astype('category')
quant = list( df.loc[:,df.dtypes != 'category'].columns.values )
print(qual,quant)

['EXTN_COMPOSITION', 'CARRIER_SERVICE_CODE_OL', 'CATEGORY', 'COUNTRY_OF_ORIGIN_OI', 'DAY_OF_WEEK', 'EXTN_BRAND', 'EXTN_DISCOUNT_ID', 'EXTN_IS_GIFT', 'EXTN_IS_PREORDER', 'EXTN_SHIP_TO_CITY', 'EXTN_SHIP_TO_COUNTRY', 'EXTN_SEASON', 'ZIP_CODE'] ['BASKET_SIZE', 'DAY_OF_MONTH', 'DAY_OF_YEAR', 'LIST_PRICE', 'MONTH_OF_YEAR', 'OTHER_CHARGES', 'OTHER_CHARGES_OL', 'REQ_DELIVERY_DATE', 'TOTAL_AMOUNT_USD', 'WEEKEND', 'MTS_CTS', 'HOUR_OF_DAY', 'LOCKID', 'RETURN_FLAG']


In [5]:
cats = list( df.loc[:,df.dtypes == 'category'].columns.values)
categories={}
for col in cats:
    categories[col]= dict(enumerate(df[col].cat.categories))

In [6]:
df.dtypes

BASKET_SIZE                   int64
EXTN_COMPOSITION           category
CARRIER_SERVICE_CODE_OL    category
CATEGORY                   category
COUNTRY_OF_ORIGIN_OI       category
DAY_OF_MONTH                  int64
DAY_OF_WEEK                category
DAY_OF_YEAR                   int64
EXTN_BRAND                 category
EXTN_DISCOUNT_ID           category
EXTN_IS_GIFT               category
EXTN_IS_PREORDER           category
EXTN_SHIP_TO_CITY          category
EXTN_SHIP_TO_COUNTRY       category
EXTN_SEASON                category
LIST_PRICE                    int64
MONTH_OF_YEAR                 int64
OTHER_CHARGES               float64
OTHER_CHARGES_OL            float64
REQ_DELIVERY_DATE             int64
TOTAL_AMOUNT_USD            float64
WEEKEND                       int64
ZIP_CODE                   category
MTS_CTS                       int64
HOUR_OF_DAY                   int64
LOCKID                        int64
RETURN_FLAG                   int64
dtype: object

In [7]:
df["RETURN_FLAG"].value_counts()

0    128487
1     24287
Name: RETURN_FLAG, dtype: int64

## Here we can see that there are 24K orders that have been returned and 128K orders that havent been returned

In [8]:
from sklearn.model_selection import train_test_split
X=(df.drop(["RETURN_FLAG"], axis=1))
y=df['RETURN_FLAG']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=42)

##  Install Custom Modules for the Pipeline Transformations

In [9]:
!pip install --upgrade CustTrans-0.2.zip

Processing ./CustTrans-0.2.zip
Building wheels for collected packages: CustTrans
  Building wheel for CustTrans (setup.py) ... [?25ldone
[?25h  Created wheel for CustTrans: filename=CustTrans-0.1-cp36-none-any.whl size=1791 sha256=ed99f86c4890a45c6c6e8cc157991f7402c410e9f2a3f62fbf2b8a4d695d280e
  Stored in directory: /user-home/999/.cache/pip/wheels/a0/88/29/9e0ddf3e5399d7652900e3dea3deb8279c74abb34474a33b84
Successfully built CustTrans
Installing collected packages: CustTrans
  Found existing installation: CustTrans 0.1
    Uninstalling CustTrans-0.1:
      Successfully uninstalled CustTrans-0.1
Successfully installed CustTrans-0.1


In [10]:
from CustomTransformer.CustTrans import TypeSelector,StringIndexer,ConvToCategorical

In [11]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn_pandas import DataFrameMapper


transformer = Pipeline([
   ('features', FeatureUnion(n_jobs=1, transformer_list=[
       # Part 1
       ('boolean', Pipeline([
           ('selector', TypeSelector('bool')),
       ])),  # booleans close

       ('numericals', Pipeline([
           ('selector', TypeSelector(np.number)),
           ('scaler', StandardScaler()),
       ])),
       # Part 2
       ('categoricals', Pipeline([
           ('convertor', ConvToCategorical()),
           ('selector', TypeSelector('category')),
           ('labeler', StringIndexer()),
           ('encoder', OneHotEncoder(handle_unknown='ignore')),
       ]))
       # categoricals close
   ])),  # features close
   ('clf' , RandomForestClassifier(n_estimators=30,criterion="entropy")),
    
])

  from numpy.core.umath_tests import inner1d


## Lets now pass the data through the transformer(fit)

In [12]:
import timeit
start_time = timeit.default_timer()
transformer.fit(X_train, y_train)
print("Time for model training",timeit.default_timer() - start_time)

Time for model training 140.19156982700224


## Lets now evaluate the accuracy of the model using our hold-out test data

In [13]:
scores= transformer.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, scores)
accuracy

0.8862641138929799

## Lets now save and deploy the model to WML

In [14]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [15]:
wml_credentials = {
                    "url": "https://9.30.250.35",
                    "username": "admin",
                    "password": "password",
                    "instance_id": "icp"               
 }
client = WatsonMachineLearningAPIClient(wml_credentials)
client.deployments.list()

------------------------------------  ----------------------------------------  ------  --------------  ------------------------  -----------------  -------------
GUID                                  NAME                                      TYPE    STATE           CREATED                   FRAMEWORK          ARTIFACT TYPE
548ebcda-8a50-4e53-a41c-13bca6ddbd55  ReturnRiskPandas_CustomTransformers_v0.1  online  DEPLOY_SUCCESS  2019-06-17T05:45:01.842Z  scikit-learn-0.19  model
------------------------------------  ----------------------------------------  ------  --------------  ------------------------  -----------------  -------------


## Before we dploy the model, lets create a custom python runtime with our custom transformer library installed

In [16]:
lib_meta = {
        client.runtimes.LibraryMetaNames.NAME: "CustomTransformers_v0.1",
        client.runtimes.LibraryMetaNames.DESCRIPTION: "CustomTransformers_v0.1",
        client.runtimes.LibraryMetaNames.FILEPATH: "CustTrans-0.2.zip",
        client.runtimes.LibraryMetaNames.VERSION: "1.0",
        client.runtimes.LibraryMetaNames.PLATFORM: {"name": "python", "versions": ["3.6"]}
    }
custom_library_details = client.runtimes.store_library(lib_meta)
custom_library_uid = client.runtimes.get_library_uid(custom_library_details)
print("Custom Library UID: " + custom_library_uid)

Custom Library UID: c277271e-c62e-4c4d-aa4f-2bafb35944cc


In [17]:
runtimes_meta = {
    client.runtimes.ConfigurationMetaNames.NAME: "CustomTransformers_v0.1", 
    client.runtimes.ConfigurationMetaNames.DESCRIPTION: "CustomTransformers_v0.1", 
    client.runtimes.ConfigurationMetaNames.PLATFORM: { "name": "python", "version": "3.6" }, 
    client.runtimes.ConfigurationMetaNames.LIBRARIES_UIDS: [custom_library_uid]
}

In [18]:
runtime_details = client.runtimes.store(runtimes_meta)
runtime_details

{'metadata': {'guid': '88f85e41-be1f-4ee4-9750-378c777c3d01',
  'url': 'https://9.30.250.35:31002/v4/runtimes/88f85e41-be1f-4ee4-9750-378c777c3d01',
  'created_at': '2019-10-22T08:33:05.514Z'},
 'entity': {'name': 'CustomTransformers_v0.1',
  'description': 'CustomTransformers_v0.1',
  'custom_libraries': [{'name': 'CustomTransformers_v0.1',
    'version': '1.0',
    'url': 'https://9.30.250.35:31002/v4/libraries/c277271e-c62e-4c4d-aa4f-2bafb35944cc'}],
  'content_url': 'https://9.30.250.35:31002/v4/runtimes/88f85e41-be1f-4ee4-9750-378c777c3d01/content',
  'platform': {'name': 'python', 'version': '3.6'}}}

In [19]:
runtime_url = client.runtimes.get_url(runtime_details)
runtime_uid = client.runtimes.get_uid(runtime_details)
print("Runtimes URL: " + runtime_url)
print("Runtimes UID: " + runtime_uid)

Runtimes URL: https://9.30.250.35:31002/v4/runtimes/88f85e41-be1f-4ee4-9750-378c777c3d01
Runtimes UID: 88f85e41-be1f-4ee4-9750-378c777c3d01


In [20]:
model_props = {client.repository.ModelMetaNames.NAME: "ReturnRiskPandas_v0.1",
               client.repository.ModelMetaNames.RUNTIME_UID: runtime_uid
              }

In [21]:
published_model = client.repository.store_model(model=transformer, meta_props=model_props,training_data=X_train, training_target=y_train)
published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)

In [22]:
import json
print(json.dumps(model_details, indent=2))

{
  "metadata": {
    "guid": "c1261ed8-01ca-4340-9f20-acfdd2f5bdc0",
    "url": "https://9.30.250.35:31002/v3/wml_instances/999/published_models/c1261ed8-01ca-4340-9f20-acfdd2f5bdc0",
    "created_at": "2019-10-22T08:33:12.256Z",
    "modified_at": "2019-10-22T08:33:13.926Z"
  },
  "entity": {
    "runtime_environment": "python-3.6",
    "learning_configuration_url": "https://9.30.250.35:31002/v3/wml_instances/999/published_models/c1261ed8-01ca-4340-9f20-acfdd2f5bdc0/learning_configuration",
    "name": "ReturnRiskPandas_v0.1",
    "label_col": "RETURN_FLAG",
    "learning_iterations_url": "https://9.30.250.35:31002/v3/wml_instances/999/published_models/c1261ed8-01ca-4340-9f20-acfdd2f5bdc0/learning_iterations",
    "training_data_schema": {
      "features": {
        "type": "DataFrame",
        "fields": [
          {
            "name": "BASKET_SIZE",
            "type": "int64"
          },
          {
            "name": "EXTN_COMPOSITION",
            "type": "category"
        

In [23]:
created_deployment = client.deployments.create(published_model_uid, name="ReturnRiskPandas_CustomTransformers_v0.2")



#######################################################################################

Synchronous deployment creation for uid: 'c1261ed8-01ca-4340-9f20-acfdd2f5bdc0' started

#######################################################################################


INITIALIZING
DEPLOY_IN_PROGRESS....................................................
DEPLOY_SUCCESS


------------------------------------------------------------------------------------------------
Successfully finished deployment creation, deployment_uid='63873816-3872-4d97-94c8-5cf7c307cb2c'
------------------------------------------------------------------------------------------------




## Now that we have deployed the model lets get the scroing url for the REST-API interface

In [24]:
scoring_endpoint = client.deployments.get_scoring_url(created_deployment)
print(scoring_endpoint)

https://9.30.250.35:32006/v3/scoring/online/63873816-3872-4d97-94c8-5cf7c307cb2c


## Here we have a sample payload. Lets try scoring this using our model

In [25]:
scoring_payload={"fields":["BASKET_SIZE","EXTN_COMPOSITION","CARRIER_SERVICE_CODE_OL","CATEGORY","COUNTRY_OF_ORIGIN_OI","DAY_OF_MONTH","DAY_OF_WEEK","DAY_OF_YEAR","EXTN_BRAND","EXTN_CHRISTMAS_GUARANTEE","EXTN_DISCOUNT_ID","EXTN_IS_FAST_DELIVERY","EXTN_IS_GIFT","EXTN_IS_PREORDER","EXTN_SHIP_TO_CITY","EXTN_SHIP_TO_COUNTRY","EXTN_SEASON","LIST_PRICE","MONTH_OF_YEAR","OTHER_CHARGES","OTHER_CHARGES_OL","REQ_DELIVERY_DATE","TOTAL_AMOUNT_USD","WEEKEND","ZIP_CODE","MTS_CTS","HOUR_OF_DAY","LOCKID"],"values":[[3, '91% Nylon, 9% Elastercell', 'STANDARD', 'Bikini', 'US', 18, 'Saturday', 322, 'XYZAI', 'None', 'N', 'N', 'Los Angeles', 'US', 'FW17', 75, 11, 0.0, 0.0, 0, 165.35, 1, 'Zipcode_401', 24, 19, 277]]}

In [26]:
prediction = client.deployments.score(scoring_endpoint, scoring_payload)

In [27]:
prediction

{'values': [[0, [0.6333333333333333, 0.36666666666666664]]],
 'fields': ['prediction', 'probability']}