In [137]:
import boto3
import mlflow
from mlflow import pyfunc as ml_pyfunc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [138]:
########################################################
### Import Dataset
########################################################

In [139]:
## Import dataset
leads_dataset = pd.read_csv('data/leads_cleaned.csv')
leads_dataset.columns = map(str.lower, leads_dataset.columns)

In [140]:
########################################################
### Clean and Prepare Data
########################################################

In [141]:
# Create data pre-processing steps before plugging into model
leads_categorical_columns = ['lead origin',
                             'lead source',
                             'last activity',
                             'specialization',
                             'what is your current occupation',
                             'what matters most to you in choosing a course',
                             'city',
                             'last notable activity']

leads_numeric_columns = ['totalvisits',
                         'total time spent on website',
                         'page views per visit']

leads_response_columns = ['converted']

In [142]:
#split data for training, remove extras

leads_x = leads_dataset.drop(leads_response_columns, axis=1)
leads_y = leads_dataset[leads_response_columns]

leads_x_train, leads_x_test, leads_y_train, leads_y_test = train_test_split(leads_x,
                                                                            leads_y,
                                                                            train_size=0.7,
                                                                            test_size=0.3,
                                                                            random_state=5050)

In [143]:
scaler = StandardScaler()
scaler = scaler.fit(leads_x_train[leads_numeric_columns])

In [144]:
def pre_process_leads_data(df,
                           numeric_columns,
                           categorical_columns,
                           fitted_scaler,
                           train_df_columns = None):
    ## create new df with selected columns
    df.columns = map(str.lower, df.columns)
    _df = df[set(numeric_columns + categorical_columns)].copy()
    
    ## scale the numeric columns with the pre-built scaler
    _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])
         
    # First, make categorical text lowercase
    _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
    # Next, create one-hot-encoded variables, add to dataframe, drop old columns
    _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)
    _df = pd.concat([_df, _df_dummies], axis=1)
    _df.drop(categorical_columns, axis=1, inplace = True)

    if train_df_columns:
        _df = _df.reindex(columns=train_df_columns, fill_value=0)

    return _df

In [145]:
leads_x_train_clean = pre_process_leads_data(df = leads_x_train,
                                            numeric_columns = leads_numeric_columns,
                                            categorical_columns = leads_categorical_columns,
                                            fitted_scaler = scaler)

leads_x_test_clean = pre_process_leads_data(df = leads_x_test,
                                           numeric_columns = leads_numeric_columns,
                                           categorical_columns = leads_categorical_columns,
                                           fitted_scaler = scaler,
                                           train_df_columns = leads_x_train_clean.columns.tolist())

In [146]:
leads_x_train_clean

Unnamed: 0,total time spent on website,totalvisits,page views per visit,lead origin_landing page submission,lead origin_lead add form,lead origin_lead import,lead source_blog,lead source_click2call,lead source_direct traffic,lead source_facebook,...,last notable activity_form submitted on website,last notable activity_had a phone conversation,last notable activity_modified,last notable activity_olark chat conversation,last notable activity_page visited on website,last notable activity_resubscribed to emails,last notable activity_sms sent,last notable activity_unreachable,last notable activity_unsubscribed,last notable activity_view in browser link clicked
626,-0.883085,-0.685814,-1.078980,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2286,-0.122857,-0.290676,-0.165702,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1776,1.491250,-0.093107,-0.394021,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8336,-0.883085,-0.685814,-1.078980,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4663,0.916488,0.104462,-0.165702,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3153,-0.883085,-0.685814,-1.078980,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5491,2.106410,-0.290676,-0.165702,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5748,-0.816978,0.302031,0.062618,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5271,-0.407484,0.697168,-0.015011,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [147]:
########################################################
### Train and Evaluate Model
########################################################

In [148]:
## Train the random forest model
num_estimators = 100
min_samples = 4

rf = RandomForestClassifier(n_estimators=num_estimators,
                            min_samples_split=min_samples)
rf.fit(leads_x_train_clean, leads_y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [149]:
leads_y_test_predicted = rf.predict(leads_x_test_clean)

accuracy = metrics.accuracy_score(leads_y_test, leads_y_test_predicted)
auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted)

print(accuracy)
print(auc_score)

0.8185824458318032
0.7980164296888982


In [150]:
########################################################
### MLflow and environment setup
########################################################

In [151]:
# connect to MLflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("LeadScoringProcessed") # creates an experiment if it doesn't exist

In [152]:
# define specific python and package versions for environment
mlflow_conda_env = {
 'name': 'mlflow-env',
 'channels': ['defaults'],
 'dependencies': ['python=3.6.2', {'pip': ['mlflow==1.6.0','scikit-learn','cloudpickle==1.3.0']}]
}

In [153]:
########################################################
### Define Model
########################################################

In [154]:
class leadsModel(mlflow.pyfunc.PythonModel):
   
    ## defining objects needed for leadsModel prediction. 
    def __init__(self,
                 train_df_columns,
                 model,
                 leads_categorical_columns,
                 leads_numeric_columns,
                 fitted_scaler,
                 pre_process_leads_data):
        
        ## Setting up all needed objects
        self.train_df_columns = train_df_columns
        self.model = model
        self.leads_categorical_columns = leads_categorical_columns
        self.leads_numeric_columns = leads_numeric_columns
        self.fitted_scaler = fitted_scaler
        self.pre_process_leads_data = pre_process_leads_data
    
    ## define function with processing and feeding data into prediction at the end
    def predict(self,context,model_input):
        
        # make sure all inputted columns are lowercase
        model_input.columns = map(str.lower, model_input.columns)
        
        # run inputted dataset through our processing function
        # note: we are excluding the response columns here since not needed for deploy
        model_input_processed = self.pre_process_leads_data(
                                   df = model_input,
                                   numeric_columns = self.leads_numeric_columns,
                                   categorical_columns = self.leads_categorical_columns,
                                   fitted_scaler = self.fitted_scaler,
                                   train_df_columns = self.train_df_columns)       
        
        # finally input the cleaned/adjusted dataset into our model for prediction
        return self.model.predict(model_input_adjusted)

In [155]:
########################################################
### Log Model to MLflow
########################################################

In [156]:
# start mlflow run, log parameters, metrics, and the model
with mlflow.start_run(run_name="Leads Model with Processing") as run:
    # log the parameters that we defined for the model training
    mlflow.log_param("num_estimators", num_estimators)
    mlflow.log_param("min_samples", min_samples)
    
    # log the performance metrics that we calculated earlier
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc_score", auc_score)
    
    # log model with all objects referenced in the leadsModel class
    ml_pyfunc.log_model(
        artifact_path = "leads_pyfunc",
        python_model = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),
                                  model = rf,
                                  leads_categorical_columns = leads_categorical_columns,
                                  leads_numeric_columns = leads_numeric_columns,
                                  fitted_scaler = scaler,
                                  pre_process_leads_data = pre_process_leads_data
                                 ),
        conda_env = mlflow_conda_env
    )
    
    # save run_id and experiment_id for deployment
    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    
    # end the mlflow run!
    mlflow.end_run()

In [157]:
########################################################
### Deploy Model to Sagemaker
########################################################

In [134]:
## Note: this requires a MLflow pyfunc docker container to already exist in sagemaker

import mlflow.sagemaker as mfs


# we pull the run and experiment id's from above to create this mlflow location
model_uri = "mlruns2/%s/%s/artifacts/leads_pyfunc" % (experiment_id,run_id)

# The region is chosen, pick whats close to you or your systems!
region = "us-east-1"
# The aws account id can be found in the console
aws_account_id = "XXXXXXX"
# We use these inputs to automatically reference the sagemaker docker container
image_url = aws_account_id \
            + ".dkr.ecr." \
            + region \
            + ".amazonaws.com/mlflow-pyfunc:1.5.0"

# now we specify the role that we setup for sagemaker in the previous step
sagemaker_arn = "arn:aws:iam::XXXXXXX:role/AmazonSageMakerFullAccess"


# finally, we pick a name for our endpoint within sagemaker
endpoint_name = "leads-rf-1" 


# with all of the inputs, we run the following to deploy the model it sagemaker
mfs.deploy(app_name=endpoint_name, 
           model_uri=model_uri,
           region_name=region,
           mode="create", #this should change to replace if the endpoint already exists
           execution_role_arn=sagemaker_arn,
           image_url=image_url, 
           instance_type='ml.t2.medium') # smallest/cheapest sagemaker allowed size

Writing deploy_sagemaker.py
