# Customer churn analysis


# Machine Learning Pipeline

In the following notebooks, we will go through the implementation of each one of the steps in the Machine Learning Pipeline. 

We will discuss:

1. Data Preparation and Analysis
2. **Feature Engineering**
3. **Feature Selection**
4. **Model Training**
5. **Obtaining Predictions / Scoring**

In [1]:
# Standard library
import datetime as dt
import re

# Snowpark -- the wildcard import stays ahead of the explicit names below so
# that any name it happens to export cannot shadow them.
from snowflake.snowpark.types import *
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import udf
from snowflake.snowpark.session import Session

# Data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# Modelling
from sklearn import linear_model

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to save the trained scaler class
import joblib



In [2]:
# Snowflake connection info
from config import snowflake_conn_prop
from snowflake.snowpark import version

# Show which Snowpark client version this notebook is running against.
print(version.VERSION)

# Build the session from the connection properties held in the local config module.
session_builder = Session.builder.configs(snowflake_conn_prop)
session = session_builder.create()

# Sanity check: report the warehouse / database / schema the session is using.
context_query = 'select current_warehouse(), current_database(), current_schema()'
print(session.sql(context_query).collect())

(0, 11, 0)
[Row(CURRENT_WAREHOUSE()='WH_UWM', CURRENT_DATABASE()='UWM_HOUSEHOLDING', CURRENT_SCHEMA()='TEAM1_XURAN')]


In [3]:
%%time
# `raw` is the Snowpark handle for the training table; `data` is that same
# table converted to a local pandas DataFrame, which every later cell works on.
raw = session.table('TABLE_FOR_TRAINING')
data = raw.toPandas()

CPU times: total: 547 ms
Wall time: 1.01 s


In [4]:
# Street-type abbreviations recognised at the very end of an address line.
# Each pattern matches the abbreviation as a whole word (\b) followed by any
# number of trailing dots, so "St", "St." and "St.." all normalise identically.
_STREET_SUFFIXES = (
    (re.compile(r'\bAve\.*$'), "Avenue"),
    (re.compile(r'\bAv\.*$'), "Avenue"),
    (re.compile(r'\bSt\.*$'), "Street"),
    (re.compile(r'\bRd\.*$'), "Road"),
    (re.compile(r'\bDr\.*$'), "Drive"),
)


def encode(s: str) -> str:
    """Normalise one address line so equivalent spellings compare equal.

    Exactly one rule is applied, the first that matches:

    1. If the text contains ``"None"`` (e.g. a stringified missing value),
       every ``"None"`` is replaced by a single space and nothing else is done.
    2. Otherwise, a trailing street-type abbreviation (``Ave``, ``Av``, ``St``,
       ``Rd``, ``Dr``, optionally followed by dots) is expanded to its full
       word. Only that trailing occurrence is rewritten; the same letters
       earlier in the line (``"St Marks St"``, ``"Drury Dr"``) are untouched.

    Parameters
    ----------
    s : str
        The address line to normalise.

    Returns
    -------
    str
        The normalised line, or ``s`` unchanged when no rule matches.
    """
    if "None" in s:
        return s.replace("None", " ")

    for pattern, full_name in _STREET_SUFFIXES:
        if pattern.search(s):
            # Substitute the anchored match only; str.replace would also
            # rewrite identical letters elsewhere in the line.
            return pattern.sub(full_name, s, count=1)

    return s
# Normalise the address lines of both parties with `encode`.
# Series.map visits only the column being rewritten (the previous row-wise
# DataFrame.apply built a full row object per record for each of the four
# columns) and it also behaves correctly when the frame has no rows.
for address_col in ("P1_ADDRESS_LINE_1", "P1_ADDRESS_LINE_3",
                    "P2_ADDRESS_LINE_1", "P2_ADDRESS_LINE_3"):
    # str(...) turns missing values into text first, so encode always gets a string.
    data[address_col] = data[address_col].map(lambda value: encode(str(value)))


# Separate dataset into train and test

It is important to separate our data into training and testing sets. 

When we engineer features, some techniques learn parameters from data. It is important to learn these parameters only from the train set. This is to avoid over-fitting.

In [5]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

# to divide train and test set
from sklearn.model_selection import train_test_split

# Column holding the label we want to predict.
TARGET = 'IFFAMILY'

# Columns excluded from the predictors: contact / household IDs, the target
# itself, first / middle / last names and the second address lines.
NON_FEATURE_COLS = [
    'P1_CONTACT_ID', 'P2_CONTACT_ID',
    'P1_HOUSEHOLD_ID', 'P2_HOUSEHOLD_ID',
    TARGET,
    'P1_FIRSTNAME', 'P1_MIDDLENAME', 'P2_FIRSTNAME', 'P2_MIDDLENAME',
    'P1_ADDRESS_LINE_2', 'P2_ADDRESS_LINE_2',
    'P1_LASTNAME', 'P2_LASTNAME',
]

X_train, X_test, y_train, y_test = train_test_split(
    # `columns=` already says what to drop; the former `axis=13` was not a
    # valid axis and only worked because pandas ignores `axis` in that case.
    data.drop(columns=NON_FEATURE_COLS),  # predictive variables
    data[TARGET],  # target
    test_size=0.2,  # portion of dataset to allocate to test set
    random_state=2,  # we are setting the seed here
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape



((1601, 10), (401, 10), (1601,), (401,))

## Let's identify the different variables

In [6]:
# The data was already cleaned up with Snowpark, so the variable groups can
# simply be listed by hand.

# Categorical (object-typed) variables: address lines first, then
# city / state / zip, for party 1 and party 2.
cat_vars = [
    'P1_ADDRESS_LINE_1',
    'P1_ADDRESS_LINE_3',
    'P2_ADDRESS_LINE_1',
    'P2_ADDRESS_LINE_3',
    'P1_CITY',
    'P1_STATE',
    'P1_ZIP',
    'P2_CITY',
    'P2_STATE',
    'P2_ZIP',
]

# Numerical variables carried over from the previous notebook (none at present).
num_vars = []

# Full feature list: categorical variables followed by numerical ones.
features = [*cat_vars, *num_vars]
print(features)

['P1_ADDRESS_LINE_1', 'P1_ADDRESS_LINE_3', 'P2_ADDRESS_LINE_1', 'P2_ADDRESS_LINE_3', 'P1_CITY', 'P1_STATE', 'P1_ZIP', 'P2_CITY', 'P2_STATE', 'P2_ZIP']


## Numerical variable transformation

In the previous notebook, we observed that the numerical variables are not normally distributed.

We will rescale the numerical variables to the [0, 1] range with the MinMaxScaler (this changes their scale, not the shape of their distribution), use ordinal encoding for the categorical variables, and first check for nulls.


In [None]:
# Check absence of NA in both splits.
# A notebook cell only displays its final expression, so the two results are
# collected and shown together; otherwise the train-set check would be
# computed and silently discarded.

# columns of the train set holding at least one missing value
na_cols_train = X_train.columns[X_train.isnull().any()].tolist()

# columns of the test set holding at least one missing value
na_cols_test = X_test.columns[X_test.isnull().any()].tolist()

{"train": na_cols_train, "test": na_cols_test}

[]

In [None]:
# setup pipeline

#transformations
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

#Classifier
from sklearn.ensemble import RandomForestClassifier

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

#Model Accuracy
from sklearn.metrics import balanced_accuracy_score


def cast_to_str(frame):
    """Return ``frame`` with every value cast to ``str``.

    Used as the first step of the categorical branch so that the ordinal
    encoder always receives text, whatever dtype the column arrived with.
    """
    return frame.astype(str)


# Categorical branch: stringify, then map each category to an integer code.
# Categories never seen during fit are encoded as -1 instead of raising.
ord_pipe = make_pipeline(
    FunctionTransformer(cast_to_str),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# Numerical branch: fill missing values with 0, then rescale to [0, 1].
num_pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
    MinMaxScaler(),
)

# Route each variable group to its own branch. Chaining the two branches one
# after the other would run the imputer and scaler on the ordinal codes of the
# categorical columns rather than on the numerical columns.
# Columns listed in neither cat_vars nor num_vars are dropped (the
# ColumnTransformer default); an empty num_vars is simply skipped.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ord_pipe, cat_vars),
        ('num', num_pipe, num_vars),
    ]
)

# Seeded so that repeated fits give the same forest.
clf = make_pipeline(RandomForestClassifier(random_state=0, n_jobs=-1))

# Model Pipeline: preprocessing followed by the classifier
model = make_pipeline(preprocessor, clf)

# fit the model
model.fit(X_train, y_train)



In [None]:
# dill is imported under the name `pickle`, so the dump below is a dill dump.
import dill as pickle

# Serialise the fitted pipeline to model.pkl in the working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

## Check Accuracy of our model on test dataset

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Predicted probability of the second class in model.classes_ for each test row.
y_pred = model.predict_proba(X_test)[:,1]

# Hard class labels straight from the model. They come back in the same dtype
# as y_test, so the metrics below never compare strings against numbers
# (hand-rounding the probabilities and casting to str only works when the
# labels happen to be the strings '0' and '1').
predictions = model.predict(X_test)

# The message reports *balanced* accuracy (mean of per-class recall), so
# compute exactly that rather than plain (TP+TN)/total accuracy.
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
print("Model testing completed.\n   - Model Balanced Accuracy: %.2f%%" % (balanced_accuracy * 100.0))

## Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions, plus the plain accuracy
# derived from its four cells.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)

# Flattened row by row, the four cells come out as TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

for label, count in (
    ('True Positive(TP)  = ', TP),
    ('False Positive(FP) = ', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(label, count)

# Share of all test rows that were classified correctly.
correct = TP + TN
total = TP + FP + TN + FN
accuracy = correct / total

print('Accuracy of the classification = {:0.3f}'.format(accuracy))

## Check for important features

In [None]:
# Feature importance
from sklearn.inspection import permutation_importance

# Permutation importance shuffles each column at random and measures how much
# the score drops; the seed is pinned so a re-run reproduces the same ranking.
perm_importance = permutation_importance(model, X_test, y_test, random_state=0)

# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = perm_importance.importances_mean.argsort()

fig, ax = plt.subplots()
ax.barh(np.array(X_test.columns)[sorted_idx], perm_importance.importances_mean[sorted_idx])
ax.set_xlabel("Feature Importance")
ax.set_title("Permutation importance on the test set");

In [None]:
# Close the Snowpark session that was opened in the connection cell.
session.close()