# Deployment

In [181]:
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle

In [182]:
df = pd.read_csv('customer-churn.csv')

In [183]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Cleaning

In [184]:
def standardise_colnames(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with normalised column labels.

  Every label is lower-cased and its spaces are replaced by underscores.
  The frame passed in is not modified.
  """
  renamed = df.copy()
  lowered = renamed.columns.str.lower()
  renamed.columns = lowered.str.replace(' ', '_')
  return renamed

In [185]:
def standardise_str_cols(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with the values of every text column normalised.

  Values in text columns are lower-cased and their spaces are replaced by
  underscores. A column counts as text when it has ``object`` dtype or a
  pandas string dtype; checking both keeps the function working when strings
  are not stored as ``object`` (e.g. ``dtype='string'``). All other columns
  are left as they are, and the frame passed in is not modified.
  """
  df = df.copy()

  for col in df.columns:
    column = df[col]
    if is_object_dtype(column) or pd.api.types.is_string_dtype(column):
      # regex=False: the space is a literal character, not a pattern.
      df[col] = column.str.lower().str.replace(' ', '_', regex=False)

  return df

In [186]:
# Normalise the column labels first, then the text values inside the columns.
df = df.pipe(standardise_colnames).pipe(standardise_str_cols)

In [187]:
# Values that cannot be parsed as numbers become NaN and are then replaced by 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as 1 for 'yes' and 0 otherwise. The dtype guard keeps the
# cell safe to re-run: comparing an already-encoded numeric column with 'yes'
# would reset every label to 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
  df['churn'] = (df['churn'] == 'yes').astype('int')

In [188]:
# Remove customerid so it is not passed on to the feature encoder.
# errors='ignore' lets the cell be re-run after the column is already gone.
df = df.drop(columns='customerid', errors='ignore')

## Validation Framework

In [189]:
# First split: 20% of all rows are held out as the test set.
df_train_full, df_test, y_train_full, y_test = train_test_split(
  df.drop(columns='churn'),
  df['churn'],
  test_size=0.2,
  random_state=1,
)
# Second split: 25% of the remaining rows become the validation set.
df_train, df_val, y_train, y_val = train_test_split(
  df_train_full,
  y_train_full,
  test_size=0.25,
  random_state=1,
)

In [190]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(4225, 19)
(1409, 19)
(1409, 19)


In [191]:
# Give every split a fresh 0..n-1 index. Rebinding the names (instead of
# inplace=True) avoids mutating the frames returned by train_test_split.
df_train_full = df_train_full.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [192]:
# Convert the targets to plain NumPy arrays. np.asarray accepts both a Series
# and an ndarray, so re-running the cell does not fail the way `.values` does
# once the names already hold arrays.
y_train_full = np.asarray(y_train_full)
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)
y_test = np.asarray(y_test)

In [193]:
type(df_train), type(y_train)

(pandas.core.frame.DataFrame, numpy.ndarray)

## Feature Preparation

In [175]:
def encode_vars(df: pd.DataFrame, dv: "DictVectorizer | None" = None):
  """Encode the rows of ``df`` with a ``DictVectorizer``.

  Args:
    df: frame to encode; each row is converted to a ``{column: value}`` dict.
    dv: vectorizer to reuse. When ``None``, a new
      ``DictVectorizer(sparse=False)`` is created and fitted on ``df``.

  Returns:
    A tuple ``(encoded_data, dv)``: the result of ``dv.transform`` on the row
    dicts, and the vectorizer that produced it.
  """
  df_dicts = df.to_dict(orient='records')

  # Test against None explicitly: a truthiness check would discard and refit
  # any supplied vectorizer-like object that happens to evaluate as falsy.
  if dv is None:
    dv = DictVectorizer(sparse=False)
    dv.fit(df_dicts)

  encoded_data = dv.transform(df_dicts)

  return encoded_data, dv

In [176]:
X_train, dv = encode_vars(df_train)
X_val, _ = encode_vars(df_val, dv)
X_test, _ = encode_vars(df_test, dv)

In [177]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(4225, 45)
(1409, 45)
(1409, 45)


In [179]:
type(X_train), type(y_train)

(numpy.ndarray, numpy.ndarray)

## Logistic Regression

In [180]:
def train(df: pd.DataFrame, y_train: np.ndarray, C: float = 1.0):
  """Fit a feature encoder and a logistic regression model.

  Args:
    df: feature frame; it is encoded with a newly fitted vectorizer via
      ``encode_vars``.
    y_train: target values passed to ``model.fit`` alongside the encoded rows.
    C: value forwarded to ``LogisticRegression(C=...)``.

  Returns:
    A tuple ``(dv, model)``: the fitted vectorizer and the fitted model.
  """
  X_train, dv = encode_vars(df)
  model = LogisticRegression(max_iter=10000, C=C)
  model.fit(X_train, y_train)

  return dv, model

In [17]:
def predict(df: pd.DataFrame, dv, model):
  """Return the positive-class probability for every row of ``df``.

  Args:
    df: feature frame (``encode_vars`` calls ``df.to_dict`` on it, so it must
      be a DataFrame rather than an array).
    dv: fitted vectorizer used to encode ``df``; it is reused, not refitted.
    model: fitted classifier exposing ``predict_proba``.

  Returns:
    Column 1 of ``model.predict_proba`` for the encoded rows.
  """
  X, _ = encode_vars(df, dv)
  y_pred = model.predict_proba(X)[:, 1]

  return y_pred

In [18]:
C = 1.0
n_splits = 5

In [19]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

# Fold-specific names are used on purpose: reusing df_train / df_val /
# y_train / y_val / dv / model here would overwrite the hold-out split and
# leave the previously encoded X_train / X_val out of step with them.
for train_idx, val_idx in kfold.split(df_train_full):
  df_fold_train = df_train_full.iloc[train_idx]
  df_fold_val = df_train_full.iloc[val_idx]

  y_fold_train = y_train_full[train_idx]
  y_fold_val = y_train_full[val_idx]

  fold_dv, fold_model = train(df_fold_train, y_fold_train, C=C)
  y_fold_pred = predict(df_fold_val, fold_dv, fold_model)

  # One AUC per fold; the mean and standard deviation are reported below.
  scores.append(roc_auc_score(y_fold_val, y_fold_pred))

print(f'C={C} {np.mean(scores):.3f} {np.std(scores):.3f}')

C=1.0 0.842 0.007


In [20]:
scores

[0.8443767613096687,
 0.8449522414249768,
 0.8335460565924142,
 0.8347289281831554,
 0.8517343352891344]

In [21]:
dv, model = train(df_train_full, y_train_full, C=C)
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
auc

0.8584627926376114

## Model Deployment
1. train a model in a notebook and save to a binary file
2. load this file from a web service that accepts requests with new data
3. isolate the dependencies for the web service from other services
4. isolate system dependencies (Docker)
5. deploy the model to the cloud (AWS EB)

### Saving the model

In [26]:
output_file = f'model_C={C}.bin'

In [27]:
with open(output_file, 'wb') as f_out:
  pickle.dump((dv, model), f_out)

### Loading the model

In [31]:
input_file = f'model_C={C}.bin'

In [32]:
# Read back the (dv, model) tuple from input_file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files from a source you trust.
with open(input_file, 'rb') as f_in:
  dv, model = pickle.load(f_in)

In [33]:
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=10000))

In [34]:
customer = {
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': 29.85
}

In [37]:
X = dv.transform([customer])
y_pred = model.predict_proba(X)[0, 1]

