In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory
df = pd.read_csv('data.csv')

In [3]:
# Normalise column names: lower-case, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every string (object) column
categorial_cols = list(df.dtypes[df.dtypes == 'object'].index)

for col in categorial_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Convert totalcharges to numbers; values that cannot be parsed become NaN
# and are then replaced by 0
total_charges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = total_charges.fillna(0)

# Encode the target as 0/1. The guard keeps the cell idempotent: once churn is
# numeric, comparing it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df.churn):
    df.churn = (df.churn == 'yes').astype('int')

In [4]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
y_test = df_test.churn.values

# Feature columns fed to the model; the target column (churn) is in neither list
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorial = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [5]:
def train(df, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given rows.

    Only the columns named in the module-level ``categorial`` and
    ``numerical`` lists are used as features.

    Parameters
    ----------
    df : frame holding the feature columns.
    y_train : target values, one per row of ``df``.
    C : value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted vectorizer and the fitted model.
    """
    feature_cols = categorial + numerical
    records = df[feature_cols].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(features, y_train)
    return dv, model

def predict(df_train, dv, model):
    """Score the rows of ``df_train`` with an already fitted vectorizer and model.

    Despite its name, ``df_train`` is whichever frame should be scored; the
    notebook passes validation and test frames here.

    Parameters
    ----------
    df_train : frame holding the ``categorial`` and ``numerical`` columns.
    dv : fitted vectorizer, used with ``transform`` only (it is not refitted).
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    The second column of ``predict_proba``, i.e. the class-1 probability
    when the model was trained on 0/1 labels.
    """
    feature_cols = categorial + numerical
    records = df_train[feature_cols].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [6]:
# C is handed to LogisticRegression through train(); it is also embedded in the saved model's file name
C = 1.0
# Number of folds used by the KFold splitter
n_splits = 5

In [7]:
# Shuffled K-fold splitter; the fixed random_state keeps the fold assignment reproducible
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

In [8]:
scores = []
for train_idx, val_idx in kfold.split(df_full_train):
    # Positional row selection for the two parts of this fold
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train.churn.values, df_val.churn.values

    # Fit the vectorizer and the model on the training part of the fold
    dv, model = train(df_train, y_train, C=C)

    # Score the held-out part of the fold and record its ROC AUC
    y_pred = predict(df_val, dv, model)
    scores.append(roc_auc_score(y_val, y_pred))

In [9]:
# ROC AUC of each cross-validation fold, in fold order
scores

[0.8437723230055499,
 0.8437437922129519,
 0.8311780052177403,
 0.8301684306452645,
 0.8509028583979168]

In [10]:
## Train the final model on the full training dataset, using the same C that
## was cross-validated above and that is embedded in the saved file name
dv, model = train(df_full_train, df_full_train.churn.values, C=C)
## Evaluate the model on the held-out test dataset
y_test_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_test_pred)
auc

0.8572386167896259

***Save the model***

In [11]:
import pickle

In [12]:
# The value of C is part of the file name, so each setting gets its own file
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [13]:
# Persist the vectorizer and the model together as one tuple, so they are
# always loaded as a matching pair. The with-block closes the file on exit,
# so no explicit close() is needed.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

***Load the model***

In [14]:
import pickle

In [15]:
# Hard-coded artifact name; it has to match the file written by the save step (output_file)
model_file = 'model_C=1.0.bin'

In [16]:
# Restore the (vectorizer, model) tuple written by the save step.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
with open(model_file, mode='rb') as f_in:
    dv, model = pickle.load(f_in)


In [17]:
# Display the two objects restored from the pickle file
dv, model

(DictVectorizer(sparse=False), LogisticRegression(C=1, max_iter=1000))

In [22]:
# A single hand-written customer record. Keys are the feature names used in
# training; string values use the same lower-case / underscore normalisation.
test_customer = {
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'tenure': 67,
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 88.4,
    'totalcharges': 5798.3,
}

# The vectorizer takes a list of records, so the single dict is wrapped in a list.
# Column 1 of predict_proba is the class-1 (churn) probability.
X = dv.transform([test_customer])
model.predict_proba(X)[:, 1]

array([0.13215883])