In [1]:
# Importing packages
import numpy as np
import pandas as pd 

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset.
telco_data = pd.read_csv('data/telco_customer_churn.csv')

# Checking dataframe.
telco_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Cleaning up columns names and categorical variables. 
columns = telco_data.columns.str.lower().str.replace(' ', '_')

# changeing columns names 
telco_data.columns = columns

# Checking for changes made. 
telco_data.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [4]:
# Categorical Features. 
categorical = ['customerid', 'gender', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod',
       'churn']

# Converting to lower case and removing space. 
for c in categorical: 
    telco_data[c] = telco_data[c].str.lower().str.replace(' ', '_')
    print(c)

customerid
gender
partner
dependents
phoneservice
multiplelines
internetservice
onlinesecurity
onlinebackup
deviceprotection
techsupport
streamingtv
streamingmovies
contract
paperlessbilling
paymentmethod
churn


In [5]:
# Checking dataframe.
telco_data.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [6]:
# Converting totalcharge column to number. 
telco_data['totalcharges'] = pd.to_numeric(telco_data['totalcharges'], errors='coerce')

# Filling null values with 0. 
telco_data['totalcharges'] = telco_data['totalcharges'].fillna(0)

# checking for null. 
telco_data['totalcharges'].isna().sum()

0

In [7]:
# converting churn to int
telco_data['churn'] = (telco_data['churn']== 'yes').astype(int)

# Checking churn features.
telco_data.churn.value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [8]:
# Setting up data validation dataset. 
telco_train, telco_test = train_test_split(telco_data, test_size = 0.20, random_state = 1)

telco_train = telco_train.reset_index()
telco_test = telco_test.reset_index()

# Creating y features. 
y_train = telco_train['churn'].values
y_test = telco_test['churn'].values

# Deleting churn from dataset. 
del telco_train['churn']
del telco_test['churn']

In [9]:
telco_test

Unnamed: 0,index,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,...,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,3381,8879-zkjof,female,0,no,no,41,yes,no,dsl,...,no,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),79.85,3320.75
1,6180,0201-mibol,female,1,no,no,66,yes,yes,fiber_optic,...,no,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),102.40,6471.85
2,4829,1600-dilpe,female,0,no,no,12,yes,no,dsl,...,no,no,no,no,no,month-to-month,yes,bank_transfer_(automatic),45.00,524.35
3,3737,8601-qacrs,female,0,no,no,5,yes,yes,dsl,...,no,no,no,no,no,month-to-month,yes,mailed_check,50.60,249.95
4,4249,7919-zodzz,female,0,yes,yes,10,yes,no,dsl,...,yes,yes,no,no,yes,one_year,yes,mailed_check,65.90,660.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,2563,5130-iekqt,male,1,no,no,25,yes,yes,fiber_optic,...,yes,yes,no,yes,yes,month-to-month,no,mailed_check,105.95,2655.25
1405,2028,4452-rohmo,female,0,no,no,15,yes,no,no,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.60,331.60
1406,2899,6164-haqtx,male,0,no,no,71,no,no_phone_service,dsl,...,yes,yes,yes,yes,no,two_year,no,bank_transfer_(automatic),53.95,3888.65
1407,3474,3982-dqlus,male,1,yes,yes,65,yes,yes,fiber_optic,...,yes,no,no,no,no,month-to-month,yes,electronic_check,85.75,5688.45


In [10]:
# Defining a training function. 
def train(df_train, df_y, c = 1.0):
    model = LogisticRegression(C= c , max_iter = 1000)
    dv = DictVectorizer(sparse = False)
    
    dict_train = df_train.to_dict(orient = 'records')
    x_train = dv.fit_transform(dict_train)
    
    model.fit(x_train, df_y)
    
    return model, dv

In [11]:
# Defining prediction function. 
def predict(df_val, model, dv): 
    dict_val = df_val.to_dict(orient = 'records')
    x_val = dv.transform(dict_val)
    
    y_pred = model.predict_proba(x_val)[:, 1]
    
    return y_pred

In [12]:
# Setting parameters.

# Logistic regression parameter.
c = 0.1

In [13]:
# Training model.
model, dv = train(telco_train, y_train, c= c)

# Model prediction.
y_pred = predict(telco_test,model, dv)

# Evaluating prediction with auc. 
auc = roc_auc_score(y_test, y_pred)

print(auc)

0.8567524673101715


In [14]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75
}

In [15]:
def predict_customer(customer, model, dv):
    x = dv.transform(customer)
    y_pred = model.predict_proba(x)[:, 1]
    return y_pred[0].round(3)

In [16]:
# Predicting a single customer
result = predict_customer(customer, model, dv)

# Result of prediction.
result

0.058

In [19]:
# Importing pickle package.
import pickle

In [18]:
# file tile
model = f'model_C={c}.bin'

In [20]:
# writting model to .bin file. 
with open(model, 'wb') as f_out: 
    pickle.dump((model, dv), f_out)