This is a starter notebook for the updated module 5 of ML Zoomcamp.

The code is based on modules 3 and 4 and uses the same dataset: [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Record the library versions this notebook was executed with, one per line.
print(
    f'pandas=={pd.__version__}',
    f'numpy=={np.__version__}',
    f'sklearn=={sklearn.__version__}',
    sep='\n',
)

pandas==2.3.1
numpy==2.3.1
sklearn==1.7.0


In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
# Raw telco churn CSV, fetched straight from GitHub.
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

df = pd.read_csv(data_url)

# Normalise the column names: lower case, underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(' ', '_')

# Values that cannot be parsed as numbers become NaN and are then replaced by 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 where churn is 'yes', 0 otherwise.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [5]:
# Target vector: the 0/1 churn column of the whole frame (no split is made here).
y_train = df['churn']

In [6]:
# Numeric features passed to the model.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

# Features listed as categorical, in the order they are selected from the frame.
categorical = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone
    'phoneservice', 'multiplelines',
    # internet and add-ons
    'internetservice', 'onlinesecurity', 'onlinebackup',
    'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [7]:
# One dict per row, holding only the selected feature columns.
train_dict = df[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training dicts and build the feature matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)

# fit() returns the estimator itself, so the fitted classifier can be bound directly.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [8]:
# Peek at the first three training records.
train_dict[:3]

[{'gender': 'female',
  'seniorcitizen': 0,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'no',
  'multiplelines': 'no_phone_service',
  'internetservice': 'dsl',
  'onlinesecurity': 'no',
  'onlinebackup': 'yes',
  'deviceprotection': 'no',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'month-to-month',
  'paperlessbilling': 'yes',
  'paymentmethod': 'electronic_check',
  'tenure': 1,
  'monthlycharges': 29.85,
  'totalcharges': 29.85},
 {'gender': 'male',
  'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'one_year',
  'paperlessbilling': 'no',
  'paymentmethod': 'mailed_check',
  'tenure': 34,
  'monthlycharges': 56.95,
  'totalcharges': 1889.5},
 {'gender': 'male',
  'seniorc

In [9]:
# A single example customer, using the same keys as the training records.
customer = {
    # customer profile
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',

    # phone
    'phoneservice': 'yes',
    'multiplelines': 'no',

    # internet and add-ons
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',

    # contract and billing
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'mailed_check',

    # numeric features
    'tenure': 6,
    'monthlycharges': 53.85,
    'totalcharges': 163.55,
}

In [10]:
# Encode the single customer with the fitted vectorizer (passed as a one-record batch).
X_customer = dv.transform([customer])

In [11]:
# Row 0 is the only customer; column 1 is the probability of the positive (churn = 1) class.
churn = model.predict_proba(X_customer)[0][1]

In [12]:
import pickle

In [13]:
# Save the vectorizer and the classifier together as one (dv, model) tuple.
with open("model.bin", "wb") as model_file:
    pickle.dump((dv, model), model_file)

In [14]:
# Read back the (dv, model) tuple. Only unpickle files you trust:
# pickle.load can execute arbitrary code.
with open("model.bin", "rb") as model_file:
    dv, model = pickle.load(model_file)

In [15]:
# Example customer to score with the reloaded vectorizer and model.
customer = {
    # customer profile
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',

    # phone
    'phoneservice': 'yes',
    'multiplelines': 'no',

    # internet and add-ons
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',

    # contract and billing
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'mailed_check',

    # numeric features
    'tenure': 6,
    'monthlycharges': 23.85,
    'totalcharges': 263.55,
}

# Encode the customer as a one-record batch, then take the positive-class probability.
X_customer = dv.transform([customer])
churn = model.predict_proba(X_customer)[0, 1]

# Act only when the predicted churn probability reaches 0.5.
print("Send email with promo" if churn >= 0.5 else "Don't do anything")

Don't do anything


In [16]:
from sklearn.pipeline import make_pipeline

In [17]:
# Chain the vectorizer and the classifier so raw dicts can be passed to a single object.
pipeline = make_pipeline(DictVectorizer(), LogisticRegression(solver='liblinear'))

In [18]:
# One dict per row, holding only the selected feature columns.
train_dict = df[categorical + numerical].to_dict(orient='records')

# Pipeline.fit returns the pipeline itself, so from here on `model` is just
# another name for the fitted `pipeline`.
pipeline.fit(train_dict, y_train)
model = pipeline

In [19]:
# Overwrite model.bin with the whole pipeline as a single object.
with open("model.bin", "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [20]:
# Read the pipeline back. Only unpickle files you trust:
# pickle.load can execute arbitrary code.
with open("model.bin", "rb") as model_file:
    pipeline = pickle.load(model_file)

In [21]:
# Example customer to score with the reloaded pipeline.
customer = {
    # customer profile
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',

    # phone
    'phoneservice': 'yes',
    'multiplelines': 'no',

    # internet and add-ons
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',

    # contract and billing
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'mailed_check',

    # numeric features
    'tenure': 6,
    'monthlycharges': 23.85,
    'totalcharges': 263.55,
}

# The pipeline takes the raw dict (as a one-record batch); column 1 is the churn probability.
churn = pipeline.predict_proba([customer])[0, 1]
print("Probability of churn = ", churn)

# Act only when the predicted churn probability reaches 0.5.
print("Send email with promo" if churn >= 0.5 else "Don't do anything")

Probability of churn =  0.22286804024539306
Don't do anything


In [22]:
# Frequency table for each categorical feature, each followed by a blank line.
for c in categorical:
    print(df[c].value_counts(), end='\n\n')

# Summary statistics for each numerical feature, each followed by a blank line.
for n in numerical:
    print(df[n].describe(), end='\n\n')

gender
male      3555
female    3488
Name: count, dtype: int64

seniorcitizen
0    5901
1    1142
Name: count, dtype: int64

partner
no     3641
yes    3402
Name: count, dtype: int64

dependents
no     4933
yes    2110
Name: count, dtype: int64

phoneservice
yes    6361
no      682
Name: count, dtype: int64

multiplelines
no                  3390
yes                 2971
no_phone_service     682
Name: count, dtype: int64

internetservice
fiber_optic    3096
dsl            2421
no             1526
Name: count, dtype: int64

onlinesecurity
no                     3498
yes                    2019
no_internet_service    1526
Name: count, dtype: int64

onlinebackup
no                     3088
yes                    2429
no_internet_service    1526
Name: count, dtype: int64

deviceprotection
no                     3095
yes                    2422
no_internet_service    1526
Name: count, dtype: int64

techsupport
no                     3473
yes                    2044
no_internet_service    15