# Classification

In [58]:
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
import random

In [59]:
df = pd.read_csv('customer-churn.csv')

In [60]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Cleaning

In [61]:
def standardise_colnames(df: pd.DataFrame) -> pd.DataFrame:
  df = df.copy()
  df.columns = df.columns.str.lower().str.replace(' ', '_')
  return df

In [62]:
def standardise_str_cols(df: pd.DataFrame) -> pd.DataFrame:
  df = df.copy()

  for col in df.columns:
    if is_object_dtype(df[col]):
      df[col] = df[col].str.lower().str.replace(' ', '_')
  
  return df

In [63]:
df = standardise_colnames(df)
df = standardise_str_cols(df)

In [64]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce').fillna(0)
df.churn = (df.churn == 'yes').astype('int')

In [65]:
df.drop('customerid', axis=1, inplace=True)

## Validation Framework

In [66]:
y = df.churn

In [67]:
df_train_full, df_test, y_train_full, y_test = train_test_split(
  df.drop('churn', axis=1), y, test_size=.2, random_state=1)
df_train, df_val, y_train, y_val = train_test_split(
  df_train_full, y_train_full, test_size=.25, random_state=1)

In [68]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(4225, 19)
(1409, 19)
(1409, 19)


In [69]:
df_train_full.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

## Exploratory Data Analysis

In [70]:
df.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [71]:
df.churn.value_counts(normalize=True)

churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64

In [72]:
global_churn_rate = round(df.churn.mean(), 2)
global_churn_rate

0.27

In [73]:
df.select_dtypes(exclude='number').nunique()

gender              2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature Importance
#### Churn Rate

In [74]:
df.groupby(['gender']).churn.agg(['mean', 'count']) / global_churn_rate # risk ratio

Unnamed: 0_level_0,mean,count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.997069,12918.518519
male,0.968901,13166.666667


In [75]:
df.groupby(['partner']).churn.agg(['mean', 'count']) / global_churn_rate # risk ratio

Unnamed: 0_level_0,mean,count
partner,Unnamed: 1_level_1,Unnamed: 2_level_1
no,1.220666,13485.185185
yes,0.72833,12600.0


In [76]:
from IPython.display import display

In [77]:
categorical_vars = df.select_dtypes(exclude='number')

In [78]:
for c in categorical_vars:
  df_group = df.groupby(c).churn.agg(['mean', 'count'])
  df_group['diff'] = df_group['mean'] - global_churn_rate
  df_group['risk'] = df_group['mean'] / global_churn_rate
  display(df_group)
  print()

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.269209,3488,-0.000791,0.997069
male,0.261603,3555,-0.008397,0.968901





Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.32958,3641,0.05958,1.220666
yes,0.196649,3402,-0.073351,0.72833





Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.312791,4933,0.042791,1.158487
yes,0.154502,2110,-0.115498,0.572231





Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.249267,682,-0.020733,0.923211
yes,0.267096,6361,-0.002904,0.989246





Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.250442,3390,-0.019558,0.927565
no_phone_service,0.249267,682,-0.020733,0.923211
yes,0.286099,2971,0.016099,1.059626





Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.189591,2421,-0.080409,0.702189
fiber_optic,0.418928,3096,0.148928,1.551584
no,0.07405,1526,-0.19595,0.274259





Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.417667,3498,0.147667,1.546916
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.146112,2019,-0.123888,0.541155





Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.399288,3088,0.129288,1.478843
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.215315,2429,-0.054685,0.797463





Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.391276,3095,0.121276,1.449171
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.225021,2422,-0.044979,0.83341





Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.416355,3473,0.146355,1.542055
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.151663,2044,-0.118337,0.561716





Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.335231,2810,0.065231,1.241597
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.300702,2707,0.030702,1.113711





Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.336804,2785,0.066804,1.247423
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.299414,2732,0.029414,1.108942





Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.427097,3875,0.157097,1.58184
one_year,0.112695,1473,-0.157305,0.41739
two_year,0.028319,1695,-0.241681,0.104884





Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.163301,2872,-0.106699,0.604818
yes,0.335651,4171,0.065651,1.243152





Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.167098,1544,-0.102902,0.618883
credit_card_(automatic),0.152431,1522,-0.117569,0.564559
electronic_check,0.452854,2365,0.182854,1.677237
mailed_check,0.191067,1612,-0.078933,0.707656





### Mutual Information

In [79]:
from sklearn.metrics import mutual_info_score

In [80]:
mutual_info_score(df.churn, df.contract)

0.09845305342598898

In [81]:
def compute_churn_mutual_info(series: pd.Series) -> float:
  return mutual_info_score(df.churn, series)

In [82]:
mutual_scores = df.select_dtypes(exclude='number').apply(compute_churn_mutual_info)
mutual_scores.sort_values(ascending=False)

contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64

### Correlation

In [83]:
def compute_churn_correlation(series):
  return series.corrwith(df.churn)

In [84]:
corrs = df.select_dtypes('number').corrwith(df.churn)
corrs.sort_values(ascending=False)

churn             1.000000
monthlycharges    0.193356
seniorcitizen     0.150889
totalcharges     -0.198324
tenure           -0.352229
dtype: float64

In [85]:
df[df.tenure <= 2].churn.mean()

0.5835266821345708

In [86]:
df[(df.tenure > 2) & (df.tenure <= 12)].churn.mean()

0.4033232628398791

In [87]:
df[df.tenure > 12].churn.mean()

0.17129915585752523

In [88]:
df[df.monthlycharges < 20].churn.mean()

0.08972267536704731

In [89]:
df[(df.monthlycharges >= 20) & (df.monthlycharges < 50)].churn.mean()

0.182034503271862

In [90]:
df[df.monthlycharges > 50].churn.mean()

0.3178752107925801

## Feature Preparation

In [91]:
train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [92]:
X_train.shape

(4225, 45)

In [93]:
def encode_categorical_vars(df: pd.DataFrame, encoder: OneHotEncoder = None) -> pd.DataFrame:
  df = df.copy()

  if not encoder:
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64, handle_unknown='infrequent_if_exist')
    encoder.fit(df.select_dtypes(exclude='number'))

  encodings = encoder.transform(df.select_dtypes(exclude='number'))
  encodings_df = pd.DataFrame(encodings, columns=encoder.get_feature_names_out())

  full_df = df.select_dtypes('number').merge(encodings_df, left_index=True, right_index=True)


  return full_df, encoder

In [94]:
df_train, encoder = encode_categorical_vars(df_train)
df_val, _ = encode_categorical_vars(df_val, encoder)

In [95]:
print(df_train.shape)
print(df_val.shape)

(4225, 45)
(1409, 45)


## Logistic Regression

In [96]:
from sklearn.linear_model import LogisticRegression

In [97]:
model = LogisticRegression(max_iter=10000)
model.fit(df_train, y_train)

In [98]:
df_pred = pd.DataFrame({
  'probability': model.predict_proba(df_val)[:,1],
  'prediction': model.predict(df_val),
  'actual': y_val,
  'correct': y_val == model.predict(df_val)})

In [99]:
df_pred.correct.mean()

0.8034066713981547

## Model Evaluation

In [100]:
model.intercept_

array([-0.04667412])

In [101]:
dict(zip(model.feature_names_in_, model.coef_[0].round(3)))

{'seniorcitizen': 0.193,
 'tenure': -0.068,
 'monthlycharges': -0.018,
 'totalcharges': 0.0,
 'gender_female': 0.039,
 'gender_male': 0.0,
 'partner_no': -0.043,
 'partner_yes': 0.082,
 'dependents_no': 0.054,
 'dependents_yes': -0.015,
 'phoneservice_no': 0.068,
 'phoneservice_yes': -0.029,
 'multiplelines_no': -0.188,
 'multiplelines_no_phone_service': 0.068,
 'multiplelines_yes': 0.16,
 'internetservice_dsl': -0.495,
 'internetservice_fiber_optic': 0.694,
 'internetservice_no': -0.159,
 'onlinesecurity_no': 0.284,
 'onlinesecurity_no_internet_service': -0.159,
 'onlinesecurity_yes': -0.086,
 'onlinebackup_no': 0.116,
 'onlinebackup_no_internet_service': -0.159,
 'onlinebackup_yes': 0.082,
 'deviceprotection_no': 0.112,
 'deviceprotection_no_internet_service': -0.159,
 'deviceprotection_yes': 0.086,
 'techsupport_no': 0.234,
 'techsupport_no_internet_service': -0.159,
 'techsupport_yes': -0.036,
 'streamingtv_no': -0.055,
 'streamingtv_no_internet_service': -0.159,
 'streamingtv_yes'

## Using the model

In [102]:
df_train_full.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
dtype: object

In [103]:
df_train_full.select_dtypes(include='number')

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,0,12,19.70,258.35
1,0,42,73.90,3160.55
2,0,71,65.15,4681.75
3,0,71,85.45,6300.85
4,0,30,70.40,2044.75
...,...,...,...,...
5629,1,9,100.50,918.60
5630,0,60,19.95,1189.90
5631,0,28,105.70,2979.50
5632,0,2,54.40,114.10


In [104]:
df_train_full, encoder = encode_categorical_vars(df_train_full)
df_test, _ = encode_categorical_vars(df_test, encoder)

In [105]:
model = LogisticRegression(max_iter=10000)
model.fit(df_train_full, y_train_full)

In [106]:
preds = model.predict(df_test)

In [107]:
(y_test == preds).mean()

0.8126330731014905

In [108]:
i = random.choice(range(df.shape[0]))
test_customer = df.drop('churn', axis=1).iloc[i].to_dict()

In [109]:
test_df = pd.DataFrame([test_customer])

In [110]:
test_df, _ = encode_categorical_vars(test_df, encoder=encoder)

In [111]:
model.predict(test_df)[0]

1

In [112]:
model.predict_proba(test_df)

array([[0.34048548, 0.65951452]])

In [113]:
y.iloc[i]

0