# Classification

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import is_object_dtype
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('customer-churn.csv')

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Cleaning

In [5]:
def standardise_colnames(df: pd.DataFrame) -> pd.DataFrame:
  df = df.copy()
  df.columns = df.columns.str.lower().str.replace(' ', '_')
  return df

In [6]:
def standardise_str_cols(df: pd.DataFrame) -> pd.DataFrame:
  df = df.copy()

  for col in df.columns:
    if is_object_dtype(df[col]):
      df[col] = df[col].str.lower().str.replace(' ', '_')
  
  return df

In [7]:
df = standardise_colnames(df)
df = standardise_str_cols(df)

In [8]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce').fillna(0)
df.churn = (df.churn == 'yes').astype('int')

In [None]:
df.drop('customerid', axis=1, inplace=True)

## Validation Framework

In [9]:
y = df.churn

In [10]:
df_train_full, df_test, y_train_full, y_test = train_test_split(
  df.drop('churn', axis=1), y, test_size=.2, random_state=1)
df_train, df_val, y_train, y_val = train_test_split(
  df_train_full, y_train_full, test_size=.25, random_state=1)

In [11]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(4225, 20)
(1409, 20)
(1409, 20)


In [12]:
df_train_full.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

## Exploratory Data Analysis

In [13]:
df.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [14]:
df.churn.value_counts(normalize=True)

churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64

In [15]:
global_churn_rate = round(df.churn.mean(), 2)
global_churn_rate

0.27

In [16]:
numeric_vars = df_train_full.select_dtypes('number').columns
categorical_vars = df_train_full.select_dtypes(exclude='number').columns

In [17]:
df.select_dtypes(exclude='number').nunique()

customerid          7043
gender                 2
partner                2
dependents             2
phoneservice           2
multiplelines          3
internetservice        3
onlinesecurity         3
onlinebackup           3
deviceprotection       3
techsupport            3
streamingtv            3
streamingmovies        3
contract               3
paperlessbilling       2
paymentmethod          4
dtype: int64

### Feature Importance
#### Churn Rate

In [18]:
df.groupby(['gender']).churn.agg(['mean', 'count']) / global_churn_rate # risk ratio

Unnamed: 0_level_0,mean,count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.997069,12918.518519
male,0.968901,13166.666667


In [19]:
df.groupby(['partner']).churn.agg(['mean', 'count']) / global_churn_rate # risk ratio

Unnamed: 0_level_0,mean,count
partner,Unnamed: 1_level_1,Unnamed: 2_level_1
no,1.220666,13485.185185
yes,0.72833,12600.0


In [20]:
from IPython.display import display

In [21]:
for c in categorical_vars:
  df_group = df.groupby(c).churn.agg(['mean', 'count'])
  df_group['diff'] = df_group['mean'] - global_churn_rate
  df_group['risk'] = df_group['mean'] / global_churn_rate
  display(df_group)
  print()

Unnamed: 0_level_0,mean,count,diff,risk
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0002-orfbo,0.0,1,-0.27,0.000000
0003-mknfe,0.0,1,-0.27,0.000000
0004-tlhlj,1.0,1,0.73,3.703704
0011-igkff,1.0,1,0.73,3.703704
0013-exchz,1.0,1,0.73,3.703704
...,...,...,...,...
9987-lutyd,0.0,1,-0.27,0.000000
9992-rramn,1.0,1,0.73,3.703704
9992-ujoel,0.0,1,-0.27,0.000000
9993-lhieb,0.0,1,-0.27,0.000000





Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.269209,3488,-0.000791,0.997069
male,0.261603,3555,-0.008397,0.968901





Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.32958,3641,0.05958,1.220666
yes,0.196649,3402,-0.073351,0.72833





Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.312791,4933,0.042791,1.158487
yes,0.154502,2110,-0.115498,0.572231





Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.249267,682,-0.020733,0.923211
yes,0.267096,6361,-0.002904,0.989246





Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.250442,3390,-0.019558,0.927565
no_phone_service,0.249267,682,-0.020733,0.923211
yes,0.286099,2971,0.016099,1.059626





Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.189591,2421,-0.080409,0.702189
fiber_optic,0.418928,3096,0.148928,1.551584
no,0.07405,1526,-0.19595,0.274259





Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.417667,3498,0.147667,1.546916
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.146112,2019,-0.123888,0.541155





Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.399288,3088,0.129288,1.478843
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.215315,2429,-0.054685,0.797463





Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.391276,3095,0.121276,1.449171
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.225021,2422,-0.044979,0.83341





Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.416355,3473,0.146355,1.542055
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.151663,2044,-0.118337,0.561716





Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.335231,2810,0.065231,1.241597
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.300702,2707,0.030702,1.113711





Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.336804,2785,0.066804,1.247423
no_internet_service,0.07405,1526,-0.19595,0.274259
yes,0.299414,2732,0.029414,1.108942





Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.427097,3875,0.157097,1.58184
one_year,0.112695,1473,-0.157305,0.41739
two_year,0.028319,1695,-0.241681,0.104884





Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.163301,2872,-0.106699,0.604818
yes,0.335651,4171,0.065651,1.243152





Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.167098,1544,-0.102902,0.618883
credit_card_(automatic),0.152431,1522,-0.117569,0.564559
electronic_check,0.452854,2365,0.182854,1.677237
mailed_check,0.191067,1612,-0.078933,0.707656





### Mutual Information

In [22]:
from sklearn.metrics import mutual_info_score

In [24]:
mutual_info_score(df.churn, df.contract)

0.09845305342598898

In [28]:
def compute_churn_mutual_info(series: pd.Series) -> float:
  return mutual_info_score(df.churn, series)

In [32]:
mutual_scores = df.select_dtypes(exclude='number').apply(compute_churn_mutual_info)
mutual_scores.sort_values(ascending=False)

customerid          0.578599
contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64

### Correlation