In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

### Data Preparation

In [None]:
# Read the dataset; the path is resolved relative to the working directory.
df = pd.read_csv('data-week-3.csv')

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Normalise column names to snake_case: lower-case, spaces -> underscores.
# regex=False makes the replacement an explicit literal substitution.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Select the text columns by dtype. Listing 'string' next to 'object' keeps the
# selection working when pandas stores text in its dedicated string dtype
# instead of generic object columns; with an object-only check those columns
# would be skipped silently and later comparisons such as == 'yes' would
# never match.
categorical_columns = list(df.select_dtypes(include=['object', 'string']).columns)

# Apply the same normalisation to the values of every text column.
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_', regex=False)

In [None]:
# Transposed preview: one row per column, so a wide frame stays readable.
df.head().transpose()

In [None]:
# Numeric view of totalcharges; errors='coerce' turns unparseable entries into NaN.
tc = pd.to_numeric(df['totalcharges'], errors='coerce')

In [None]:
# Convert totalcharges to a numeric dtype; unparseable entries become NaN.
# Bracket assignment always targets the column, whereas attribute-style
# assignment only does so when a column of that name already exists.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [None]:
# Replace missing totalcharges values with 0.
# Bracket assignment is used so the write always lands on the column.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [None]:
# Encode the target as integers: 1 where churn == 'yes', 0 otherwise.
# The dtype guard keeps the cell safe to re-run: once the column is already
# integer-coded, comparing it to 'yes' again would reset every label to 0.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

### Setting up the Validation Framework
- Perform the train/validation/test split with Scikit-Learn

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle
# so the split is reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Row counts of the two partitions.
df_full_train.shape[0], df_test.shape[0]

In [None]:
# Split the remaining 80% again: 25% of it is 20% of the full dataset,
# giving a 60/20/20 train/validation/test split overall.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Row counts of the three partitions.
df_train.shape[0], df_val.shape[0], df_test.shape[0]

In [None]:
# Give each partition a fresh 0..n-1 index; drop=True discards the shuffled
# original index instead of keeping it as a column.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Pull the target out of each partition as a NumPy array.
y_train, y_val, y_test = (
    part['churn'].values for part in (df_train, df_val, df_test)
)

In [None]:
# Remove the target from the feature frames so it cannot be used as a feature.
# drop() returns a new frame, and errors='ignore' makes the cell a no-op when
# it is re-run after the column is already gone (a plain `del` would raise
# KeyError on the second execution).
df_train = df_train.drop(columns='churn', errors='ignore')
df_val = df_val.drop(columns='churn', errors='ignore')
df_test = df_test.drop(columns='churn', errors='ignore')

### EDA
- Check missing values
- Look at the target variable (churn)
- Look at numerical and categorical variables

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index rather than
# keeping it as a column.
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
# Share of each target value (normalize=True returns proportions, not counts).
df_full_train['churn'].value_counts(normalize=True)

In [None]:
# The mean of the 0/1 target is the overall churn rate.
global_churn_rate = df_full_train['churn'].mean()

# Displayed rounded to two decimals; the stored value keeps full precision.
round(global_churn_rate, 2)

In [None]:
# Columns treated as numerical features in the analyses below.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [None]:
# List every column name, as a reference for building the feature lists.
df_full_train.columns

In [None]:
# Columns treated as categorical features in the analyses below.
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [None]:
# Number of distinct values in each categorical column.
df_full_train[categorical].nunique(axis=0)

### Churn Rate

In [None]:
# First five rows of the combined train + validation data.
df_full_train.head(n=5)

In [None]:
# Churn rate among rows where gender == 'female'.
churn_female = df_full_train.loc[df_full_train['gender'] == 'female', 'churn'].mean()
churn_female

In [None]:
# Churn rate among rows where gender == 'male'.
churn_male = df_full_train.loc[df_full_train['gender'] == 'male', 'churn'].mean()
churn_male

In [None]:
# Overall churn rate, the baseline each group is compared against.
global_churn = df_full_train['churn'].mean()
global_churn

In [None]:
# How many rows fall into each partner value.
df_full_train['partner'].value_counts()

In [None]:
# Churn rate among rows where partner == 'yes'.
churn_partner = df_full_train.loc[df_full_train['partner'] == 'yes', 'churn'].mean()
churn_partner

### Difference

In [None]:
# Absolute difference from the baseline: a positive value means the
# partner == 'yes' group churns less than the overall rate.
global_churn - churn_partner

In [None]:
# Churn rate among rows where partner == 'no'.
churn_no_partner = df_full_train.loc[df_full_train['partner'] == 'no', 'churn'].mean()
churn_no_partner

In [None]:
# Absolute difference from the baseline: a negative value means the
# partner == 'no' group churns more than the overall rate.
global_churn - churn_no_partner

### Risk Ratio

In [None]:
# Risk ratio for partner == 'no': above 1 means the group churns more than
# the overall rate, below 1 means it churns less.
churn_no_partner / global_churn

In [None]:
# Risk ratio for partner == 'yes': above 1 means the group churns more than
# the overall rate, below 1 means it churns less.
churn_partner / global_churn

In [None]:
from IPython.display import display

In [None]:
# For every categorical feature, tabulate per-value churn rate and group size,
# then add the absolute difference from and the ratio to the overall rate.
for c in categorical:
    df_group = (
        df_full_train
        .groupby(c)['churn']
        .agg(['mean', 'count'])
        .assign(
            diff=lambda g: g['mean'] - global_churn,
            risk=lambda g: g['mean'] / global_churn,
        )
    )
    display(df_group)
    # Two blank lines between tables.
    print('\n')

### Feature Importance: Mutual Information

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
# Mutual information between the target and the contract column.
mutual_info_score(df_full_train['churn'], df_full_train['contract'])

In [None]:
# Mutual information between the gender column and the target.
mutual_info_score(df_full_train['gender'], df_full_train['churn'])

In [None]:
# Mutual information between the partner column and the target.
mutual_info_score(df_full_train['partner'], df_full_train['churn'])

In [None]:
def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and the churn target.

    The target is read from the module-level ``df_full_train`` frame, so
    ``series`` should hold one value per row of that frame.
    """
    target = df_full_train['churn']
    return mutual_info_score(series, target)

In [None]:
# Score every categorical column against the target (apply works column-wise
# with axis=0), then show the columns with the highest scores first.
mi = df_full_train[categorical].apply(mutual_info_churn_score, axis=0)
mi.sort_values(ascending=False)

### Feature Importance: Correlation

In [None]:
# Largest tenure value in the combined train + validation data.
df_full_train['tenure'].max()

In [None]:
# All rows, restricted to the numerical feature columns.
df_full_train.loc[:, numerical]