In [119]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as split


%matplotlib inline

def norm_cols(df):
    """Normalize the column labels of ``df`` in place.

    Each label is lower-cased and every space in it is replaced by an
    underscore. The frame that was passed in is modified and returned.
    """
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df


def norm_values(df):
    """Normalize the text cells of ``df`` in place and return the same frame.

    Every string value in an object- or string-typed column is lower-cased
    and has its spaces replaced by underscores. Cells that are not strings
    (numbers, None/NaN, ...) are left untouched; going through the ``.str``
    accessor instead would silently turn them into NaN.
    """
    def _norm_text(value):
        # Only real strings are rewritten; anything else passes through.
        if isinstance(value, str):
            return value.lower().replace(' ', '_')
        return value

    for col, dtype in df.dtypes.items():
        is_object = dtype == object
        # Text may be stored as plain object or as pandas' dedicated string dtype.
        if not (is_object or isinstance(dtype, pd.StringDtype)):
            continue
        cleaned = df[col].map(_norm_text)
        # Cast back so that a dedicated string column keeps its dtype.
        df[col] = cleaned if is_object else cleaned.astype(dtype)
    return df


def make_binary(df, col, val='yes'):
    """Replace column ``col`` of ``df`` with a 0/1 indicator, in place.

    A cell becomes 1 when it equals ``val`` and 0 otherwise. The frame that
    was passed in is modified and returned.
    """
    is_match = df[col].eq(val)
    df[col] = is_match.astype(int)
    return df

In [120]:
# Load the churn CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/churn/churndata.csv')

In [121]:
# Quick look at the raw frame: row count, a sample of rows, and column dtypes.
# Only the last expression of a cell is rendered automatically, so the
# intermediate views are passed to display() (a builtin in IPython kernels);
# as bare expressions they were evaluated and then discarded.
print(len(df))
display(df.head())
display(df.head().T)
df.dtypes

7043


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [122]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Show the id and raw value of every row whose TotalCharges failed to parse.
df.loc[total_charges.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [123]:
# Normalize everything
# TotalCharges: coerce to numeric and treat unparsable entries as 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
# Lower-case / underscore the column names and text values, then turn the
# 'churn' column into a 0/1 indicator ('yes' -> 1).
df = (
    df
    .pipe(norm_cols)
    .pipe(norm_values)
    .pipe(make_binary, 'churn')
)

In [124]:
# Create data sets
# First hold out 20% of the rows for testing, then carve a validation set
# (22% of the remainder) out of what is left.
df_train_full, df_test = split(df, test_size=0.20, random_state=1)
df_train, df_val = split(df_train_full, test_size=0.22, random_state=11)

# Move the target out of the feature frames: pop() returns the 'churn'
# column and removes it from the frame in one step.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [129]:
# Analyze the data
# Only the last expression of a cell is rendered automatically, so each
# intermediate result is passed to display() (a builtin in IPython kernels);
# as bare expressions they were evaluated and then discarded.

# Find missing values
display(df_train_full.isnull().sum())

# Check the distribution of the target variable
display(df_train_full.churn.value_counts())

# Check churn rate
global_mean = df_train_full.churn.mean()
display(round(global_mean, 3))

# Split column types
numerical = ['tenure', 'monthlycharges', 'totalcharges']
ignored = ['customerid'] + numerical
# Built from df_train's columns, so 'churn' is only excluded if it has
# already been removed from df_train.
categorical = [col for col in df_train.columns if col not in ignored]

# See number of distinct values per categorical column
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64