In [70]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display

In [71]:
#import csv
#use full location including raw.githubusercontent
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv", sep = ",")
#print(df.head())
print(df.head().T) # transposes, making rows columns


                                 0             1               2  \
customerID              7590-VHVEG    5575-GNVDE      3668-QPYBK   
gender                      Female          Male            Male   
SeniorCitizen                    0             0               0   
Partner                        Yes            No              No   
Dependents                      No            No              No   
tenure                           1            34               2   
PhoneService                    No           Yes             Yes   
MultipleLines     No phone service            No              No   
InternetService                DSL           DSL             DSL   
OnlineSecurity                  No           Yes             Yes   
OnlineBackup                   Yes            No             Yes   
DeviceProtection                No           Yes              No   
TechSupport                     No            No              No   
StreamingTV                     No            No

In [72]:
# cleaning
#df.dtypes
df.columns = df.columns.str.lower().str.replace(' ', '_') #making columns lower case and replacing space with underscore
#identify string columns
string_cols = df.select_dtypes(include='object').columns.tolist()
categorical_columns = string_cols
#print(string_cols)

for c in categorical_columns:
        df[c] = df[c].str.lower().str.replace(' ', '_')

# checking for null values in columns
# df[tc.isnull()][['customerid', 'totalcharges']]
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges.fillna(0) # should we set null values to 0?

# change yes no churn values to 0 and 1
df.churn = (df.churn == 'yes').astype(int)
print(df.head().T)


                                 0             1               2  \
customerid              7590-vhveg    5575-gnvde      3668-qpybk   
gender                      female          male            male   
seniorcitizen                    0             0               0   
partner                        yes            no              no   
dependents                      no            no              no   
tenure                           1            34               2   
phoneservice                    no           yes             yes   
multiplelines     no_phone_service            no              no   
internetservice                dsl           dsl             dsl   
onlinesecurity                  no           yes             yes   
onlinebackup                   yes            no             yes   
deviceprotection                no           yes              no   
techsupport                     no            no              no   
streamingtv                     no            no

In [73]:
#set up the validatin framework using scikit learn
df_train_full, df_test = train_test_split(df, test_size = 0.2, random_state = 1)
df_train, df_val = train_test_split(df_train_full, test_size = 0.33, random_state = 11)

y_train = df_train.churn.values
y_val = df_val.churn.values

del df_train['churn']
del df_val['churn']

In [74]:
# exploring the data
#df_train_full.isnull().sum()
df_train_full.churn.value_counts()
global_mean = df_train_full.churn.mean()
round(global_mean, 2)

np.float64(0.27)

In [75]:
# list of string columns
categorical = pd.concat([df_train, df_val]).select_dtypes(include='object').columns.tolist()
numerical = pd.concat([df_train, df_val]).select_dtypes(include=['int64', 'float']).columns.tolist()
print (categorical)
print (numerical)

['customerid', 'gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']
['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']


In [88]:
# profile string datasets

results = []
for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    df_group['variable'] = col
    df_group = df_group.reset_index().rename(columns= {col:'category'})
    results.append(df_group)
    #display (df_group) #this shows individual records

 #combine all into one dataframe
df_summary = pd.concat(results, ignore_index = True)
df_summary[['variable', 'category', 'mean', 'diff', 'risk']]

Unnamed: 0,variable,category,mean,diff,risk
0,customerid,0002-orfbo,0.000000,-0.269968,0.000000
1,customerid,0004-tlhlj,1.000000,0.730032,3.704142
2,customerid,0011-igkff,1.000000,0.730032,3.704142
3,customerid,0013-exchz,1.000000,0.730032,3.704142
4,customerid,0013-mhzwf,0.000000,-0.269968,0.000000
...,...,...,...,...,...
5670,paperlessbilling,yes,0.338151,0.068183,1.252560
5671,paymentmethod,bank_transfer_(automatic),0.168171,-0.101797,0.622928
5672,paymentmethod,credit_card_(automatic),0.164339,-0.105630,0.608733
5673,paymentmethod,electronic_check,0.455890,0.185922,1.688682
