In [2]:
library(tidyverse)
library(tidymodels)
library(h2o)

In [3]:
df <- read_csv('telco_customer_churn.csv', col_types = cols(customerID = col_skip()))
df <- df %>% 
        mutate_if(is.character, as_factor) 
       #%>% mutate(Churn = Churn %>% fct_relevel('Yes'))
        
glimpse(df)

Observations: 7,043
Variables: 20
$ gender           <fct> Female, Male, Male, Male, Female, Female, Male, Fema…
$ SeniorCitizen    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ Partner          <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, No, Y…
$ Dependents       <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, No, N…
$ tenure           <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, …
$ PhoneService     <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes, …
$ MultipleLines    <fct> No phone service, No, No, No phone service, No, Yes,…
$ InternetService  <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, Fiber …
$ OnlineSecurity   <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Yes, No…
$ OnlineBackup     <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No, No i…
$ DeviceProtection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No, No i…
$ TechSupport      <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, No int…
$ StreamingTV     

In [4]:
split_obj <- rsample::initial_split(df, prop = 0.8, strata = 'Churn')

train = rsample::training(split_obj)
test = rsample::testing(split_obj)

In [5]:
# We get vector of columns to drop from EDA done before.
drop_cols_2 <- readRDS('drop_cols_2.rds')

In [6]:
recipe_obj <- df %>% 
    recipe(Churn ~ ., data = df) %>%
    step_knnimpute(TotalCharges) %>% 
    step_YeoJohnson(all_numeric()) %>%
    step_center(all_numeric()) %>% 
    step_scale(all_numeric()) %>%
    step_mutate(SeniorCitizen = SeniorCitizen %>% as.numeric()) %>% 
    step_dummy(all_nominal(), -Churn, one_hot = T) %>% 
    step_rm(drop_cols_2)

In [7]:
train_processed <- recipe_obj %>% prep() %>% bake(train)
test_processed <- recipe_obj %>% prep() %>% bake(test)

train_processed %>% glimpse()

Observations: 5,636
Variables: 22
$ SeniorCitizen                           <dbl> -0.4398853, -0.4398853, -0.43…
$ tenure                                  <dbl> -1.64422287, 0.29718222, 0.64…
$ Churn                                   <fct> No, No, No, Yes, No, No, Yes,…
$ gender_Male                             <dbl> 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,…
$ Partner_Yes                             <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,…
$ Dependents_Yes                          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,…
$ PhoneService_Yes                        <dbl> 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,…
$ MultipleLines_Yes                       <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,…
$ InternetService_DSL                     <dbl> 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,…
$ InternetService_Fiber.optic             <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,…
$ OnlineSecurity_No                       <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,…
$ OnlineBackup_No                         <dbl> 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,…
$ DeviceProtection

In [9]:
h2o.init()


H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    /tmp/RtmpVB1C0x/h2o_arun_started_from_r.out
    /tmp/RtmpVB1C0x/h2o_arun_started_from_r.err


Starting H2O JVM and connecting: .. Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         4 seconds 36 milliseconds 
    H2O cluster timezone:       Asia/Kolkata 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.28.0.2 
    H2O cluster version age:    2 months and 11 days  
    H2O cluster name:           H2O_started_from_R_arun_pag359 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.45 GB 
    H2O cluster total cores:    4 
    H2O cluster allowed cores:  4 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         Amazon S3, XGBoost, Algos, 

In [11]:
train_h2o <- as.h2o(train_processed)
test_h2o <- as.h2o(test_processed)



In [12]:
y <- "Churn"
x <- setdiff(names(train_h2o), y)

In [None]:
aml <- h2o.automl(x = x, y = y,
                  training_frame = train_h2o,
                  max_models = 20,
                  seed = 123)

