In [1]:
# Install H2O
!pip install h2o

# Import H2O and initialize the cluster
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O cluster
h2o.init()

Collecting h2o
  Downloading h2o-3.46.0.6.tar.gz (265.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.8/265.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.46.0.6-py2.py3-none-any.whl size=265859786 sha256=a96bc8e8f199071a1f9392f52d302e0933510e3618eaf9724ced24e3b9453f63
  Stored in directory: /root/.cache/pip/wheels/62/f9/aa/687bd54342d2981bc78e22ee9b9bc39f92006e344e7aa1e0ac
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.46.0.6
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.26" 2025-01-21; OpenJDK Runtime Environment (build 11.0.26+4-post-Ubuntu-1ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.26+4

0,1
H2O_cluster_uptime:,07 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,3 months and 8 days
H2O_cluster_name:,H2O_from_python_unknownUser_25bx32
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [18]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [19]:
# Preprocessing
# Drop customerID as it's not useful for prediction
df.drop(columns=['customerID'], inplace=True)

# Convert TotalCharges to numeric, coerce errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop rows with NaN values
df.dropna(inplace=True)

# Convert categorical variables to dummy/indicator variables
df = pd.get_dummies(df, drop_first=True)

# Separate features and target
X = df.drop(columns=['Churn_Yes'])
y = df['Churn_Yes']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine X_train and y_train into a single DataFrame for H2O
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [20]:
# Convert pandas DataFrames to H2OFrames
train_h2o = h2o.H2OFrame(train_df)
test_h2o = h2o.H2OFrame(test_df)

# Set the target column as a factor (categorical variable)
train_h2o['Churn_Yes'] = train_h2o['Churn_Yes'].asfactor()
test_h2o['Churn_Yes'] = test_h2o['Churn_Yes'].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [21]:
# Define the target and feature columns
y = "Churn_Yes"
X = train_h2o.columns
X.remove(y)  # Remove the target column

# Initialize H2O AutoML
aml = H2OAutoML(
    max_runtime_secs=900,  # 15 minutes
    seed=42,
    balance_classes=True,  # Balance classes for imbalanced dataset
    project_name="telco_churn"
)

# Train the model
aml.train(x=X, y=y, training_frame=train_h2o)

# View the leaderboard
lb = aml.leaderboard
print(lb)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse       mse
StackedEnsemble_BestOfFamily_4_AutoML_1_20250210_165114  0.852397   0.40861   0.671313                0.229083  0.363972  0.132476
StackedEnsemble_BestOfFamily_2_AutoML_1_20250210_165114  0.851873   0.409213  0.671848                0.235252  0.364062  0.132541
GBM_grid_1_AutoML_1_20250210_165114_model_14             0.851682   0.409703  0.669959                0.236072  0.364491  0.132854
StackedEnsemble_AllModels_3_AutoML_1_20250210_165114     0.851623   0.409179  0.670475                0.234021  0.364382  0.132774
StackedEnsemble_BestOfFamily_3_AutoML_1_20250210_165114  0.851615   0.409403  0.671309                0.233689  0.364252  0.13268
StackedEnsemble_BestOfFamily_1_AutoML_1_20250210_165114  0.851592   0.409453  0.670714                0.235194  0.364193

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

# View the leaderboard
lb = aml.leaderboard
print(lb)

# Get the best model
best_model = aml.leader

# Predict on the test set
preds = best_model.predict(test_h2o)

# Extract predictions (as a pandas DataFrame for easier manipulation)
preds_df = preds.as_data_frame()

# Confusion Matrix using sklearn
y_true = test_h2o['Churn_Yes'].as_data_frame().values
y_pred = preds_df['predict'].values

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# F1 Score
f1 = f1_score(y_true, y_pred)
print("F1 Score:", f1)


model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse       mse
StackedEnsemble_BestOfFamily_4_AutoML_1_20250210_165114  0.852397   0.40861   0.671313                0.229083  0.363972  0.132476
StackedEnsemble_BestOfFamily_2_AutoML_1_20250210_165114  0.851873   0.409213  0.671848                0.235252  0.364062  0.132541
GBM_grid_1_AutoML_1_20250210_165114_model_14             0.851682   0.409703  0.669959                0.236072  0.364491  0.132854
StackedEnsemble_AllModels_3_AutoML_1_20250210_165114     0.851623   0.409179  0.670475                0.234021  0.364382  0.132774
StackedEnsemble_BestOfFamily_3_AutoML_1_20250210_165114  0.851615   0.409403  0.671309                0.233689  0.364252  0.13268
StackedEnsemble_BestOfFamily_1_AutoML_1_20250210_165114  0.851592   0.409453  0.670714                0.235194  0.364193  0.132636
StackedEnsemble_AllModels_4_AutoML_1_20250210_165114     0.851247   0.40965   0.6705





In [28]:
# Save the model
model_path = h2o.save_model(best_model, path="/content", force=True)
print("Model saved to:", model_path)

# Load the model (if needed)
# loaded_model = h2o.load_model(model_path)

Model saved to: /content/StackedEnsemble_BestOfFamily_4_AutoML_1_20250210_165114


In [29]:
# Shutdown H2O cluster
h2o.cluster().shutdown()

H2O session _sid_8ad7 closed.
