In [1]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Building wheel

In [2]:
import joblib
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

In [3]:
# Fetch the Telco customer-churn CSV from IBM's public GitHub repository
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as a quick sanity check on the load
df.head(n=5)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Preprocessing
# Build the model-ready frame in a single chain WITHOUT mutating `df`.
# The previous version dropped 'customerID' in place and then rebound `df` to
# the encoded frame, so running this cell a second time raised KeyError.
# `df` now stays exactly as loaded, which makes this cell safe to re-run.
df_encoded = (
    df
    # customerID is a per-row identifier, not a predictive feature
    .drop(columns=['customerID'])
    # Coerce TotalCharges to numeric; entries that cannot be parsed become NaN
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    # Discard rows with any missing value (including NaNs produced by the coercion)
    .dropna()
    # One-hot encode categorical columns; drop_first removes one level per category
    .pipe(pd.get_dummies, drop_first=True)
)

# Fail early with a clear message if the target did not encode as expected
assert 'Churn_Yes' in df_encoded.columns, "expected one-hot target column 'Churn_Yes'"

# Separate features and target
X = df_encoded.drop(columns=['Churn_Yes'])
y = df_encoded['Churn_Yes']

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance of the two splits matters for the reported F1.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Settings for the TPOT pipeline search
tpot_settings = dict(
    generations=5,        # number of optimization iterations
    population_size=20,   # individuals kept in each generation
    verbosity=2,          # show a progress bar and per-generation best score
    random_state=42,      # seed for the search
    scoring='f1',         # metric used for internal cross-validation
    n_jobs=-1,            # use all available CPU cores
    max_time_mins=15,     # time budget for the optimization, in minutes
)
tpot = TPOTClassifier(**tpot_settings)

# Run the search on the training split only
tpot.fit(X_train, y_train)

# Optionally write the best pipeline out as a standalone Python script
tpot.export('tpot_telco_churn_pipeline.py')

is_classifier
is_regressor
is_classifier




is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier


Optimization Progress:   0%|          | 0/20 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.6067491003938678

Generation 2 - Current best internal CV score: 0.6067491003938678

Generation 3 - Current best internal CV score: 0.6067491003938678

Generation 4 - Current best internal CV score: 0.6067491003938678

Generation 5 - Current best internal CV score: 0.6087867516749095

Best pipeline: BernoulliNB(SGDClassifier(FastICA(LinearSVC(input_matrix, C=0.5, dual=True, loss=hinge, penalty=l2, tol=1e-05), tol=0.30000000000000004), alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.5, learning_rate=invscaling, loss=perceptron, penalty=elasticnet, power_t=0.1), alpha=0.01, fit_prior=False)


In [8]:
# Generate predictions for the held-out split
y_pred = tpot.predict(X_test)

# Confusion matrix (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# F1 score computed for the positive label (1)
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=1)
print("F1 Score:", f1)

Confusion Matrix:
[[734 299]
 [ 87 287]]
F1 Score: 0.5979166666666667




In [9]:
# Persist only the fitted pipeline selected by TPOT (not the TPOT search object).
# joblib is imported in the notebook's import cell at the top.
joblib.dump(tpot.fitted_pipeline_, 'tpot_telco_churn_model.pkl')

# To reload later (joblib files are pickles — only load files you trust):
# tpot_model = joblib.load('tpot_telco_churn_model.pkl')

['tpot_telco_churn_model.pkl']