In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Drop the 'customerID' column
df.drop('customerID', axis=1, inplace=True)

# Convert 'TotalCharges' to numeric and handle errors
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# Replace 'No phone service' and 'No internet service' with 'No'
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')
internet_columns = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for col in internet_columns:
    df[col] = df[col].replace('No internet service', 'No')

# Replace 'Yes'/'No' with 1/0 for binary columns
def replace_yes_no(df, columns):
    for col in columns:
        df[col] = df[col].replace({'Yes': 1, 'No': 0})

columns_to_replace = [
    "Partner", "Dependents", "PhoneService", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
    "PaperlessBilling", "Churn"
]

replace_yes_no(df, columns_to_replace)

# Label encode 'gender' column
binary_columns = ['gender']
le = LabelEncoder()
df[binary_columns] = df[binary_columns].apply(le.fit_transform)

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod'])

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define model columns
model_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No', 'MultipleLines_No phone service', 
       'MultipleLines_Yes', 'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_No internet service', 
       'OnlineSecurity_Yes', 'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_No internet service', 
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_No internet service', 
       'TechSupport_Yes', 'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year', 'PaperlessBilling_No', 
       'PaperlessBilling_Yes', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']

# Ensure all required columns are present
for col in model_columns:
    if col not in df.columns:
        df[col] = 0  # Add missing columns with a value of 0

# Reorder columns to match the model columns
df = df[model_columns + ['Churn']]  # Add the target column 'Churn' at the end

# Split data into features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Initialize and train the CatBoost model
catboost_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=0)
catboost_model.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred = catboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of CatBoost Model after SMOTE:", accuracy)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)
  df[col] = df[col].replace({'Yes': 1, 'No': 0})


Accuracy of CatBoost Model after SMOTE: 0.8371980676328502


In [5]:
import joblib

# Save the trained model using joblib
joblib.dump(catboost_model, 'catboost_model.pkl')


['catboost_model.pkl']