In [5]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/b3/f8/ee33e36194eb03a76eccf3adac3fba51f0e56fbd20609bb531659d48d3cb/lightgbm-4.1.0-py3-none-win_amd64.whl.metadata
  Using cached lightgbm-4.1.0-py3-none-win_amd64.whl.metadata (19 kB)
Using cached lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.1.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")
     

In [3]:
df = pd.read_csv('customer-churn.csv')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
df['Churn']=df['Churn'].map({'No':0,'Yes':1})

In [6]:
# Convert 'TotalCharges' to numeric values
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Split the data into an 80-20 train-test split with a random state of 1
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Select features
categorical_values = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
               'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
               'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical_values = ['tenure', 'MonthlyCharges', 'TotalCharges']


In [11]:
numerical_preprocess = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Create a preprocessor for categorical features
categorical_preprocess = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse=False, drop='first'))
])

# Create a column transformer to apply the preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numerical_preprocess, numerical_values),
        ('categoric', categorical_preprocess, categorical_values)
    ])

# Create a final pipeline including preprocessing and the classifiers
randomforest_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=1))
])
    
extratree_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(random_state=1))
])


In [12]:
# Fit the models
randomforest_model.fit(X_train, y_train)
extratree_model.fit(X_train, y_train)


In [16]:
preprocessor.fit(X_train)  # Fit the preprocessor on the entire training dataset
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert the preprocessed data to int type for XGBoost
X_train_preprocessed = X_train_preprocessed.astype(int)
X_test_preprocessed = X_test_preprocessed.astype(int)

xgb_model = XGBClassifier(random_state=1)
lgbm_model = LGBMClassifier(random_state=1)

# Fit XGBoost and LightGBM models
xgb_model.fit(X_train_preprocessed, y_train)
lgbm_model.fit(X_train_preprocessed, y_train)

randomforest_predictions = randomforest_model.predict(X_test)
extratree_predictions = extratree_model.predict(X_test)
xgb_predictions = xgb_model.predict(X_test_preprocessed)
lgbm_predictions = lgbm_model.predict(X_test_preprocessed)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [17]:
from sklearn.metrics import accuracy_score
# Evaluate the models on the test set
randomforest_accuracy = accuracy_score(y_test, randomforest_predictions)
extratree_accuracy = accuracy_score(y_test, extratree_predictions)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)

# Print the accuracy of Random Forest, Extra Trees, XGBoost, and LightGBM
print("Random Forest Accuracy:", randomforest_accuracy)
print("Extra Trees Accuracy:", extratree_accuracy)
print("XGBoost Accuracy:", xgb_accuracy)
print("LightGBM Accuracy:", lgbm_accuracy)

Random Forest Accuracy: 0.8019872249822569
Extra Trees Accuracy: 0.78708303761533
XGBoost Accuracy: 0.7906316536550745
LightGBM Accuracy: 0.8041163946061036


In [26]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'classifier__n_estimators': n_estimators,
    'classifier__min_samples_split': min_samples_split,
    'classifier__min_samples_leaf': min_samples_leaf,
    'classifier__max_features': max_features
}

extratree_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(random_state=1))
])

randomized_search = RandomizedSearchCV(
    extratree_model,
    hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1
)

# Fit the RandomizedSearchCV to find the best hyperparameters
randomized_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", randomized_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'classifier__n_estimators': 50, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 8, 'classifier__max_features': None}
