In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score , roc_auc_score ,make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,classification_report

# Importing Data
The Telco Churn dataset lets us predict whether a customer will stop using the telecom company's services.

In [None]:
# Load the Telco customer churn workbook from the Kaggle input directory.
DATA_PATH = '/kaggle/input/telco-customer-churn-ibm-dataset/Telco_customer_churn.xlsx'
df = pd.read_excel(DATA_PATH)

# Data Exploration & Cleaning

In [None]:
# Preview the first rows to see which columns were loaded.
df.head()

In [None]:
# Remove the extra churn-related columns; 'Churn Value' stays and is used as the label later on.
# NOTE(review): presumably these fields would leak the label into the features -- confirm.
churn_side_columns = ['Churn Label', 'Churn Score', 'CLTV', 'Churn Reason']
df = df.drop(columns=churn_side_columns)

In [None]:
# List the distinct values of 'Count' (the column is dropped a few cells below).
df['Count'].unique()

In [None]:
# List the distinct values of 'Country' (the column is dropped a few cells below).
df['Country'].unique()

In [None]:
# List the distinct values of 'State' (the column is dropped a few cells below).
df['State'].unique()

In [None]:
# List the distinct city names; 'City' is kept and one-hot encoded later.
df['City'].unique()

In [None]:
# Discard the customer identifier together with the 'Count', 'Country', 'State'
# and 'Lat Long' columns.
unused_columns = ['CustomerID', 'Count', 'Country', 'State', 'Lat Long']
df = df.drop(columns=unused_columns)

In [None]:
# Replace spaces in city names with underscores.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on the df['City'] selection is a chained in-place
# operation, which pandas warns about and which leaves df unchanged once
# Copy-on-Write is active.
df['City'] = df['City'].replace(' ', '_', regex=True)

In [None]:
# Use underscores instead of spaces in every column label.
df.columns = [label.replace(' ', '_') for label in df.columns]

In [None]:
# Summarise the frame (columns, non-null counts, dtypes) after the column renaming.
df.info()

In [None]:
# List the distinct values stored in 'Phone_Service'.
df['Phone_Service'].unique()

In [None]:
# Count how often each 'Total_Charges' value occurs; blank ' ' entries are handled in the next cells.
df['Total_Charges'].value_counts()

In [None]:
# Show the rows whose 'Total_Charges' is a single blank space.
df.loc[df['Total_Charges']==' ']

In [None]:
# Rows with a blank 'Total_Charges' get the string '0' so the column can be
# converted to a numeric dtype afterwards.
blank_total = df['Total_Charges'] == ' '
df.loc[blank_total, 'Total_Charges'] = '0'

In [None]:
# Convert 'Total_Charges' to a numeric dtype.
total_charges_numeric = pd.to_numeric(df['Total_Charges'])
df = df.assign(Total_Charges=total_charges_numeric)

In [None]:
# Swap spaces for underscores inside the string values of the whole frame,
# then preview the result.
df = df.replace(' ', '_', regex=True)
df.head()

In [None]:
# Summarise the frame again after the 'Total_Charges' conversion and the space replacement.
df.info()

In [None]:
# Separate the 'Churn_Value' label (Y) from the remaining feature columns (X).
target_column = 'Churn_Value'
Y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# One-hot encode the listed columns; columns not listed are carried over as they are.
categorical_columns = [
    'City',
    'Gender',
    'Senior_Citizen',
    'Partner',
    'Dependents',
    'Phone_Service',
    'Multiple_Lines',
    'Internet_Service',
    'Online_Security',
    'Online_Backup',
    'Device_Protection',
    'Tech_Support',
    'Streaming_TV',
    'Streaming_Movies',
    'Contract',
    'Paperless_Billing',
    'Payment_Method',
]
X_encoded = pd.get_dummies(X, columns=categorical_columns)

In [None]:
# Show how many samples carry each label value.
Y.value_counts()

# Preparing the Model

In [None]:
# Hold out 25% of the rows for testing; stratifying on Y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_encoded,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=42,
)

In [None]:
# Baseline classifier: only the objective and the seed are set explicitly.
# Boosting stops once 'aucpr' on the evaluation set has not improved for 15 rounds.
# NOTE(review): the evaluation set is the test split, so the same rows drive
# early stopping and the later evaluation -- consider a separate validation split.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', seed=42)
baseline_eval_set = [(X_test, Y_test)]
clf_xgb.fit(
    X_train,
    Y_train,
    eval_set=baseline_eval_set,
    eval_metric='aucpr',
    early_stopping_rounds=15,
    verbose=True,
)

In [None]:
# Confusion matrix of the baseline model on the test split, shown as integer counts.
baseline_class_names = ["Didnt Leave ", "Left"]
plot_confusion_matrix(
    clf_xgb,
    X_test,
    Y_test,
    display_labels=baseline_class_names,
    values_format='d',
)

# Hyperparameter Tuning Using Cross-Validation

In [None]:
# Candidate values for the grid search: 3 options for each of 5 hyperparameters,
# i.e. 3**5 = 243 combinations.
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.05],
    gamma=[0, 0.25, 1.0],
    reg_lambda=[0, 1.0, 10],
    scale_pos_weight=[1, 3, 5],
)

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, ranking
# candidates by ROC AUC and running up to 10 fits in parallel.
# XGBoost's row-sampling parameter is spelled `subsample` (no underscore);
# a misspelled keyword is not recognised, so no row sampling would be applied.
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        subsample=0.9,          # use 90% of the training rows per tree
        colsample_bytree=0.5,   # use 50% of the columns per tree
        seed=42,
    ),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=0,
    n_jobs=10,
    cv=3,
)

In [None]:
# Run the grid search. The extra keyword arguments are handed to every
# underlying XGBClassifier.fit call: stop after 10 rounds without 'auc'
# improvement on the evaluation set, and keep the training log silent.
# NOTE(review): the evaluation set is the test split, so test rows influence
# model selection -- consider a separate validation split.
search_fit_params = dict(
    early_stopping_rounds=10,
    eval_metric='auc',
    eval_set=[(X_test, Y_test)],
    verbose=False,
)
optimal_params.fit(X_train, Y_train, **search_fit_params)

In [None]:
# Print the hyperparameter combination that scored best in the grid search.
print(optimal_params.best_params_)

In [None]:
# Final classifier with hand-entered hyperparameters.
# The keywords must match XGBoost's parameter names exactly: `learning_rate`
# and `subsample`. Misspelled names (`learn_rate`, `sub_sample`) are not
# recognised, which leaves the learning rate at its default and disables
# row sampling.
# NOTE(review): the values are hard-coded rather than read from
# optimal_params.best_params_, and scale_pos_weight=2 is not among the grid
# values [1, 3, 5] -- confirm this is intentional.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=0.25,
    learning_rate=0.1,
    max_depth=4,
    reg_lambda=10,
    scale_pos_weight=2,
    subsample=0.9,
    colsample_bytree=0.5,
    seed=42,
)
# Stop boosting once 'aucpr' on the evaluation set has not improved for 10 rounds.
# NOTE(review): the evaluation set is the test split -- consider a separate validation split.
clf_xgb.fit(
    X_train,
    Y_train,
    verbose=False,
    early_stopping_rounds=10,
    eval_metric='aucpr',
    eval_set=[(X_test, Y_test)],
)

In [None]:
# Confusion matrix of the refitted model on the test split, shown as integer counts.
tuned_class_names = ["Didnt Leave ", "Left"]
plot_confusion_matrix(
    clf_xgb,
    X_test,
    Y_test,
    display_labels=tuned_class_names,
    values_format='d',
)

In [None]:
# Text report of the per-class metrics for the refitted model on the test split.
Y_pred = clf_xgb.predict(X_test)
print(classification_report(Y_test, Y_pred))