In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, \
f1_score, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV

In [37]:
df = pd.read_csv("data-ready.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,1,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,2,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,3,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,4,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [38]:
df = df.drop('Unnamed: 0', axis = 1)

In [39]:
df.sample(10)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2778,Female,0,Yes,No,43,Yes,Yes,Fiber optic,Yes,Yes,No,No,No,Yes,Month-to-month,Yes,Electronic check,94.1,4107.3,No
3761,Male,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),110.9,8240.85,No
6478,Female,0,Yes,No,61,Yes,Yes,Fiber optic,No,Yes,No,Yes,No,Yes,Two year,Yes,Bank transfer (automatic),94.35,5703.0,No
3145,Male,0,No,No,17,No,No,DSL,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,25.65,440.2,No
4545,Male,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),115.8,8424.9,No
5490,Male,1,No,No,15,Yes,No,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,79.4,1156.1,Yes
1034,Female,0,No,No,13,No,No,DSL,No,No,Yes,No,Yes,Yes,Month-to-month,No,Electronic check,49.15,649.4,No
3440,Female,0,No,No,58,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,Yes,One year,Yes,Electronic check,107.75,6332.75,No
3092,Male,0,No,Yes,2,Yes,No,No,No,No,No,No,No,No,Month-to-month,No,Mailed check,20.4,42.9,No
4356,Female,0,Yes,No,56,Yes,Yes,Fiber optic,No,No,No,No,Yes,No,Month-to-month,Yes,Bank transfer (automatic),85.6,4902.8,No


### Convert Yes and No to 1 or 0

In [40]:
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
for col in yes_no_columns:
    df[col].replace({'Yes': 1,'No': 0},inplace=True)

In [41]:
for col in df:
    print(f'{col}: {df[col].unique()}') 

gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
PhoneService: [0 1]
MultipleLines: [0 1]
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: [1 0]
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]
Churn: [0 1]


In [42]:
df['gender'].replace({'Female':1,'Male':0},inplace=True)

In [43]:
df.gender.unique()

array([1, 0], dtype=int64)

### One hot encoding for categorical columns

In [44]:
# Make sure to add the parameter 'drop_first' to escape the dummy variable trap
df = pd.get_dummies(data=df, columns=['InternetService','Contract','PaymentMethod'], drop_first = True)
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'InternetService_Fiber optic', 'InternetService_No',
       'Contract_One year', 'Contract_Two year',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [45]:
df.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
6805,0,0,0,0,1,1,0,0,0,0,...,70.3,70.3,1,1,0,0,0,0,1,0
2263,1,0,0,0,40,1,0,0,0,0,...,20.6,827.3,0,0,1,0,1,0,0,0
6431,1,0,0,0,32,1,1,0,0,1,...,101.35,3334.9,0,1,0,0,0,0,1,0
4163,0,0,1,0,72,1,1,1,1,1,...,115.15,8349.45,0,1,0,0,1,0,0,0
6537,0,0,0,0,24,1,0,1,0,0,...,51.15,1275.7,0,0,0,0,0,0,1,0


In [46]:
df.dtypes

gender                                     int64
SeniorCitizen                              int64
Partner                                    int64
Dependents                                 int64
tenure                                     int64
PhoneService                               int64
MultipleLines                              int64
OnlineSecurity                             int64
OnlineBackup                               int64
DeviceProtection                           int64
TechSupport                                int64
StreamingTV                                int64
StreamingMovies                            int64
PaperlessBilling                           int64
MonthlyCharges                           float64
TotalCharges                             float64
Churn                                      int64
InternetService_Fiber optic                uint8
InternetService_No                         uint8
Contract_One year                          uint8
Contract_Two year   

### Splitting the data into input and output

In [47]:
X = df.drop('Churn',axis='columns')
y = df['Churn']

### As the dataset is highly unbalanced (There's a significant difference in the number of samples for each class.) as shown below, we need to handle this issue before starting training the model.

In [48]:
len(y)

7032

In [49]:
y.value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

In [50]:
y.value_counts()/len(y)

0    0.734215
1    0.265785
Name: Churn, dtype: float64

In [82]:
sm = SMOTEENN(sampling_strategy=0.95, random_state = 0)
X_resampled, y_resampled = sm.fit_resample(X,y)

In [83]:
len(y_resampled)

5530

### It seems like the SMOTEENN carried out down sampling on the data

In [84]:
y_resampled.value_counts()

1    2857
0    2673
Name: Churn, dtype: int64

In [85]:
y_resampled.value_counts()/len(y_resampled)

1    0.516637
0    0.483363
Name: Churn, dtype: float64

In [86]:
X_train, X_test, y_train ,y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state = 0, stratify=y_resampled)

In [87]:
y_train.value_counts()

1    2286
0    2138
Name: Churn, dtype: int64

In [91]:
y_train.value_counts()/len(y_train)

1    0.516727
0    0.483273
Name: Churn, dtype: float64

In [92]:
y_test.value_counts()

1    571
0    535
Name: Churn, dtype: int64

In [93]:
y_test.value_counts()/len(y_test)

1    0.516275
0    0.483725
Name: Churn, dtype: float64

In [94]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state = 0),
    "Random Forest": RandomForestClassifier(random_state = 0),
    "XG Boost": XGBClassifier(random_state = 0)
}

In [95]:
for i in range(len(list(models))):
    # Train models
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred =  model.predict(X_test)
    
    # Training set performance
    model_train_accuracy_score = accuracy_score(y_train, y_train_pred)
    model_train_f1_score = f1_score(y_train, y_train_pred, average = 'weighted')
    model_train_precision_score = precision_score(y_train, y_train_pred)
    model_train_recall_score = recall_score(y_train, y_train_pred)
    model_train_rocaus_score = roc_auc_score(y_train, y_train_pred)

    
    # Testing set performance
    model_test_accuracy_score = accuracy_score(y_test, y_test_pred)
    model_test_f1_score = f1_score(y_test, y_test_pred, average = 'weighted')
    model_test_precision_score = precision_score(y_test, y_test_pred)
    model_test_recall_score = recall_score(y_test, y_test_pred)
    model_test_rocaus_score = roc_auc_score(y_test, y_test_pred)
    
    print('**', list(models.keys())[i])
    
    print('Model performance for training set')
    print('- Accuracy: {:.4}'.format(model_train_accuracy_score))
    print('- F1 score: {:.4}'.format(model_train_f1_score))
    print('- Precision: {:.4}'.format(model_train_precision_score))
    print('- Recall: {:.4}'.format(model_train_recall_score))
    print('- ROC AUC score: {:.4}'.format(model_train_rocaus_score))
    
    print('---------------------------------------')
    
    print('Model performance for test set')
    print('- Accuracy: {:.4}'.format(model_test_accuracy_score))
    print('- F1 score: {:.4}'.format(model_test_f1_score))
    print('- Precision: {:.4}'.format(model_test_precision_score))
    print('- Recall: {:.4}'.format(model_test_recall_score))
    print('- ROC AUC score: {:.4}'.format(model_test_rocaus_score))
    
    print('='*35)
    print('\n')

** Decision Tree
Model performance for training set
- Accuracy: 1.0
- F1 score: 1.0
- Precision: 1.0
- Recall: 1.0
- ROC AUC score: 1.0
---------------------------------------
Model performance for test set
- Accuracy: 0.9304
- F1 score: 0.9303
- Precision: 0.9215
- Recall: 0.9457
- ROC AUC score: 0.9299


** Random Forest
Model performance for training set
- Accuracy: 1.0
- F1 score: 1.0
- Precision: 1.0
- Recall: 1.0
- ROC AUC score: 1.0
---------------------------------------
Model performance for test set
- Accuracy: 0.9421
- F1 score: 0.9421
- Precision: 0.944
- Recall: 0.944
- ROC AUC score: 0.9421


** XG Boost
Model performance for training set
- Accuracy: 0.9995
- F1 score: 0.9995
- Precision: 0.9996
- Recall: 0.9996
- ROC AUC score: 0.9995
---------------------------------------
Model performance for test set
- Accuracy: 0.9512
- F1 score: 0.9512
- Precision: 0.9465
- Recall: 0.9597
- ROC AUC score: 0.9509




#### With regard to the test set performance it is clear that the XG Boost algorithm performed best. We will take it further to perform hyperparameter tuning to get optimized performance.

### Hyperparameters Tuning

In [96]:
params = { 'objective': ['binary:logistic'],
           'max_depth': [4, 5, 6, 7],
           'learning_rate': [0.1, 0.2, 0.3],
           'subsample': np.arange(0.865, 0.875, 0.001),
           'n_estimators': [554],
           'reg_lambda':[1, 2, 3],
           'scale_pos_weight': [2, 3, 4]
         }
# 'learning_rate': np.arange(0.315, 0.32, 0.001)
print(params)

{'objective': ['binary:logistic'], 'max_depth': [4, 5, 6, 7], 'learning_rate': [0.1, 0.2, 0.3], 'subsample': array([0.865, 0.866, 0.867, 0.868, 0.869, 0.87 , 0.871, 0.872, 0.873,
       0.874, 0.875]), 'n_estimators': [554], 'reg_lambda': [1, 2, 3], 'scale_pos_weight': [2, 3, 4]}


In [97]:
xgb_RandomGrid = RandomizedSearchCV(XGBClassifier(), param_distributions = params, n_iter = 5, cv = 10, verbose=2, n_jobs = -1)

In [98]:
xgb_RandomGrid.fit(X_resampled, y_resampled)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


In [99]:
xgb_RandomGrid.best_score_

0.957866184448463

In [102]:
xgb_RandomGrid.best_params_

{'subsample': 0.865,
 'scale_pos_weight': 2,
 'reg_lambda': 1,
 'objective': 'binary:logistic',
 'n_estimators': 554,
 'max_depth': 4,
 'learning_rate': 0.3}

In [101]:
xgb_RandomGrid.best_estimator_

### Befor we make predictions on out of sample data it's critical to train the model with the best known parameters using all of the data. Otherwise we would be throughing away potentially valuable data that the model can learn from.

In [103]:
# Using the found best parameters to train the model again
last_model = XGBClassifier(subsample = 0.865,
 scale_pos_weight = 2,
 n_estimators =  554,
 max_depth = 4,
 learning_rate = 0.3,
 objective = 'binary:logistic',
 reg_lambda = 1)

In [106]:
last_model.fit(X_resampled, y_resampled)

### Pickling the model

In [107]:
import pickle

In [108]:

filename = 'last_model.sav'

In [109]:

pickle.dump(last_model, open(filename, 'wb'))

In [110]:
load_model = pickle.load(open(filename, 'rb'))

In [112]:
model_score_r1 = load_model.score(X_test, y_test)

In [113]:
model_score_r1

1.0