In [73]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, \
f1_score, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

In [31]:
df = pd.read_csv("data-ready.csv")

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,1,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,2,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,3,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,4,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [34]:
df = df.drop('Unnamed: 0', axis = 1)

In [36]:
df.sample(10)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4559,Male,0,No,No,34,Yes,Yes,Fiber optic,No,Yes,No,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,105.35,3540.65,No
4267,Male,0,No,No,45,Yes,No,No,No,No,No,No,No,No,One year,No,Bank transfer (automatic),18.85,867.3,No
1579,Female,1,No,No,65,No,No,DSL,No,Yes,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),53.5,3517.9,No
4014,Female,0,Yes,Yes,1,Yes,No,No,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,20.5,20.5,Yes
2757,Male,0,No,No,25,Yes,No,Fiber optic,No,No,Yes,Yes,Yes,Yes,Two year,No,Bank transfer (automatic),99.3,2513.5,No
1191,Female,1,Yes,No,20,Yes,No,DSL,No,Yes,No,No,No,Yes,Month-to-month,Yes,Electronic check,60.0,1259.35,No
1635,Female,0,Yes,Yes,7,Yes,No,No,No,No,No,No,No,No,One year,No,Mailed check,20.65,134.05,No
4168,Male,0,Yes,Yes,61,Yes,No,No,No,No,No,No,No,No,One year,No,Mailed check,20.4,1226.45,No
949,Male,0,No,No,22,Yes,No,DSL,Yes,No,No,Yes,No,No,Month-to-month,No,Mailed check,55.1,1253.15,No
3915,Male,1,Yes,No,67,Yes,Yes,Fiber optic,No,No,No,No,Yes,Yes,Month-to-month,Yes,Credit card (automatic),94.65,6079.0,No


### Convert Yes and No to 1 or 0

In [37]:
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
for col in yes_no_columns:
    df[col].replace({'Yes': 1,'No': 0},inplace=True)

In [38]:
for col in df:
    print(f'{col}: {df[col].unique()}') 

gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
PhoneService: [0 1]
MultipleLines: [0 1]
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: [1 0]
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]
Churn: [0 1]


In [39]:
df['gender'].replace({'Female':1,'Male':0},inplace=True)

In [40]:
df.gender.unique()

array([1, 0], dtype=int64)

### One hot encoding for categorical columns

In [41]:
df = pd.get_dummies(data=df, columns=['InternetService','Contract','PaymentMethod'])
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [42]:
df.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
4457,0,0,0,0,31,0,0,0,0,1,...,1,0,0,0,1,0,0,0,1,0
3630,1,0,0,1,11,1,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
2027,1,0,1,0,15,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
4617,1,1,0,0,8,1,0,0,0,1,...,0,1,0,1,0,0,1,0,0,0
6819,0,0,0,0,35,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [43]:
df.dtypes

gender                                       int64
SeniorCitizen                                int64
Partner                                      int64
Dependents                                   int64
tenure                                       int64
PhoneService                                 int64
MultipleLines                                int64
OnlineSecurity                               int64
OnlineBackup                                 int64
DeviceProtection                             int64
TechSupport                                  int64
StreamingTV                                  int64
StreamingMovies                              int64
PaperlessBilling                             int64
MonthlyCharges                             float64
TotalCharges                               float64
Churn                                        int64
InternetService_DSL                          uint8
InternetService_Fiber optic                  uint8
InternetService_No             

### Splitting the data into input and output

In [44]:
X = df.drop('Churn',axis='columns')
y = df['Churn']

### As the dataset is highly unbalanced (There's a significant difference in the number of samples for each class.) as shown below, we need to handle this issue before starting training the model.

In [68]:
y.value_counts()/len(y)

0    0.734215
1    0.265785
Name: Churn, dtype: float64

In [69]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X,y)

In [70]:
X_train, X_test, y_train ,y_test = train_test_split(X_resampled, y_resampled,test_size=0.2)

In [74]:
models = {"Decision Tree": DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8),
          "Random Forest": RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8),
}

In [76]:
for i in range(len(list(models))):
    # Train models
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred =  model.predict(X_test)
    
    # Training set performance
    model_train_accuracy_score = accuracy_score(y_train, y_train_pred)
    model_train_f1_score = f1_score(y_train, y_train_pred, average = 'weighted')
    model_train_precision_score = precision_score(y_train, y_train_pred)
    model_train_recall_score = recall_score(y_train, y_train_pred)
    model_train_rocaus_score = roc_auc_score(y_train, y_train_pred)

    
    # Testing set performance
    model_test_accuracy_score = accuracy_score(y_test, y_test_pred)
    model_test_f1_score = f1_score(y_test, y_test_pred, average = 'weighted')
    model_test_precision_score = precision_score(y_test, y_test_pred)
    model_test_recall_score = recall_score(y_test, y_test_pred)
    model_test_rocaus_score = roc_auc_score(y_test, y_test_pred)
    
    print('**', list(models.keys())[i])
    
    print('Model performance for training set')
    print('- Accuracy: {:.4}'.format(model_train_accuracy_score))
    print('- F1 score: {:.4}'.format(model_train_f1_score))
    print('- Precision: {:.4}'.format(model_train_precision_score))
    print('- Recall: {:.4}'.format(model_train_recall_score))
    print('- ROC AUC score: {:.4}'.format(model_train_rocaus_score))
    
    print('---------------------------------------')
    
    print('Model performance for test set')
    print('- Accuracy: {:.4}'.format(model_test_accuracy_score))
    print('- F1 score: {:.4}'.format(model_test_f1_score))
    print('- Precision: {:.4}'.format(model_test_precision_score))
    print('- Recall: {:.4}'.format(model_test_recall_score))
    print('- ROC AUC score: {:.4}'.format(model_test_rocaus_score))
    
    print('='*35)
    print('\n')

** Decision Tree
Model performance for training set
- Accuracy: 0.9447
- F1 score: 0.9446
- Precision: 0.9323
- Recall: 0.9673
- ROC AUC score: 0.9429
---------------------------------------
Model performance for test set
- Accuracy: 0.9426
- F1 score: 0.9424
- Precision: 0.9335
- Recall: 0.9664
- ROC AUC score: 0.9393


** Random Forest
Model performance for training set
- Accuracy: 0.9488
- F1 score: 0.9487
- Precision: 0.9378
- Recall: 0.9689
- ROC AUC score: 0.9472
---------------------------------------
Model performance for test set
- Accuracy: 0.9417
- F1 score: 0.9416
- Precision: 0.936
- Recall: 0.9618
- ROC AUC score: 0.939




### Pickling the model

In [74]:
import pickle

In [75]:

filename = 'model.sav'

In [76]:

pickle.dump(model_rf_smote, open(filename, 'wb'))

In [77]:
load_model = pickle.load(open(filename, 'rb'))

In [78]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [79]:
model_score_r1

0.9090909090909091