In [58]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [17]:
df = pd.read_csv("data-ready.csv")

In [6]:
df.head()


Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,1,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,2,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,3,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,4,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Convert Yes and No to 1 or 0

In [18]:
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
for col in yes_no_columns:
    df[col].replace({'Yes': 1,'No': 0},inplace=True)

In [19]:
for col in df:
    print(f'{col}: {df[col].unique()}') 

Unnamed: 0: [   0    1    2 ... 7029 7030 7031]
gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
PhoneService: [0 1]
MultipleLines: [0 1]
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: [1 0]
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]
Churn: [0 1]


In [20]:
df['gender'].replace({'Female':1,'Male':0},inplace=True)

In [21]:
df.gender.unique()

array([1, 0], dtype=int64)

### One hot encoding for categorical columns

In [22]:
df = pd.get_dummies(data=df, columns=['InternetService','Contract','PaymentMethod'])
df.columns

Index(['Unnamed: 0', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges',
       'Churn', 'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [23]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
5800,5800,1,0,0,0,6,1,0,0,1,...,0,1,0,1,0,0,0,0,0,1
3854,3854,1,0,0,1,26,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
3652,3652,1,0,0,0,10,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
2564,2564,1,0,1,0,72,1,1,0,0,...,0,0,1,0,0,1,0,1,0,0
6798,6798,0,1,1,0,4,1,1,0,0,...,0,1,0,1,0,0,0,0,1,0


In [24]:
df.dtypes

Unnamed: 0                                   int64
gender                                       int64
SeniorCitizen                                int64
Partner                                      int64
Dependents                                   int64
tenure                                       int64
PhoneService                                 int64
MultipleLines                                int64
OnlineSecurity                               int64
OnlineBackup                                 int64
DeviceProtection                             int64
TechSupport                                  int64
StreamingTV                                  int64
StreamingMovies                              int64
PaperlessBilling                             int64
MonthlyCharges                             float64
TotalCharges                               float64
Churn                                        int64
InternetService_DSL                          uint8
InternetService_Fiber optic    

### Train test split

In [26]:
X = df.drop('Churn',axis='columns')
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)

In [27]:
X_train.shape

(5625, 27)

In [28]:
X_test.shape

(1407, 27)

In [29]:
X_train[:10]

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
5655,5655,1,1,0,0,10,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
101,101,1,0,1,1,1,1,0,0,0,...,0,0,1,1,0,0,0,0,1,0
2616,2616,0,0,1,0,71,1,0,0,1,...,1,0,0,0,0,1,0,1,0,0
392,392,1,1,0,0,2,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1323,1323,0,0,1,0,59,1,1,0,0,...,0,1,0,0,1,0,1,0,0,0
3601,3601,1,0,0,0,13,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
2768,2768,0,0,1,0,24,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1931,1931,1,0,1,0,51,1,0,1,1,...,1,0,0,0,1,0,1,0,0,0
5378,5378,0,0,0,0,4,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
4324,4324,0,0,0,0,71,1,1,0,0,...,0,0,1,0,0,1,1,0,0,0


### Decision Tree Classifier

In [41]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [43]:
model_dt.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [45]:
y_pred=model_dt.predict(X_test)
y_pred

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [47]:
model_dt.score(X_test,y_test)

0.7846481876332623

In [48]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.90      0.86       999
           1       0.67      0.50      0.57       408

    accuracy                           0.78      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.77      0.78      0.77      1407



In [60]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X,y)

In [61]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [62]:

model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [63]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9036144578313253
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       397
           1       0.91      0.93      0.92       599

    accuracy                           0.90       996
   macro avg       0.90      0.90      0.90       996
weighted avg       0.90      0.90      0.90       996



In [64]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[344  53]
 [ 43 556]]


### Random Forest Classifier

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [51]:
model_rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [53]:

y_pred=model_rf.predict(X_test)

In [54]:
model_rf.score(X_test,y_test)

0.7818052594171997

In [55]:

print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.80      0.92      0.86       999
           1       0.69      0.44      0.54       408

    accuracy                           0.78      1407
   macro avg       0.75      0.68      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [66]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(X,y)

In [67]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [68]:

model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [69]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [70]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [71]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [72]:

print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9090909090909091
              precision    recall  f1-score   support

           0       0.91      0.86      0.89       417
           1       0.91      0.94      0.92       595

    accuracy                           0.91      1012
   macro avg       0.91      0.90      0.91      1012
weighted avg       0.91      0.91      0.91      1012



In [73]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[360  57]
 [ 35 560]]
