In [4]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

In [5]:
import seaborn as sns

In [6]:
df = pd.read_csv('tel.csv')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_grp_1-12,tenure_grp_13-24,tenure_grp_25-36,tenure_grp_37-48,tenure_grp_49-60,tenure_grp_61-72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [8]:
df.shape

(7032, 52)

In [9]:
df.drop('Unnamed: 0',axis=1,inplace=True)

##### Creating x & y variables 

In [13]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_grp_1-12,tenure_grp_13-24,tenure_grp_25-36,tenure_grp_37-48,tenure_grp_49-60,tenure_grp_61-72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [12]:
y=df.Churn
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

##### Split the dataset into training and testing dataset 

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

##### Decision tree classifier

In [15]:
model_dt = DecisionTreeClassifier(criterion = 'gini', random_state=100, max_depth = 6, min_samples_leaf = 8)

##### Fit the model to training dataset

In [16]:
model_dt.fit(x_train,y_train)

In [17]:
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
model_dt.score(x_test,y_test)

0.7867803837953091

In [19]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1047
           1       0.62      0.42      0.50       360

    accuracy                           0.79      1407
   macro avg       0.72      0.67      0.68      1407
weighted avg       0.77      0.79      0.77      1407



##### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics 
##### to measure the model,as Accuracy is cursed in imbalanced datasets.
##### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, 
##### recall & f1 score is too low for Class 1, i.e. churned customers.
##### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [20]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x,y)

In [27]:
xr_train,xr_test,yr_train,yr_test = train_test_split(x_resampled,y_resampled, test_size = 0.2, random_state = 7)

In [28]:
model_smote = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, random_state = 100, min_samples_leaf = 8)

In [29]:
model_smote.fit(xr_train,yr_train)

In [30]:
yr_pred = model_smote.predict(xr_test)

In [31]:
model_smote.score(xr_test,yr_test)

0.9265202702702703

In [32]:
print(metrics.classification_report(yr_test,yr_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.92       512
           1       0.94      0.93      0.93       672

    accuracy                           0.93      1184
   macro avg       0.92      0.93      0.93      1184
weighted avg       0.93      0.93      0.93      1184



In [33]:
cm = confusion_matrix(yr_test,yr_pred)
print(cm)

[[472  40]
 [ 47 625]]


##### Now we can see quite better results, i.e. Accuracy: 93 %, and a very good recall, precision & f1 score for minority class.
##### Let's try with some other classifier.

### Random Forest Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
model_rf = RandomForestClassifier(criterion = 'gini', random_state = 100, n_estimators=100, min_samples_leaf = 8, max_depth = 6)

In [35]:
model_rf.fit(x_train,y_train)

In [36]:
y_pred_rf = model_rf.predict(x_test)

In [37]:
model_rf.score(x_test,y_test)

0.7953091684434968

In [38]:
print(metrics.classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1047
           1       0.66      0.42      0.51       360

    accuracy                           0.80      1407
   macro avg       0.74      0.67      0.69      1407
weighted avg       0.78      0.80      0.78      1407



In [39]:
sm1 = SMOTEENN()
x_resampled1, y_resampled1 = sm1.fit_resample(x,y)

In [40]:
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(x_resampled1,y_resampled1, test_size = 0.2)

In [41]:
model_rf_smote = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=8, criterion='gini', random_state=100)

In [42]:
model_rf_smote.fit(xr_train1,yr_train1)

In [43]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [44]:
model_rf_smote.score(xr_test1,yr_test1)

0.9421915444348576

In [45]:
print(metrics.classification_report(yr_test1,yr_predict1))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93       522
           1       0.93      0.97      0.95       637

    accuracy                           0.94      1159
   macro avg       0.94      0.94      0.94      1159
weighted avg       0.94      0.94      0.94      1159



In [46]:
print(metrics.confusion_matrix(yr_test1,yr_predict1))

[[477  45]
 [ 22 615]]


##### Random Forest slightly outperforms Decision Tree in terms of recall and F1-score for Class 1 (Churners), 
##### which is critical in churn analysis because identifying churners accurately is usually more important.

### Let's do Hyperparameter Tuning of Random Forest Classifier

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 4, 6, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['gini', 'entropy']
}

In [45]:
rf = RandomForestClassifier(random_state = 100)

In [46]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs = -1)

In [47]:
grid_search.fit(xr_train1,yr_train1)

In [48]:
print('Best Parameters : ',grid_search.best_params_)
print('Best Accuracy : ',grid_search.best_score_)

Best Parameters :  {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Accuracy :  0.9478064754652475


In [50]:
best_rf = grid_search.best_estimator_
best_rf.fit(xr_train1,yr_train1)

In [51]:
yr_predict1_tuned = best_rf.predict(xr_test1)

In [54]:
print('Tuned Model Accuracy : ',best_rf.score(xr_test1,yr_test1))

Tuned Model Accuracy :  0.9555175363558597


In [52]:
print('Classification Report : \n',metrics.classification_report(yr_test1,yr_predict1_tuned))

Classification Report : 
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       505
           1       0.96      0.97      0.96       664

    accuracy                           0.96      1169
   macro avg       0.96      0.95      0.95      1169
weighted avg       0.96      0.96      0.96      1169



In [55]:
print('Confusion Matrix : \n', confusion_matrix(yr_test1,yr_predict1_tuned))

Confusion Matrix : 
 [[475  30]
 [ 22 642]]


##### After performing hyperparameter tuning on the Random Forest model, the results have improved compared to the previous model:
##### -- The model's accuracy increased from 94% to 96%
##### -- Balanced precision and recall
##### -- Improved F1-scores for both classes

## Saving the model

In [57]:
import joblib

In [58]:
filename = 'my_model.sav'

In [59]:
joblib.dump(best_rf,filename)

['my_model.sav']

In [60]:
load_model = joblib.load(filename)

In [61]:
model_score_r1 = load_model.score(xr_test1,yr_test1)

In [62]:
model_score_r1

0.9555175363558597