### Importing Libraries

In [48]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

#### Reading csv

In [49]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [50]:
df=df.drop('Unnamed: 0',axis=1)

In [51]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [52]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

##### Train Test Split

In [53]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [54]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train[[ 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(x_train[[ 'MonthlyCharges', 'TotalCharges']])
x_test[[ 'MonthlyCharges', 'TotalCharges']] = scaler.transform(x_test[[ 'MonthlyCharges', 'TotalCharges']])

#### Decision Tree Classifier

In [55]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=12, min_samples_leaf=8)

In [56]:
model_dt.fit(x_train,y_train)

In [57]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [58]:
model_dt.score(x_test,y_test)

0.7505330490405118

In [59]:


print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1017
           1       0.56      0.46      0.51       390

    accuracy                           0.75      1407
   macro avg       0.68      0.66      0.67      1407
weighted avg       0.74      0.75      0.74      1407




###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [60]:
sm = SMOTEENN()


In [61]:
x_train,x_test,y_train,y_test=train_test_split(x, y,test_size=0.2)
xr_train, yr_train = sm.fit_resample(x_train,y_train)

In [62]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [63]:
model_dt_smote.fit(xr_train,yr_train)
y_predict = model_dt_smote.predict(x_test)
model_score_r = model_dt_smote.score(x_test, y_test)
print(model_score_r)
print(metrics.classification_report(y_test, y_predict))

0.759772565742715
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1059
           1       0.51      0.77      0.61       348

    accuracy                           0.76      1407
   macro avg       0.71      0.76      0.72      1407
weighted avg       0.81      0.76      0.77      1407



In [64]:
print(metrics.confusion_matrix(y_test, y_predict))

[[802 257]
 [ 81 267]]


###### Now we can see quite better results, i.e. Accuracy: 80 %, and a very good recall 90, precision & f1 score for minority class.

###### Let's try with some other classifier.

#### Random Forest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [66]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 200,max_depth=10, min_samples_leaf=1,max_features='log2',min_samples_split=2,class_weight={0:1,1:3})

In [67]:
model_rf.fit(x_train,y_train)

In [68]:
y_pred=model_rf.predict(x_test)

In [69]:
model_rf.score(x_test,y_test)

0.7640369580668088

In [70]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1059
           1       0.52      0.78      0.62       348

    accuracy                           0.76      1407
   macro avg       0.71      0.77      0.72      1407
weighted avg       0.81      0.76      0.78      1407



In [71]:
sm = SMOTEENN()


In [72]:
#x_train1,x_test1,y_train1,y_test1=train_test_split(x, y,test_size=0.2)

In [73]:
xr_train1, yr_train1 = sm.fit_resample(x,y)

In [74]:
#from sklearn.model_selection import GridSearchCV

'''param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']  # Adding criterion to the grid search
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)
grid_search.fit(xr_train1, yr_train1)'''



"param_grid = {\n    'n_estimators': [100, 200, 300],\n    'max_depth': [None, 10, 20, 30],\n    'min_samples_split': [2, 5, 10],\n    'min_samples_leaf': [1, 2, 4],\n    'max_features': ['auto', 'sqrt', 'log2'],\n    'criterion': ['gini', 'entropy']  # Adding criterion to the grid search\n}\n\ngrid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, \n                           cv=3, n_jobs=-1, verbose=2)\ngrid_search.fit(xr_train1, yr_train1)"

In [75]:
#grid_search.best_params_

In [76]:
#grid_search.best_estimator_

In [77]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='entropy', random_state = 200,max_depth=20, min_samples_leaf=1,max_features='log2',min_samples_split=2)

In [78]:
model_rf_smote.fit(xr_train1,yr_train1)

In [79]:
yr_predict1 = model_rf_smote.predict(x_test)
yr_predict1

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [91]:
yr_prob = model_rf_smote.predict_proba(x_test)
yr_prob

array([[1.        , 0.        ],
       [0.94      , 0.06      ],
       [0.88      , 0.12      ],
       ...,
       [0.93608696, 0.06391304],
       [0.00746627, 0.99253373],
       [0.81      , 0.19      ]])

In [81]:
model_score_r1 = model_rf_smote.score(x_test, y_test)

In [82]:
print(model_score_r1)
print(metrics.classification_report(y_test, yr_predict1))

0.8017057569296375
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      1059
           1       0.57      0.82      0.67       348

    accuracy                           0.80      1407
   macro avg       0.75      0.81      0.76      1407
weighted avg       0.84      0.80      0.81      1407



In [83]:
print(metrics.confusion_matrix(y_test, yr_predict1))

[[843 216]
 [ 63 285]]


###### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.

###### We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)

#### Pickling the model

In [84]:
import pickle

In [85]:
filename = 'model.sav'

In [86]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [87]:
load_model = pickle.load(open(filename, 'rb'))

In [88]:
model_score_r1 = load_model.score(x_test, y_test)

In [89]:
model_score_r1

0.8017057569296375

##### Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.