### Importing Libraries

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

#### Reading csv

In [2]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29,29,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56,1889,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53,108,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42,1840,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70,151,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [3]:
df=df.drop('Unnamed: 0',axis=1)

In [4]:
#Creating the X variable
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29,29,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56,1889,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53,108,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42,1840,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70,151,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84,1990,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103,7362,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29,346,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74,306,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [5]:
#Creating the Y variable
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

##### Train Test Split

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#### Decision Tree Classifier

In [7]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [8]:
model_dt.fit(x_train,y_train)

In [9]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [10]:
model_dt.score(x_test,y_test)

0.798862828713575

In [11]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1031
           1       0.64      0.58      0.61       376

    accuracy                           0.80      1407
   macro avg       0.74      0.73      0.74      1407
weighted avg       0.79      0.80      0.80      1407



###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [12]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)

In [13]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [14]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [15]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9175965665236051
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       539
           1       0.90      0.95      0.93       626

    accuracy                           0.92      1165
   macro avg       0.92      0.91      0.92      1165
weighted avg       0.92      0.92      0.92      1165



In [16]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[474  65]
 [ 31 595]]


###### Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

###### Let's try with some other classifier.

#### Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [19]:
model_rf.fit(x_train,y_train)

In [20]:
y_pred=model_rf.predict(x_test)

In [21]:
model_rf.score(x_test,y_test)

0.8081023454157783

In [22]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      1031
           1       0.71      0.48      0.57       376

    accuracy                           0.81      1407
   macro avg       0.77      0.71      0.73      1407
weighted avg       0.80      0.81      0.80      1407



In [23]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [24]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [25]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [26]:
model_rf_smote.fit(xr_train1,yr_train1)

In [27]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [28]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [29]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9241379310344827
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       491
           1       0.92      0.96      0.94       669

    accuracy                           0.92      1160
   macro avg       0.93      0.92      0.92      1160
weighted avg       0.92      0.92      0.92      1160



In [30]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[433  58]
 [ 30 639]]


###### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.

###### We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)

#### XGBoost

In [31]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [32]:
!pip install optuna



In [33]:
import optuna

In [34]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from optuna import Trial
import optuna

In [56]:
def objective(trial):
  params = {
      'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
      'max_depth': trial.suggest_int('max_depth', 1, 8),
      'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 10),
      'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 20),
      'n_estimators': trial.suggest_int('n_estimators', 20, 200),
      'max_leaves': trial.suggest_int('max_leaves', 2, 20),
      'verbosity':0,
      'objective': 'binary:logistic',
      'eval_metric': 'logloss',
  }
  model = XGBClassifier(**params)
  scores = cross_val_score(model, X = x_train, y = y_train, cv=3, scoring='roc_auc').mean()
  return scores

In [57]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[I 2024-06-21 22:49:27,206] A new study created in memory with name: no-name-5f994abe-4c7d-4891-95f8-67e7ebee6fe4
[I 2024-06-21 22:49:27,326] Trial 0 finished with value: 0.8449944744322151 and parameters: {'learning_rate': 0.15144595208048667, 'max_depth': 8, 'min_child_weight': 7.35722848328259, 'reg_lambda': 4.047367561286995, 'n_estimators': 36, 'max_leaves': 7}. Best is trial 0 with value: 0.8449944744322151.
[I 2024-06-21 22:49:27,541] Trial 1 finished with value: 0.8454636822918099 and parameters: {'learning_rate': 0.14524036514424019, 'max_depth': 2, 'min_child_weight': 3.995315379782441, 'reg_lambda': 11.457470292381116, 'n_estimators': 123, 'max_leaves': 6}. Best is trial 1 with value: 0.8454636822918099.
[I 2024-06-21 22:49:27,754] Trial 2 finished with value: 0.8391951515744638 and parameters: {'learning_rate': 0.1839468947593362, 'max_depth': 4, 'min_child_weight': 7.232543176400711, 'reg_lambda': 1.5282664228027658, 'n_estimators': 70, 'max_leaves': 18}. Best is trial 1 w

In [50]:
best_xgb = XGBClassifier(**study.best_params)
best_xgb

In [51]:
best_xgb.fit(xr_train, yr_train)

In [55]:
y_pred_test_xgb = best_xgb.predict(xr_test)
y_pred_test_xgb

array([1, 1, 0, ..., 1, 0, 1])

In [37]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

model = XGBClassifier()

# Fit the model to the training data
model.fit(xr_train,yr_train)

# Make predictions on the test data
y_pred = model.predict(xr_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(yr_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9553648068669528


#### Performing PCA

In [38]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [39]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [40]:
model.fit(xr_train_pca,yr_train1)

In [41]:
yr_predict_pca = model.predict(xr_test_pca)

In [42]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [43]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7258620689655172
              precision    recall  f1-score   support

           0       0.68      0.68      0.68       491
           1       0.76      0.76      0.76       669

    accuracy                           0.73      1160
   macro avg       0.72      0.72      0.72      1160
weighted avg       0.73      0.73      0.73      1160



##### With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier, and save the model so that we can use it in a later stage :)

#### Pickling the model

In [44]:
import pickle

In [45]:
filename = 'model.sav'

In [46]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [47]:
load_model = pickle.load(open(filename, 'rb'))

In [48]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [49]:
model_score_r1

0.9241379310344827

##### Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.