## Modeling

In [None]:
# Convert the `churn` target into integer class codes; the fitted encoder
# is kept in `le` so the codes can be mapped back to the original labels.
le = LabelEncoder()
churn_codes = le.fit_transform(data['churn'])
data['churn'] = churn_codes

In [None]:
# One-hot encode the two plan columns. `drop_first=True` drops the first
# level of each column, and the indicator columns are emitted as integers.
cat = data[['international_plan', 'voice_mail_plan']]
cat_ohe = pd.get_dummies(cat, drop_first=True, dtype='int')

# Append the indicator columns to the full frame, then remove the original
# categorical columns they replace.
merged_df = pd.concat([data, cat_ohe], axis=1).drop(columns=cat.columns)

# Preview the first three rows of the encoded frame.
merged_df[:3]


In [None]:
# Target vector and feature matrix (everything except `churn`).
y = merged_df['churn']
X = merged_df.drop(columns='churn')

# Hold out 20% of the rows for testing; stratifying on `y` keeps the class
# proportions the same in both splits, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Oversample the training split only with SMOTE; the test split is left
# untouched so evaluation reflects the real class distribution.
# NOTE(review): resampling happens before feature scaling here — confirm
# that is intended, since SMOTE interpolates between neighbours in feature space.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Compare class counts before and after resampling to confirm SMOTE worked.
original_counts = y_train.value_counts()
resampled_counts = y_train_sm.value_counts()
print(f" Original values \n {original_counts}\n")
print(f"Smoted values \n {resampled_counts}")

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# resampled training data only and then reused for the test split.
ss = StandardScaler()
ss.fit(X_train_sm)
X_train_s = ss.transform(X_train_sm)
X_test_s = ss.transform(X_test)

In [None]:
# Fit every candidate classifier on the resampled, scaled training data and
# collect hold-out metrics for each one in a single comparison table.
models = {'logistic regression':LogisticRegression(random_state=42,max_iter=1000),
          'Decision tree': DecisionTreeClassifier(random_state=42),
          'Random Forest':RandomForestClassifier(random_state=42,n_estimators=200),
          'Xgboost':XGBClassifier(random_state=42,use_label_encoder=False,eval_metric='logloss')
          }
results = []
for name, model in models.items():
    model.fit(X_train_s,y_train_sm)
    y_pred = model.predict(X_test_s)
    # Predicted probability for the class in column 1 of predict_proba.
    y_prob = model.predict_proba(X_test_s)[:,1]

    # 'train score' / 'test score' come from model.score and are fractions,
    # whereas the remaining metrics are reported as percentages.
    results.append({'Model':name,
                    'train score':model.score(X_train_s,y_train_sm),
                    'test score':model.score(X_test_s,y_test),
                    'Accuracy':accuracy_score(y_test,y_pred)*100,
                    'Recall':recall_score(y_test,y_pred)*100,
                    'Precision':precision_score(y_test,y_pred)*100,
                    'F1 score':f1_score(y_test,y_pred)*100,
                    # ROC AUC is a ranking metric: it must be computed from the
                    # continuous scores, not from the thresholded 0/1 predictions.
                    'AUC':roc_auc_score(y_test,y_prob)*100
                })
results_df = pd.DataFrame(results).round(2)
# Fix the column order explicitly for the printed report.
results_df = results_df[["Model",'train score','test score', "Accuracy", "Recall", "Precision", "F1 score", "AUC"]]
print(results_df.to_string(index=False))

In [None]:
# Evaluate every model with 10-fold stratified cross-validation.
# Built-in scorer names are used instead of make_scorer(...):
#   * roc_auc_score does not accept a `pos_label` argument, so the previous
#     make_scorer(roc_auc_score, ..., pos_label=1) could not score correctly;
#   * the `needs_proba` flag of make_scorer is deprecated in recent
#     scikit-learn releases.
# The dictionary keys are unchanged, so the result columns keep their names.
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    'pr_auc': 'average_precision',
}
cv = StratifiedKFold(random_state=42,n_splits=10,shuffle=True)
cv_results = {}
# NOTE(review): this cross-validates on the raw X / y (no SMOTE, no scaling),
# unlike the hold-out evaluation above — confirm that this is intended.
for name,model in models.items():
    cv_result = cross_validate(model,X,y,cv=cv,scoring=scoring,return_train_score=True)
    # Mean test-fold value of each metric, expressed as a percentage.
    cv_results[name]={metric:cv_result['test_'+metric].mean()*100 for metric in scoring}
    # Mean train-fold accuracy, kept alongside to expose over-fitting.
    cv_results[name]['train_accuracy'] = cv_result['train_accuracy'].mean()*100

# One row per model, one column per metric.
cv_results_df = pd.DataFrame(cv_results).T.round(2)
print(cv_results_df)

In [None]:
# Hyperparameter tuning: exhaustive grid search over an XGBoost classifier,
# selecting the combination with the best 5-fold cross-validated recall.
# NOTE(review): the search is fitted on the already-resampled training data,
# so validation folds may contain synthetic samples — confirm this is acceptable.
xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
param_grid_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [100, 200, 300],
}
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='recall',
    cv=5,
)
grid_search_xgb.fit(X_train_s, y_train_sm)

In [None]:
# Keep the refitted winner of the grid search and report its settings.
best_params = grid_search_xgb.best_params_
best_xgb = grid_search_xgb.best_estimator_
print(f'the best parameters are: {best_params}')

In [None]:
# Evaluate the tuned XGBoost model on the held-out test split.
y_best_pred = best_xgb.predict(X_test_s)
# Predicted probability for the class in column 1 of predict_proba.
y_best_prob = best_xgb.predict_proba(X_test_s)[:,1]
print(f'classification report:\n{classification_report(y_test,y_best_pred)}')
# Use the tuned model's own probabilities; `y_prob` is a leftover from the
# earlier model-comparison loop and belongs to a different estimator.
print("AUC: {:.2f}".format(roc_auc_score(y_test, y_best_prob)))