In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score
from category_encoders.cat_boost import CatBoostEncoder

In [2]:
# Read the train and test CSV files into separate DataFrames.
train_csv_path = '../data/client_attrition_train.csv'
test_csv_path = '../data/client_attrition_test.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [3]:
# Binary indicator features, listed in the order their columns are appended to the frame.
# Each entry is (new flag column, source column, rule evaluated on the source column).
# All rules are evaluated on the raw (pre-log) values of the source column.
ENGINEERED_FLAGS = [
    ('customer_available_credit_limit_below_1450', 'customer_available_credit_limit', lambda v: v < 1450),

    ('total_products_0_5', 'total_products', lambda v: v <= 5),
    ('total_products_6_10', 'total_products', lambda v: (v > 5) & (v <= 10)),
    ('total_products_more_than_10', 'total_products', lambda v: v > 10),

    ('period_inactive_0_3', 'period_inactive', lambda v: v <= 3),

    ('credit_card_debt_balance_0', 'credit_card_debt_balance', lambda v: v == 0),

    ('transaction_amount_ratio_0_05', 'transaction_amount_ratio', lambda v: v <= 0.5),
    ('transaction_amount_ratio_05_1', 'transaction_amount_ratio', lambda v: (v > 0.5) & (v <= 1)),
    ('transaction_amount_ratio_1_2', 'transaction_amount_ratio', lambda v: (v > 1) & (v <= 2)),
    ('transaction_amount_ratio_more_than_2', 'transaction_amount_ratio', lambda v: v > 2),

    ('total_transaction_count_0_25', 'total_transaction_count', lambda v: v <= 25),
    ('total_transaction_count_26_50', 'total_transaction_count', lambda v: (v > 25) & (v <= 50)),
    ('total_transaction_count_51_100', 'total_transaction_count', lambda v: (v > 50) & (v <= 100)),
    ('total_transaction_count_more_than_100', 'total_transaction_count', lambda v: v > 100),

    ('transaction_count_ratio_0_05', 'transaction_count_ratio', lambda v: v <= 0.5),
    ('transaction_count_ratio_05_1', 'transaction_count_ratio', lambda v: (v > 0.5) & (v <= 1)),
    ('transaction_count_ratio_1_2', 'transaction_count_ratio', lambda v: (v > 1) & (v <= 2)),
    ('transaction_count_ratio_more_than_2', 'transaction_count_ratio', lambda v: v > 2),

    ('average_utilization_0_02', 'average_utilization', lambda v: v <= 0.2),
    ('average_utilization_02_06', 'average_utilization', lambda v: (v > 0.2) & (v <= 0.6)),
    ('average_utilization_more_than_06', 'average_utilization', lambda v: v > 0.6),
]

# Columns replaced in place by their natural logarithm.
LOG_TRANSFORMED_COLUMNS = [
    'customer_available_credit_limit',
    'remaining_credit_limit',
    'total_transaction_amount',
]


def add_engineered_features(frame):
    """Return a copy of ``frame`` without ``customer_id`` and with engineered features.

    The same transformation is applied to the train and the test split so the
    two can never drift apart through copy-paste edits.

    Steps, in order:
      1. Drop the ``customer_id`` column.
      2. Append one 0/1 column per entry of ``ENGINEERED_FLAGS`` (in list order).
      3. Replace every column in ``LOG_TRANSFORMED_COLUMNS`` by ``np.log`` of itself.

    Flags are computed before the log transform, so thresholds such as 1450
    refer to the raw values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``customer_id`` and every source column referenced by
        ``ENGINEERED_FLAGS`` and ``LOG_TRANSFORMED_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    KeyError
        If ``customer_id`` or a required source column is missing (this also
        happens when the cell is re-run on an already transformed frame).
    """
    out = frame.drop('customer_id', axis=1)

    for flag_column, source_column, rule in ENGINEERED_FLAGS:
        # Comparisons against NaN are False, so missing values get flag 0.
        out[flag_column] = np.where(rule(out[source_column]), 1, 0)

    for column in LOG_TRANSFORMED_COLUMNS:
        out[column] = np.log(out[column])

    return out


df_train = add_engineered_features(df_train)
df_test = add_engineered_features(df_test)

In [4]:
# Categorical columns to be replaced by CatBoost target statistics.
cols_to_be_encoded = ['customer_sex', 'customer_education', 'customer_civil_status', 'customer_salary_range',
                      'credit_card_classification']

# `cols` and `handle_missing` are constructor options of the encoder. Passing them
# to `fit()` only puts them into its unused **kwargs, so the encoder would silently
# run with its defaults (auto-detected columns, handle_missing='value').
cb_encoder = CatBoostEncoder(cols=cols_to_be_encoded, handle_missing='return_nan')

# Binary target: 0 for an open account, 1 for a closed one.
df_train['y'] = df_train.account_status.map({'open': 0, 'closed': 1})
if df_train['y'].isna().any():
    # `map` yields NaN for labels outside the mapping; fail here with a clear
    # message instead of later inside the encoder or the model.
    raise ValueError("account_status contains values other than 'open' and 'closed'")
features_train = df_train.drop(columns=['account_status', 'y'])

cb_encoder.fit(X=features_train, y=df_train['y'])
# NOTE(review): the training frame is transformed without `y`, i.e. the same way as
# the test frame. Confirm this is intended rather than `transform(features_train, y)`.
df_train_encoded = cb_encoder.transform(features_train)

# Re-attach the target to the encoded training features; encode the test frame
# with the encoder fitted on the training data.
df_train = pd.concat([df_train_encoded, df_train['y']], axis=1)
df_test = cb_encoder.transform(df_test)

In [5]:
# Separate the target column from the training features; the test frame has no
# target and is used as-is.
y_train = df_train.y
X_train = df_train.drop(columns='y')
X_test = df_test

In [6]:
# Numeric columns that get standardized; every other column is passed through unchanged.
num_col = [
    'customer_age',
    'customer_number_of_dependents',
    'customer_relationship_length',
    'customer_available_credit_limit',
    'total_products',
    'period_inactive',
    'contacts_in_last_year',
    'credit_card_debt_balance',
    'remaining_credit_limit',
    'transaction_amount_ratio',
    'total_transaction_amount',
    'total_transaction_count',
    'transaction_count_ratio',
    'average_utilization',
]
std = StandardScaler()

# Fit the scaler on the training rows and keep the scaled values as a DataFrame
# with a fresh positional index.
arr = pd.DataFrame(std.fit_transform(X_train[num_col]), columns=num_col)

# Replace the raw numeric columns with the scaled ones. The index is reset first
# so the column-wise concat aligns rows by position; the scaled columns end up
# at the right-hand side of the frame.
X_train = pd.concat(
    [X_train.reset_index(drop=True).drop(num_col, axis=1), arr],
    axis=1,
)

In [7]:
# Scale the test frame's numeric columns with the scaler fitted on the training data
# (transform only, no refit).
arr1 = pd.DataFrame(std.transform(X_test[num_col]), columns=num_col)

# Same column swap as for the training frame: drop the raw numeric columns and
# append the scaled ones, aligned by positional index.
X_test = pd.concat(
    [X_test.reset_index(drop=True).drop(num_col, axis=1), arr1],
    axis=1,
)
X_test.head()

Unnamed: 0,customer_sex,customer_education,customer_civil_status,customer_salary_range,credit_card_classification,customer_available_credit_limit_below_1450,total_products_0_5,total_products_6_10,total_products_more_than_10,period_inactive_0_3,...,total_products,period_inactive,contacts_in_last_year,credit_card_debt_balance,remaining_credit_limit,transaction_amount_ratio,total_transaction_amount,total_transaction_count,transaction_count_ratio,average_utilization
0,0.146807,0.152032,0.151271,0.169894,0.160979,0,1,0,0,1,...,-0.988694,-0.337598,-1.315636,0.556092,1.653133,-0.116365,0.876686,0.815516,-0.314313,-0.804778
1,0.146807,0.152016,0.151271,0.135196,0.160979,0,1,0,0,1,...,-0.046282,-1.327136,-0.411616,-0.84768,0.350713,-0.724365,-0.571729,-0.462635,-0.099894,-0.81529
2,0.17197,0.159382,0.151271,0.171022,0.160979,0,1,0,0,1,...,-0.674557,-1.327136,-0.411616,-0.196104,-1.02203,-0.227346,2.973127,0.517281,0.057328,0.912837
3,0.17197,0.155692,0.151271,0.171022,0.160979,0,0,1,0,1,...,0.581993,-1.327136,-0.411616,0.272638,-0.538808,-0.243541,0.284557,0.517281,-0.455532,0.70195
4,0.146807,0.155692,0.151271,0.171022,0.160979,0,0,1,0,1,...,0.581993,-0.337598,-0.411616,0.64935,1.391418,-0.086054,-0.048361,0.26165,-0.133067,-0.738717


In [8]:
# Columns fed to the model, in the order the model will see them.
selected_features = [
    'total_transaction_amount',
    'total_transaction_count',
    'transaction_amount_ratio',
    'credit_card_debt_balance',
    'total_products',
    'transaction_count_ratio',
    'customer_age',
    'contacts_in_last_year',
    'period_inactive',
    'credit_card_debt_balance_0',
    'period_inactive_0_3',
    'total_products_0_5',
    'total_products_more_than_10',
    'total_transaction_count_0_25',
    'total_transaction_count_more_than_100',
    'transaction_amount_ratio_0_05',
    'transaction_amount_ratio_1_2',
    'transaction_amount_ratio_more_than_2',
    'transaction_count_ratio_0_05',
    'transaction_count_ratio_more_than_2',
    'customer_relationship_length',
    'average_utilization_0_02',
    'customer_civil_status',
    'total_transaction_count_51_100',
    'average_utilization_02_06',
    'transaction_amount_ratio_05_1',
    'remaining_credit_limit',
    'average_utilization_more_than_06',
    'total_transaction_count_26_50',
    'customer_sex',
    'credit_card_classification',
    'transaction_count_ratio_1_2',
    'average_utilization',
    'transaction_count_ratio_05_1',
    'customer_number_of_dependents',
    'customer_salary_range',
    'customer_education',
    'customer_available_credit_limit',
    'customer_available_credit_limit_below_1450',
    'total_products_6_10',
]

# Project both frames onto the same columns in the same order.
X_train_final, X_test_final = X_train[selected_features], X_test[selected_features]

In [9]:
# Weight for the positive class (y == 1): number of negatives divided by number of positives.
scale_pos_weight = (y_train.shape[0] - y_train.sum()) / y_train.sum()

# Hyper-parameters of the gradient-boosted classifier.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='aucpr',
    scale_pos_weight=scale_pos_weight,
    n_estimators=200,
    learning_rate=0.07,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train_final, y_train)

# Predictions on the training data itself: hard labels and probability of class 1.
y_pr_train = model.predict(X_train_final)
y_pred_train = model.predict_proba(X_train_final)[:, 1]

# Label-based metrics use the hard predictions; AUC uses the probabilities.
con_mat = confusion_matrix(y_train, y_pr_train)
acc_train = accuracy_score(y_train, y_pr_train)
class_re = classification_report(y_train, y_pr_train)
auc_train = roc_auc_score(y_train, y_pred_train)
balanced_accuracy_train = balanced_accuracy_score(y_train, y_pr_train)

# These figures are computed on the data the model was fitted on.
print("Train confusion Matrix:\n", con_mat)
print("\n")
print("Train accuracy of the model:", acc_train * 100)
print("\n")
print("Train classification report:\n", class_re)
print("Train AUC of the model:", auc_train)
print("Train balanced accuracy of the model:", balanced_accuracy_train)

Train confusion Matrix:
 [[8247  253]
 [  21 1606]]


Train accuracy of the model: 97.2943616075837


Train classification report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      8500
           1       0.86      0.99      0.92      1627

    accuracy                           0.97     10127
   macro avg       0.93      0.98      0.95     10127
weighted avg       0.98      0.97      0.97     10127

Train AUC of the model: 0.9975977439531436
Train balanced accuracy of the model: 0.9786640514841463


In [10]:
# Predict a hard class label for every row of the test feature frame with the fitted model.
preds = model.predict(X_test_final)

In [11]:
# Show the predicted label array as the cell output.
preds

array([0, 1, 0, ..., 0, 0, 1])

In [12]:
# Write the predicted test labels to CSV: a single unnamed column plus the
# positional row index.
# NOTE(review): `customer_id` was dropped earlier in the notebook, so rows in this
# file can only be matched to customers by position - confirm that is sufficient.
test_predictions = pd.DataFrame(preds)
test_predictions.to_csv('../data/client_attrition_test_preds.csv')