# Who cut my card?

## Setup

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml, time, sys, os

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  
sns.set_style("darkgrid")

DATASET = "Churn_Kaggle"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = True

SEED = 666

## Imports

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import chi2,SelectPercentile, RFECV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_recall_curve, auc, make_scorer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import warnings

## Dataset

In [47]:
df_train = pd.read_pickle(f"{ROOT}/data/df_train.pkl")
df_test = pd.read_pickle(f"{ROOT}/data/df_test.pkl")
print(f"Train:{df_train.shape}", f"Test:{df_test.shape}")
df_train.head(1)

Train:(6291, 30) Test:(3542, 29)


Unnamed: 0,Customer,Churn,Card,Start_Date,Customer_Service_Calls,Credit_Limit,Total_Revolving_Balance,Average_Open_To_Buy,Average_Utilisation_Ratio,Age,Gender,Education,Marital_Status,Dependents,Income,Days,Credit_Card,Current,Deposit,Investment,Joint,Loan,Mortgage,On_Demand_Deposit,Num_Transaction,Max_Transaction,Min_Transaction,Mean_Transaction,Std_Transaction,Sum_Transaction
0,797197508,No,0,2020-06-01,1,2315.0,1565.0,750.0,0.676,48,F,3,Married,2,1,548.0,Yes,Yes,Yes,Yes,No,No,Yes,No,90,342.82,8.59,53.644,51.087679,4827.96


# Model

In [48]:
target = "Churn"

In [49]:
cat_features = [c for c in df_train.select_dtypes("category").columns if c not in target]
num_features = [c for c in df_train.select_dtypes(["int","float"]).columns if c not in target]
features = cat_features + num_features

print(f"Traget: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")
print(f"Number of Features: {len(features)}")

Traget: Churn
Categorical Features: ['Gender', 'Marital_Status', 'Credit_Card', 'Current', 'Deposit', 'Investment', 'Joint', 'Loan', 'Mortgage', 'On_Demand_Deposit']
Numerical Features: ['Customer', 'Card', 'Customer_Service_Calls', 'Credit_Limit', 'Total_Revolving_Balance', 'Average_Open_To_Buy', 'Average_Utilisation_Ratio', 'Age', 'Education', 'Dependents', 'Income', 'Days', 'Num_Transaction', 'Max_Transaction', 'Min_Transaction', 'Mean_Transaction', 'Std_Transaction', 'Sum_Transaction']
Number of Features: 28


In [50]:
df_train[target].value_counts(normalize=True)

No     0.842314
Yes    0.157686
Name: Churn, dtype: float64

In [51]:
list(features).remove('Age')

In [52]:
X_train = df_train[features]
y_train = df_train[target].replace({"No":0,"Yes":1})
X_test = df_test[features]


In [53]:
cat_preprocessor = Pipeline (
    steps = [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown="ignore")),
        ('select', SelectPercentile(chi2, percentile=80)),    
])

num_preprocessor = Pipeline (
    steps = [
        ('impute', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),    
])

preprocessor = ColumnTransformer(
    transformers = [
        ('cat', cat_preprocessor, cat_features),
        ('num', num_preprocessor, num_features), 
])

In [54]:
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [55]:
df_tmp_train = pd.DataFrame(X_train)
df_tmp_test = pd.DataFrame(X_test)

In [56]:
print("Number of features starting:", len(features))

Number of features starting: 28


In [57]:
rfecv = RFECV(estimator = XGBClassifier() , step=1, cv=StratifiedKFold(2), scoring="accuracy", n_jobs = -1)
rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

Optimal number of features : 13


In [58]:
# list of features selected
if len(rfecv.support_)<len(features):
    print("It looks like you have run cells out of order. Runn cells from Pre-Processing Data (2)")

features = pd.DataFrame(X_train).columns
features_selected = np.array(features)[rfecv.support_].tolist()
print(len(features_selected))
print(features_selected)
X_train = pd.DataFrame(X_train)[features_selected]
X_test = pd.DataFrame(X_test)[features_selected]

13
[0, 2, 4, 6, 12, 14, 18, 19, 20, 23, 28, 31, 33]


In [59]:
X_train.head(1)

Unnamed: 0,0,2,4,6,12,14,18,19,20,23,28,31,33
0,1.0,0.0,0.0,0.0,0.0,1.0,-1.307031,-0.696694,0.489673,0.107625,1.053427,-0.343572,0.114217


[0, 2, 4, 6, 12, 14, 18, 19, 20, 23, 28, 31, 33]

In [60]:
def grid(X,y):
    param_grid_rf = {'n_estimators': [100,300,600,1000]}
    param_grid_gb = {'n_estimators': [100, 300, 600, 1000], 'max_depth': [3, 4, 5, 6, 7]}

    scoring = {'accuracy': make_scorer(accuracy_score)}

    rf = RandomForestClassifier()
    gb = GradientBoostingClassifier()

    grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring=scoring, refit='accuracy')
    grid_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring=scoring, refit='accuracy')

    grid_rf.fit(X, y)
    grid_gb.fit(X, y)

    print("Random Forest: Best Hyperparameters: ", grid_rf.best_params_, "Accuracy: ", grid_rf.best_score_)
    print("Gradient Boosting: Best Hyperparameters: ", grid_gb.best_params_, "Accuracy: ", grid_gb.best_score_)


In [61]:
# grid(X_train,y_train)

* Random Forest: Best Hyperparameters:  {'n_estimators': 1000} Accuracy:  0.9519954893921161
* Gradient Boosting: Best Hyperparameters:  {'max_depth': 4, 'n_estimators': 300} Accuracy:  0.9534258268921635

In [62]:
rf_pipeline = Pipeline([
    ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=-1)),
])

gb_pipeline = Pipeline([
    ('clf', GradientBoostingClassifier(n_estimators=300,max_depth=4)),
])

xb_pipeline = Pipeline([
    ('clf', XGBClassifier()),
])

voting_clf = VotingClassifier(
    estimators=[('rf', rf_pipeline), ('gb', gb_pipeline), ('xb', xb_pipeline)],
    voting='soft'
)

In [63]:
voting_clf.fit(X_train, y_train)

In [64]:
y_pred = voting_clf.predict(X_train)

In [65]:
print(classification_report(y_train, y_pred, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      5299
           1     1.0000    1.0000    1.0000       992

    accuracy                         1.0000      6291
   macro avg     1.0000    1.0000    1.0000      6291
weighted avg     1.0000    1.0000    1.0000      6291



In [66]:
y_pred = voting_clf.predict(X_test)

In [67]:
df_pred = pd.DataFrame(y_pred)
print(df_pred.shape, X_test.shape)

(3542, 1) (3542, 13)


In [68]:
df = df_test.copy()
df[target] = df_pred[0]
df = df[["Customer",target]].replace({0:"No",1:"Yes"})
df.head()

Unnamed: 0,Customer,Churn
0,774663629,No
1,720420396,No
2,815283379,No
3,764861610,No
4,742798818,No


In [69]:
df[target].value_counts(normalize=True)

No     0.851496
Yes    0.148504
Name: Churn, dtype: float64

In [70]:
# df.to_csv(f"{ROOT}/output/submission9.csv", index=False) # SCORE 0.948

In [71]:
# df.to_csv(f"{ROOT}/output/submission18.csv", index=False) # SCORE 0.95