In [157]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from catboost import CatBoostClassifier
from feature_engine.datetime import DatetimeFeatures
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, \
    classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from ydata_profiling import ProfileReport


In [58]:
# Global scikit-learn setting: request pandas output from transformers, so the
# column names survive preprocessing.
sklearn.set_config(transform_output="pandas")

In [59]:
# Single seed shared by the global RNGs here and by the split/model cells below.
seed = 2137

# Seed the stdlib and NumPy global generators.
random.seed(seed)
np.random.seed(seed)

# Exported through the environment for anything started from this kernel.
# NOTE(review): the interpreter reads PYTHONHASHSEED at startup, so this likely
# does not change hashing inside the already-running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(seed)

## Load data

In [130]:
# Read the raw table from the 'E Comm' sheet of the Excel workbook.
# `train_raw` is kept untouched; cleaned copies are derived from it later.
train_raw = pd.read_excel("../ecomm-data/E Commerce Dataset.xlsx", sheet_name='E Comm')

# Optional one-off: write a profiling report of the raw data to report.html.
# (ProfileReport is imported with the other dependencies in the first cell.)
# ProfileReport(train_raw).to_file('report.html')

In [131]:
# Keep only rows with no missing value in any column; train_raw stays intact.
# NOTE(review): this also drops rows whose only gaps are in columns that are not
# used as features later — confirm that losing those rows is intended.
train = train_raw.dropna()

# Quick look at how the churn label is spread over days since the last order.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train['DaySinceLastOrder'], train['Churn'])
ax.set_xlabel('DaySinceLastOrder')
ax.set_ylabel('Churn')
ax.set_title('Churn vs. DaySinceLastOrder')
plt.show()

  plt.show()


In [132]:
# Name of the label column; used below to pull `y` out of the cleaned frame.
TARGET = "Churn"

In [133]:
# Columns fed to the model, in the order they appear in the feature matrix.
features = [
    'Tenure',
    'PreferredPaymentMode',
    'Gender',
    'HourSpendOnApp',
    'PreferedOrderCat',
    'SatisfactionScore',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
]

# Subset of `features` that is passed to CatBoost as categorical columns.
cat_features = [
    'PreferredPaymentMode',
    'Gender',
    'PreferedOrderCat',
]


In [142]:
# Feature matrix (selected columns only) and label vector from the cleaned frame.
X = train[features]
y = train[TARGET]

In [143]:
# Hold out 20% of the rows for evaluation; the split is seeded so it repeats.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the support column of the classification report) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

## Modelling

- date feature extraction (disabled in Attempt 1; applied in Attempt 2)

### Attempt 1

In [144]:
# Disabled in Attempt 1: datetime feature extraction. The `features` list used to
# build X has no "Date" column, and the Attempt 1 model is fit on X_train directly.
# dtf = DatetimeFeatures(features_to_extract=["year", "month", "day_of_month", "day_of_week"])

# ct = make_column_transformer((dtf, ["Date"]), remainder="passthrough", verbose_feature_names_out=False)

# preprocessor = make_pipeline(ct)


In [192]:
# Hyperparameters for the Attempt 1 model; unpacked into CatBoostClassifier(...).
# NOTE(review): this name is reassigned in the Attempt 2 section, so running
# cells out of order changes which configuration is live.
DEFAULT_MODEL_CONFIG = {
    "iterations": 5000,
    "depth": 3,
    "learning_rate": 0.05,
    "verbose": 250,  # the training log prints a line every 250 iterations
    "random_seed": seed,  # reuse the notebook-wide seed
    "auto_class_weights": "SqrtBalanced",  # class weighting; class 1 is the minority in the test report
}

In [193]:
# Attempt 1 classifier; the columns in `cat_features` are declared as categorical.
model = CatBoostClassifier(**DEFAULT_MODEL_CONFIG, cat_features=cat_features)

In [194]:
# Peek at the first rows of the training features before fitting.
X_train.head()

Unnamed: 0,Tenure,PreferredPaymentMode,Gender,HourSpendOnApp,PreferedOrderCat,SatisfactionScore,CouponUsed,OrderCount,DaySinceLastOrder
4191,18.0,UPI,Male,3.0,Laptop & Accessory,1,1.0,2.0,4.0
3346,1.0,Debit Card,Male,3.0,Mobile Phone,2,2.0,2.0,2.0
582,20.0,Credit Card,Male,3.0,Laptop & Accessory,3,0.0,2.0,7.0
3542,3.0,Credit Card,Female,3.0,Mobile Phone,2,1.0,2.0,9.0
526,20.0,E wallet,Female,2.0,Laptop & Accessory,2,5.0,7.0,7.0


In [195]:
# Fit on the training split only. No eval_set is passed, so the log below reports
# the training loss ("learn") and nothing about held-out data.
model.fit(X_train, y_train)

0:	learn: 0.6653559	total: 16.1ms	remaining: 1m 20s
250:	learn: 0.3353794	total: 1.44s	remaining: 27.2s
500:	learn: 0.2996654	total: 3.44s	remaining: 30.9s
750:	learn: 0.2754135	total: 4.9s	remaining: 27.7s
1000:	learn: 0.2571949	total: 6.13s	remaining: 24.5s
1250:	learn: 0.2427882	total: 7.1s	remaining: 21.3s
1500:	learn: 0.2310749	total: 9.65s	remaining: 22.5s
1750:	learn: 0.2208000	total: 11.9s	remaining: 22s
2000:	learn: 0.2127036	total: 13.1s	remaining: 19.6s
2250:	learn: 0.2051192	total: 14.7s	remaining: 17.9s
2500:	learn: 0.1983627	total: 18s	remaining: 18s
2750:	learn: 0.1913076	total: 20.8s	remaining: 17s
3000:	learn: 0.1854807	total: 23.4s	remaining: 15.6s
3250:	learn: 0.1805680	total: 25.3s	remaining: 13.6s
3500:	learn: 0.1761219	total: 28.6s	remaining: 12.3s
3750:	learn: 0.1714033	total: 30.9s	remaining: 10.3s
4000:	learn: 0.1669203	total: 32.7s	remaining: 8.17s
4250:	learn: 0.1628560	total: 35.1s	remaining: 6.19s
4500:	learn: 0.1580341	total: 37.4s	remaining: 4.14s
4750:	l

<catboost.core.CatBoostClassifier at 0x7f2f02437eb0>

In [197]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [198]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       630
           1       0.62      0.70      0.65       125

    accuracy                           0.88       755
   macro avg       0.78      0.81      0.79       755
weighted avg       0.88      0.88      0.88       755



### Attempt 2

In [79]:
# Expand a datetime column into calendar parts; all other columns pass through.
# NOTE(review): the `features` list that X is built from contains no "Date"
# column, so this transformer presumably fails on X_train — confirm whether this
# section belongs to a different dataset.
dtf = DatetimeFeatures(
    features_to_extract=["year", "month", "day_of_month", "day_of_week"],
)

ct = make_column_transformer(
    (dtf, ["Date"]),
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Single-step pipeline wrapping the column transformer.
preprocessor = make_pipeline(ct)


In [80]:
# Hyperparameters for the Attempt 2 model.
# NOTE(review): this reassigns the Attempt 1 DEFAULT_MODEL_CONFIG; after this cell
# runs, re-creating the Attempt 1 model picks up these values instead.
DEFAULT_MODEL_CONFIG = {
    "early_stopping_rounds": 50,  # the log reports "Stopped by overfitting detector (50 iterations wait)"
    "use_best_model": True,  # the log reports the model being shrunk to the best iteration
    "depth": 4,
    "learning_rate": 0.01,
    "verbose": 250,
    "random_seed": seed,  # reuse the notebook-wide seed
}

In [81]:
# Attempt 2 classifier, scored with accuracy during training.
# NOTE(review): "Sector" is not among the columns in `features`; verify that this
# categorical column exists in the data passed to fit().
model = CatBoostClassifier(**DEFAULT_MODEL_CONFIG, eval_metric="Accuracy", cat_features=["Sector"])

In [82]:
# Fit the preprocessor on the training split only, then apply that same fitted
# transform to the evaluation split. Re-fitting on X_test would let evaluation
# data shape the preprocessing.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

# NOTE(review): the test split doubles as the early-stopping eval_set, so the
# best iteration is chosen on the same data that is later reported as the test
# score. Consider carving a validation split out of the training data instead.
model.fit(X_train_prepared, y_train, eval_set=(X_test_prepared, y_test))

0:	learn: 0.4857363	test: 0.3815385	best: 0.3815385 (0)	total: 6.67ms	remaining: 6.66s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4338461538
bestIteration = 67

Shrink model to first 68 iterations.


<catboost.core.CatBoostClassifier at 0x7031198cb2b0>

In [83]:
# Reuse the preprocessor that was already fitted for training; only transform
# here. Calling fit_transform would re-fit the preprocessing on the test split.
y_pred = model.predict(preprocessor.transform(X_test))

In [84]:
# Overall accuracy on the test split.
# NOTE(review): the printed value equals the bestTest figure from the training
# log, because the same split was used as eval_set for model selection.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4338461538461538
