In [43]:
import pandas as pd
import gzip

import joblib
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    classification_report,
    roc_auc_score,
    recall_score,
)
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import optuna
import json


In [2]:
cleaned_dataset_address = "dataset/interim/cleaned_dataset.csv.gz"
SEED = 87

In [3]:
with gzip.open(cleaned_dataset_address, mode="r") as f:
        df = pd.read_csv(f)

## Feature Extraction

In [4]:
app_cols = [
    "is_using_RitimGo",
    "is_using_İzleGo",
    "is_using_CüzdanX",
    "is_using_Konuşalım",
    "is_using_HızlıPazar",
]


In [5]:
def insert_binary_app_combinations(df: pd.DataFrame, columns):
    interactions = pd.DataFrame()

    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            col1, col2 = columns[i], columns[j]
            interaction_col_name = f"{col1}_X_{col2}"
            interactions[interaction_col_name] = df[col1] * df[col2]

    return pd.concat([df, interactions], axis=1)


In [6]:
df = ( 
    df
    .pipe(insert_binary_app_combinations, columns=app_cols)
    .assign(**{
        "yearly_tenure" : lambda df: df["tenure"] * 12,
        "total_avg_call" : lambda df: df["avg_call_duration"] + df["roaming_usage"],
    })
    .astype({col: "int32" for col in df.select_dtypes("int64").columns})
    .astype({col: "float32" for col in df.select_dtypes("float64").columns})
)

In [7]:
TARGET = "churn"
X = df.loc[:, df.columns != "churn"]
y = df.loc[:, df.columns == "churn"]

In [8]:
# %80 train, %10 val, %10 test

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=SEED
)


In [9]:
def get_class_weights(y_array: pd.DataFrame):
    classes = np.unique(y_val)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_val.values.ravel())
    class_weights = dict(zip(classes, weights))
    return class_weights

## Feature Selection

In [27]:
X_train_sel, X_val_sel, y_train_sel, y_val_sel = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

feature_names = list(X_train_sel.columns)
train_pool = Pool(X_train_sel, y_train_sel, feature_names=feature_names)
test_pool = Pool(X_val_sel, y_val_sel, feature_names=feature_names)

class_weights = get_class_weights(y_train_sel)
model = CatBoostClassifier(iterations=500, random_seed=SEED, early_stopping_rounds=50, class_weights=class_weights)
num_of_columns_to_select = round(len(X_train.columns) ** (1 / 2))

summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select='0-29',
    num_features_to_select=num_of_columns_to_select,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Silent',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
model

In [None]:
# with open("models/v1/feature_selection_summary.json", "w") as f:
#     json.dump(summary, f)

In [11]:
best_columns = ['age', 'service_type', 'overdue_payments', 'auto_payment', 'avg_top_up_count']

## Model Baseline

### Probabilistic Random Prediction

In [65]:
def generate_random_values_numpy(series: pd.Series):
    churn_count = series.value_counts()[1]
    non_churn_count = series.value_counts()[0]
    
    total = (churn_count + non_churn_count)
    probability = churn_count / total

    # Use numpy.random.binomial for efficient generation of 0s and 1s
    random_array = np.random.binomial(1, probability, size=total) 

    return random_array


In [67]:
pred = generate_random_values_numpy(y_val)

In [68]:
print(classification_report(y_val, pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    986635
           1       0.01      0.02      0.01     13365

    accuracy                           0.97   1000000
   macro avg       0.50      0.50      0.50   1000000
weighted avg       0.97      0.97      0.97   1000000



In [69]:
print(f1_score(y_val, pred, zero_division=1))

0.014988814317673379


### Catboost Baseline

In [71]:
cat = CatBoostClassifier()

In [72]:
model = cat.fit(X_train, y_train)

Learning rate set to 0.478138
0:	learn: 0.1453362	total: 547ms	remaining: 9m 6s
1:	learn: 0.0736644	total: 956ms	remaining: 7m 57s
2:	learn: 0.0633777	total: 1.37s	remaining: 7m 35s
3:	learn: 0.0612379	total: 1.79s	remaining: 7m 25s
4:	learn: 0.0601021	total: 2.15s	remaining: 7m 7s
5:	learn: 0.0597034	total: 2.52s	remaining: 6m 56s
6:	learn: 0.0594161	total: 2.9s	remaining: 6m 51s
7:	learn: 0.0590848	total: 3.29s	remaining: 6m 48s
8:	learn: 0.0589733	total: 3.69s	remaining: 6m 46s
9:	learn: 0.0588907	total: 4.09s	remaining: 6m 45s
10:	learn: 0.0588201	total: 4.49s	remaining: 6m 43s
11:	learn: 0.0587305	total: 4.85s	remaining: 6m 39s
12:	learn: 0.0586622	total: 5.25s	remaining: 6m 39s
13:	learn: 0.0585240	total: 5.67s	remaining: 6m 39s
14:	learn: 0.0584561	total: 6.09s	remaining: 6m 40s
15:	learn: 0.0584196	total: 6.47s	remaining: 6m 38s
16:	learn: 0.0583873	total: 6.87s	remaining: 6m 37s
17:	learn: 0.0583453	total: 7.3s	remaining: 6m 38s
18:	learn: 0.0583206	total: 7.69s	remaining: 6m 

In [73]:
pred = cat.predict(X_val)

In [74]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    986635
           1       0.73      0.02      0.04     13365

    accuracy                           0.99   1000000
   macro avg       0.86      0.51      0.52   1000000
weighted avg       0.98      0.99      0.98   1000000



In [75]:
print(f1_score(y_val, pred, zero_division=1))

0.03719369894982497


In [82]:
model.save_model("models/v1/baseline_catboost")

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    986635
           1       0.73      0.02      0.04     13365

    accuracy                           0.99   1000000
   macro avg       0.86      0.51      0.52   1000000
weighted avg       0.98      0.99      0.98   1000000


f1: 0.03719369894982497

### LightGBM Baseline

In [77]:
lgb = LGBMClassifier()

In [84]:
lgb = lgb.fit(X_train, y_train.values.ravel())

[LightGBM] [Info] Number of positive: 106922, number of negative: 7893078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068963 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2446
[LightGBM] [Info] Number of data points in the train set: 8000000, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.013365 -> initscore=-4.301642
[LightGBM] [Info] Start training from score -4.301642


In [85]:
pred = lgb.predict(X_val)

In [86]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    986635
           1       0.50      0.01      0.02     13365

    accuracy                           0.99   1000000
   macro avg       0.74      0.51      0.51   1000000
weighted avg       0.98      0.99      0.98   1000000



In [87]:
print(f1_score(y_val, pred, zero_division=1))

0.021525845658222287


              precision    recall  f1-score   support


           0       0.99      1.00      0.99    986635
           1       0.50      0.01      0.02     13365

    accuracy                           0.99   1000000
   macro avg       0.74      0.51      0.51   1000000
weighted avg       0.98      0.99      0.98   1000000

0.021525845658222287

## With class weights

In [None]:
cat_with_weights = CatBoostClassifier(class_weights=get_class_weights(y_train))

In [106]:
cat_with_weights = cat_with_weights.fit(X_train, y_train)

Learning rate set to 0.478138
0:	learn: 0.5240564	total: 520ms	remaining: 8m 39s
1:	learn: 0.4792418	total: 974ms	remaining: 8m 6s
2:	learn: 0.4687009	total: 1.39s	remaining: 7m 42s
3:	learn: 0.4624546	total: 1.84s	remaining: 7m 38s
4:	learn: 0.4600839	total: 2.26s	remaining: 7m 29s
5:	learn: 0.4585740	total: 2.65s	remaining: 7m 19s
6:	learn: 0.4579940	total: 3.02s	remaining: 7m 8s
7:	learn: 0.4574968	total: 3.41s	remaining: 7m 3s
8:	learn: 0.4545469	total: 3.84s	remaining: 7m 3s
9:	learn: 0.4534210	total: 4.27s	remaining: 7m 2s
10:	learn: 0.4523398	total: 4.72s	remaining: 7m 4s
11:	learn: 0.4517340	total: 5.11s	remaining: 7m
12:	learn: 0.4513655	total: 5.56s	remaining: 7m 2s
13:	learn: 0.4510574	total: 5.97s	remaining: 7m
14:	learn: 0.4507655	total: 6.4s	remaining: 7m
15:	learn: 0.4505345	total: 6.78s	remaining: 6m 56s
16:	learn: 0.4499834	total: 7.22s	remaining: 6m 57s
17:	learn: 0.4495897	total: 7.62s	remaining: 6m 55s
18:	learn: 0.4493722	total: 8s	remaining: 6m 53s
19:	learn: 0.44

In [107]:
pred = cat_with_weights.predict(X_val)

In [108]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       1.00      0.74      0.85    986635
           1       0.04      0.83      0.08     13365

    accuracy                           0.74   1000000
   macro avg       0.52      0.78      0.47   1000000
weighted avg       0.98      0.74      0.84   1000000



In [109]:
print(f1_score(y_val, pred, zero_division=1))

0.07941758044220744


In [110]:
cat_with_weights.save_model("models/v1/catboost_with_weights")

              precision    recall  f1-score   support

           0       1.00      0.74      0.85    986635
           1       0.04      0.83      0.08     13365

    accuracy                           0.74   1000000
   macro avg       0.52      0.78      0.47   1000000
weighted avg       0.98      0.74      0.84   1000000


0.07941758044220744

## Final (Class weights + hyperparameter optimization)

In [35]:
def optimize_catboost(trial):
    X_train_op, X_valid_op, y_train_op, y_valid_op = train_test_split(X_train.loc[:, best_columns], y_train, test_size=0.25, stratify=y_train)

    class_weights = get_class_weights(y_train_op)
    param_grid = {
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000, step=100),
    }


    cat_op = CatBoostClassifier(**param_grid, class_weights=class_weights)
    model = cat_op.fit(X_train_op, y_train_op)

    val_pred = model.predict(X_valid_op)

    return f1_score(y_valid_op, val_pred)
    

In [36]:
best_cat_hypers_with_cw_study = optuna.create_study()

[I 2025-02-07 14:38:24,860] A new study created in memory with name: no-name-a1e35ceb-ef52-4eee-9a30-33cde8540076


In [37]:
best_params = best_cat_hypers_with_cw_study.optimize(
    optimize_catboost, n_trials=25, n_jobs=-1, show_progress_bar=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

0:	learn: 0.6783148	total: 1.67s	remaining: 44m 35s
1:	learn: 0.6648907	total: 2.64s	remaining: 35m 10s
0:	learn: 0.6687589	total: 2.75s	remaining: 1h 4m 14s
2:	learn: 0.6527524	total: 3.82s	remaining: 33m 52s
0:	learn: 0.6892315	total: 1.59s	remaining: 50m 13s
0:	learn: 0.6730612	total: 1.43s	remaining: 9m 31s
3:	learn: 0.6417798	total: 5.53s	remaining: 36m 47s
1:	learn: 0.6854143	total: 3.38s	remaining: 53m 33s
1:	learn: 0.6555559	total: 3.1s	remaining: 10m 16s
1:	learn: 0.6475176	total: 5.47s	remaining: 1h 3m 40s
0:	learn: 0.6696300	total: 2.52s	remaining: 29m 20s
4:	learn: 0.6318573	total: 7.01s	remaining: 37m 16s
2:	learn: 0.6403153	total: 4.57s	remaining: 10m 5s
2:	learn: 0.6816908	total: 5.19s	remaining: 54m 41s
0:	learn: 0.6905144	total: 2.27s	remaining: 1h 30s
5:	learn: 0.6228936	total: 8.92s	remaining: 39m 29s
0:	learn: 0.6923622	total: 3.05s	remaining: 55m 51s
1:	learn: 0.6489560	total: 5.37s	remaining: 31m 14s
3:	learn: 0.6270530	total: 6.82s	remaining: 11m 14s
2:	learn: 0.

: 

: 

In [None]:
def optimize_catboost(trial):
    X_train_op, X_valid_op, y_train_op, y_valid_op = train_test_split(X_train.loc[:, best_columns], y_train, test_size=0.25, stratify=y_train)

    class_weights = get_class_weights(y_train_op)
    param_grid = {
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000, step=100),
    }


    cat_op = CatBoostClassifier(**param_grid, class_weights=class_weights)
    model = cat_op.fit(X_train_op, y_train_op)

    val_pred = model.predict(X_valid_op)

    return f1_score(y_valid_op, val_pred)
    

In [None]:
cat_final_best_params = {'max_depth': 5, 'min_child_samples': 78, 'learning_rate': 0.0023182678872253, 'n_estimators': 100}

In [27]:
cat_final = CatBoostClassifier(**best_params, class_weights=get_class_weights(y_train))

In [28]:
cat_final_model = cat_final.fit(X_train, y_train)

0:	learn: 0.6920343	total: 657ms	remaining: 1m 37s
1:	learn: 0.6907927	total: 1.34s	remaining: 1m 39s
2:	learn: 0.6896941	total: 2.01s	remaining: 1m 38s
3:	learn: 0.6886013	total: 2.67s	remaining: 1m 37s
4:	learn: 0.6875192	total: 3.41s	remaining: 1m 38s
5:	learn: 0.6863038	total: 4.2s	remaining: 1m 40s
6:	learn: 0.6850979	total: 4.87s	remaining: 1m 39s
7:	learn: 0.6840294	total: 5.61s	remaining: 1m 39s
8:	learn: 0.6828378	total: 6.31s	remaining: 1m 38s
9:	learn: 0.6817817	total: 7.04s	remaining: 1m 38s
10:	learn: 0.6807335	total: 7.78s	remaining: 1m 38s
11:	learn: 0.6795620	total: 8.48s	remaining: 1m 37s
12:	learn: 0.6785225	total: 9.16s	remaining: 1m 36s
13:	learn: 0.6774911	total: 9.97s	remaining: 1m 36s
14:	learn: 0.6763377	total: 10.7s	remaining: 1m 36s
15:	learn: 0.6753146	total: 11.6s	remaining: 1m 37s
16:	learn: 0.6741766	total: 12.4s	remaining: 1m 37s
17:	learn: 0.6731672	total: 13.1s	remaining: 1m 36s
18:	learn: 0.6721617	total: 13.9s	remaining: 1m 35s
19:	learn: 0.6710420	to

In [29]:
pred = cat_final.predict(X_val)

In [30]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       1.00      0.69      0.81    986635
           1       0.04      0.89      0.07     13365

    accuracy                           0.69   1000000
   macro avg       0.52      0.79      0.44   1000000
weighted avg       0.98      0.69      0.80   1000000



In [31]:
print(f1_score(y_val, pred))

0.07064514012727457


In [25]:
cat_final.save_model("models/v1/catboost_final_f1")

catboost with f1 metric
    
              precision    recall  f1-score   support

           0       1.00      0.69      0.81    986635
           1       0.04      0.89      0.07     13365

    accuracy                           0.69   1000000
   macro avg       0.52      0.79      0.44   1000000
weighted avg       0.98      0.69      0.80   1000000

0.0706576185478485

In [12]:
X_train_op, X_valid_op, y_train_op, y_valid_op = train_test_split(X_train.loc[:, best_columns], y_train, test_size=0.25, stratify=y_train)

In [13]:
class_weights = get_class_weights(y_train_op)

In [14]:
class_weights

{0: 0.5067730214314311, 1: 37.41114852225963}

In [15]:
def optimize_lightgbm(trial):
    param_grid = {
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000, step=100),
    }


    lgb_op = LGBMClassifier(**param_grid, is_unbalance=True)
    model = lgb_op.fit(X_train_op, y_train_op.values.ravel())

    val_pred = model.predict(X_valid_op)

    return f1_score(y_valid_op, val_pred)

In [16]:
best_lgb_hypers_with_cw_study = optuna.create_study(direction="maximize")

[I 2025-02-07 14:53:27,032] A new study created in memory with name: no-name-0eb3ccc1-f55e-4c13-a842-546e13cbb183


In [17]:
best_params = best_lgb_hypers_with_cw_study.optimize(
    optimize_lightgbm, n_trials=25, n_jobs=-1, show_progress_bar=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 80192, number of negative: 5919808
[LightGBM] [Info] Number of positive: 80192, number of negative: 5919808
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.167559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.013365 -> initscore=-4.301636
[LightGBM] [Info] Start training from score -4.301636
[LightGBM] [Info] Number of positive: 80192, number of negative: 5919808
[LightGBM] [Info] Number of positive: 80192, number of negative: 5919808
[LightGBM] [Info] Number of positive: 80192, number of negative: 5919808
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.162223 seconds.
You can set `force_row_wise=true` to remove 

In [20]:
best_lgb_hypers_with_cw_study.best_params

{'max_depth': 8,
 'min_child_samples': 50,
 'learning_rate': 0.003026160590703767,
 'n_estimators': 500}

{'max_depth': 8,
 'min_child_samples': 50,
 'learning_rate': 0.003026160590703767,
 'n_estimators': 500}

In [22]:
lgb = LGBMClassifier(**best_lgb_hypers_with_cw_study.best_params, is_unbalance=True)
model = lgb.fit(X_train.loc[:, best_columns], y_train.values.ravel())

[LightGBM] [Info] Number of positive: 106922, number of negative: 7893078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130
[LightGBM] [Info] Number of data points in the train set: 8000000, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.013365 -> initscore=-4.301642
[LightGBM] [Info] Start training from score -4.301642


In [55]:
joblib.dump(model, "models/v1/lightgbm_model_final.pkl")

['models/v1/lightgbm_model_final.pkl']

In [51]:
pred = model.predict(X_train.loc[:, best_columns])

In [52]:
print(classification_report(y_train, pred))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88   7893078
           1       0.04      0.74      0.08    106922

    accuracy                           0.78   8000000
   macro avg       0.52      0.76      0.48   8000000
weighted avg       0.98      0.78      0.87   8000000



In [54]:
f1_score(y_train, pred)

0.08344988765667156

In [46]:
pred = model.predict(X_val.loc[:, best_columns])

In [47]:
pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int32)

In [49]:
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88    986635
           1       0.04      0.74      0.08     13365

    accuracy                           0.78   1000000
   macro avg       0.52      0.76      0.48   1000000
weighted avg       0.98      0.78      0.87   1000000



In [50]:
f1_score(y_val, pred)

0.0834572222269418

In [63]:
X_, X_demo, y_train, y_demo = train_test_split(X_test, y_test, test_size=0.1, random_state=SEED, stratify=y_test)

In [66]:
presentation_df = pd.concat([X_demo, y_demo], axis=1)

In [72]:
presentation_df[best_columns]

Unnamed: 0,age,service_type,overdue_payments,auto_payment,avg_top_up_count,churn
7244916,26,1,0,1.0,48,0
5244478,33,3,1,0.0,0,0
3439599,18,3,2,1.0,0,0
4863854,23,1,0,0.0,28,1
8990081,41,2,0,1.0,0,0
...,...,...,...,...,...,...
1211252,27,1,0,0.0,61,0
3806615,31,2,1,0.0,0,0
6158346,19,1,0,0.0,51,0
747665,45,2,0,0.0,0,0


In [None]:
# presentation_df.to_csv("dataset/processed/presentation.csv", index=False)