## Libraries and datasets

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder



In [3]:
X = pd.read_csv('X.csv')
X_val = pd.read_csv('X_val.csv')
y = pd.read_csv('y.csv')
y_val = pd.read_csv('y_val.csv')
test = pd.read_csv('test.csv')

In [4]:
cols = ['REGION','TOP_PACK', 'TENURE']
for item in cols:
    X[item] = X[item].astype('category')
    X_val[item] = X_val[item].astype('category')

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1723238 entries, 0 to 1723237
Data columns (total 16 columns):
 #   Column          Dtype   
---  ------          -----   
 0   REGION          category
 1   TENURE          category
 2   MONTANT         float64 
 3   FREQUENCE_RECH  float64 
 4   REVENUE         float64 
 5   ARPU_SEGMENT    float64 
 6   FREQUENCE       float64 
 7   DATA_VOLUME     float64 
 8   ON_NET          float64 
 9   ORANGE          float64 
 10  TIGO            float64 
 11  ZONE1           float64 
 12  ZONE2           float64 
 13  REGULARITY      int64   
 14  TOP_PACK        category
 15  FREQ_TOP_PACK   float64 
dtypes: category(3), float64(12), int64(1)
memory usage: 177.5 MB


In [6]:
y = y['CHURN']
y_val = y_val['CHURN']

In [7]:
X['TENURE'] = X['TENURE'].astype('Int32')
X_val['TENURE'] = X_val['TENURE'].astype('Int32')

In [8]:
X['TENURE'] = X['TENURE'].replace({pd.NA: np.nan})
X_val['TENURE'] = X_val['TENURE'].replace({pd.NA: np.nan})

## LightGBM

In [41]:
def eval_metrics(y_test, y_pred):
    print('Precision Score: ', round(precision_score(y_val, y_pred), 3))
    print('Recall Score: ', round(recall_score(y_val, y_pred), 3))
    print('F1 Score: ', round(f1_score(y_val, y_pred), 3))
    print('Accuracy Score: ', round(accuracy_score(y_val, y_pred), 3))
    print('ROC AUC: ', round(roc_auc_score(y_val, y_pred), 3))

In [8]:
param_dist = {'max_depth': [25, 50, 75],
              'learning_rate': [0.01,0.05,0.1],
              'num_leaves': [300,900,1200],
              'n_estimators': [200],
              'min_data_in_leaf': [500, 2000, 5000]
             }
lg = lgb.LGBMClassifier(silent=False)
grid_search_lg = HalvingGridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=False)
grid_search_lg.fit(X, y)



[LightGBM] [Info] Number of positive: 323274, number of negative: 1399964
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2563
[LightGBM] [Info] Number of data points in the train set: 1723238, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187597 -> initscore=-1.465702
[LightGBM] [Info] Start training from score -1.465702


HalvingGridSearchCV(cv=3, estimator=LGBMClassifier(silent=False), n_jobs=-1,
                    param_grid={'learning_rate': [0.01, 0.05, 0.1],
                                'max_depth': [25, 50, 75],
                                'min_data_in_leaf': [500, 2000, 5000],
                                'n_estimators': [200],
                                'num_leaves': [300, 900, 1200]},
                    scoring='roc_auc', verbose=False)

In [9]:
grid_search_lg.best_score_

0.9310067420674458

In [10]:
grid_search_lg.best_params_

{'learning_rate': 0.01,
 'max_depth': 50,
 'min_data_in_leaf': 500,
 'n_estimators': 200,
 'num_leaves': 1200}

In [10]:
lg_train = lgb.Dataset(X, label=y)

In [9]:
params = {'learning_rate': 0.01,
 'max_depth': 50,
 'min_data_in_leaf': 500,
 'n_estimators': 200,
 'num_leaves': 1200}

In [11]:
cate_features_name = ['REGION','TOP_PACK', 'TENURE']
model2 = lgb.train(params, lg_train, categorical_feature = cate_features_name)

New categorical_feature is ['REGION', 'TENURE', 'TOP_PACK']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2562
[LightGBM] [Info] Number of data points in the train set: 1723238, number of used features: 16
[LightGBM] [Info] Start training from score 0.187597


In [36]:
lgb_ypred = model2.predict(X_val)

In [39]:
lgb_ypred = np.where(lgb_ypred > 0.5, 1, 0)

In [43]:
eval_metrics(y_val, lgb_ypred)

Precision Score:  0.709
Recall Score:  0.595
F1 Score:  0.647
Accuracy Score:  0.879
ROC AUC:  0.77
