## Libraries and datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder



In [2]:
X = pd.read_csv('X.csv')
X_val = pd.read_csv('X_val.csv')
y = pd.read_csv('y.csv')
y_val = pd.read_csv('y_val.csv')
test = pd.read_csv('test.csv')

In [28]:
cols = ['REGION','TOP_PACK', 'TENURE']
for item in cols:
    X[item] = X[item].astype('category')
    X_val[item] = X_val[item].astype('category')

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1723238 entries, 0 to 1723237
Data columns (total 16 columns):
 #   Column          Dtype   
---  ------          -----   
 0   REGION          category
 1   TENURE          category
 2   MONTANT         float64 
 3   FREQUENCE_RECH  float64 
 4   REVENUE         float64 
 5   ARPU_SEGMENT    float64 
 6   FREQUENCE       float64 
 7   DATA_VOLUME     float64 
 8   ON_NET          float64 
 9   ORANGE          float64 
 10  TIGO            float64 
 11  ZONE1           float64 
 12  ZONE2           float64 
 13  REGULARITY      int64   
 14  TOP_PACK        category
 15  FREQ_TOP_PACK   float64 
dtypes: category(3), float64(12), int64(1)
memory usage: 177.5 MB


In [5]:
y = y['CHURN']
y_val = y_val['CHURN']

In [6]:
X['TENURE'] = X['TENURE'].astype('Int32')
X_val['TENURE'] = X_val['TENURE'].astype('Int32')

In [7]:
X['TENURE'] = X['TENURE'].replace({pd.NA: np.nan})
X_val['TENURE'] = X_val['TENURE'].replace({pd.NA: np.nan})

## LightGBM

In [8]:
param_dist = {'max_depth': [25, 50, 75],
              'learning_rate': [0.01,0.05,0.1],
              'num_leaves': [300,900,1200],
              'n_estimators': [200],
              'min_data_in_leaf': [500, 2000, 5000]
             }
lg = lgb.LGBMClassifier(silent=False)
grid_search_lg = HalvingGridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=False)
grid_search_lg.fit(X, y)



[LightGBM] [Info] Number of positive: 323274, number of negative: 1399964
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2563
[LightGBM] [Info] Number of data points in the train set: 1723238, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187597 -> initscore=-1.465702
[LightGBM] [Info] Start training from score -1.465702


HalvingGridSearchCV(cv=3, estimator=LGBMClassifier(silent=False), n_jobs=-1,
                    param_grid={'learning_rate': [0.01, 0.05, 0.1],
                                'max_depth': [25, 50, 75],
                                'min_data_in_leaf': [500, 2000, 5000],
                                'n_estimators': [200],
                                'num_leaves': [300, 900, 1200]},
                    scoring='roc_auc', verbose=False)

In [9]:
grid_search_lg.best_score_

0.9310067420674458

In [10]:
grid_search_lg.best_params_

{'learning_rate': 0.01,
 'max_depth': 50,
 'min_data_in_leaf': 500,
 'n_estimators': 200,
 'num_leaves': 1200}

In [20]:
lg_train = lgb.Dataset(X, label=y)

In [21]:
params = {'learning_rate': 0.01,
 'max_depth': 50,
 'min_data_in_leaf': 500,
 'n_estimators': 200,
 'num_leaves': 1200}

In [34]:
cate_features_name = ['REGION','TOP_PACK', 'TENURE']
model2 = lgb.train(params, lg_train, categorical_feature = cate_features_name)

TypeError: Training only accepts Dataset object

In [36]:
lgb_ypred = model2.predict(X_val)

In [39]:
lgb_ypred = np.where(lgb_ypred > 0.5, 1, 0)

In [43]:
eval_metrics(y_val, lgb_ypred)

Precision Score:  0.709
Recall Score:  0.595
F1 Score:  0.647
Accuracy Score:  0.879
ROC AUC:  0.77


In [41]:
def eval_metrics(y_test, y_pred):
    print('Precision Score: ', round(precision_score(y_val, y_pred), 3))
    print('Recall Score: ', round(recall_score(y_val, y_pred), 3))
    print('F1 Score: ', round(f1_score(y_val, y_pred), 3))
    print('Accuracy Score: ', round(accuracy_score(y_val, y_pred), 3))
    print('ROC AUC: ', round(roc_auc_score(y_val, y_pred), 3))

In [22]:
def auc2(m, train, test): 
    return (round(roc_auc_score(y_val,m.predict(train),2)))

SyntaxError: unexpected EOF while parsing (<ipython-input-22-db28e6304158>, line 2)

In [None]:
def auc2(model, train, test): 
    return (round(roc_auc_score(y_val, y_pred), 3))

In [None]:
d_train = lgb.Dataset(train, label=y_train)
params = {"max_depth": 50, "learning_rate" : 0.1, "num_leaves": 900,  "n_estimators": 300}

# Without Categorical Features
model2 = lgb.train(params, d_train)
auc2(model2, train, test)

#With Catgeorical Features
cate_features_name = ["MONTH","DAY","DAY_OF_WEEK","AIRLINE","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT"]
model2 = lgb.train(params, d_train, categorical_feature = cate_features_name)
auc2(model2, train, test)

## CatBoost

In [11]:
X['TENURE'] = X['TENURE'].replace({pd.NA: 'NaN'})
X_val['TENURE'] = X_val['TENURE'].replace({pd.NA: 'NaN'})

In [12]:
X['REGION'] = X['REGION'].replace({np.nan: 'NaN'})
X_val['REGION'] = X_val['REGION'].replace({np.nan: 'NaN'})
X['TOP_PACK'] = X['TOP_PACK'].replace({pd.NA: 'NaN'})
X_val['TOP_PACK'] = X_val['TOP_PACK'].replace({pd.NA: 'NaN'})

In [13]:
cols = ['REGION','TOP_PACK', 'TENURE']
for item in cols:
    X[item] = X[item].astype('category')

In [14]:
cat_features_index = 0, 1, 14
params = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
         'l2_leaf_reg': [1,4,9],
         'iterations': [100, 300]}
cb = CatBoostClassifier(logging_level='Silent')
cb_model = HalvingGridSearchCV(cb, params, scoring='roc_auc', cv = 3, verbose=False)
cb_model.fit(X, y, cat_features = cat_features_index)

162 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2651, in _catboost._set_features_order_data_pd_data_frame_categorical_column
  File "_catboost.pyx", line 1858, in _catboost.get_id_object_bytes_string_representation
_catboost.CatBoostError: bad object for id: 3.0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/alena/.pyenv/versions/3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/alena/.pyenv/versions/3.7.3/l

6 fits failed out of a total of 6.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "_catboost.pyx", line 2651, in _catboost._set_features_order_data_pd_data_frame_categorical_column
  File "_catboost.pyx", line 1858, in _catboost.get_id_object_bytes_string_representation
_catboost.CatBoostError: bad object for id: 3.0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/alena/.pyenv/versions/3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/alena/.pyenv/versions/3.7.3/lib/pyt

CatBoostError: Invalid type for cat_feature category for [feature_idx=1]=3.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [None]:
cb_model.best_score_

In [None]:
cb_model.best_params_

In [None]:

cat_features_index = [0,1,2,3,4,5,6]

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))

params = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
         'l2_leaf_reg': [1,4,9],
         'iterations': [300]}
cb = cb.CatBoostClassifier()
cb_model = GridSearchCV(cb, params, scoring="roc_auc", cv = 3)
cb_model.fit(train, y_train)

With Categorical features
clf = cb.CatBoostClassifier(eval_metric="AUC", depth=10, iterations= 500, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(train,y_train)
auc(clf, train, test)

With Categorical features
clf = cb.CatBoostClassifier(eval_metric="AUC",one_hot_max_size=31, \
                            depth=10, iterations= 500, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(train,y_train, cat_features= cat_features_index)
auc(clf, train, test)


## xgboost

In [34]:
encoder = LabelEncoder() 
X['REGION'] = encoder.fit_transform(X['REGION'])
X_val['REGION'] = encoder.transform(X_val['REGION'])
test['REGION'] = encoder.transform(test['REGION'])

In [42]:
X.drop(['TOP_PACK'], axis=1, inplace=True)
X_val.drop(['TOP_PACK'], axis=1, inplace=True)

In [None]:
from sklearn.model_selection import GridSearchCV

param_test1 = {
    'max_depth':range(2,7,2),
    'min_child_weight':range(1,6,2)
}

gsearch1 = GridSearchCV(estimator=model_xgb, 
                        param_grid=param_test1,
                        scoring='roc_auc',
                        n_jobs=-1,
                        cv=5)

gsearch1.fit(X_train, y_train)
svcpred = gsearch1.predict(X_test)

In [None]:
model_xgb = XGBClassifier(learning_rate=0.1,
                          n_estimators=150,
                          objective= 'binary:logistic',
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=27)