In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google.colab import drive

In [2]:
%matplotlib inline
sns.set_style('darkgrid')
warnings.filterwarnings("ignore")

In [3]:
# Read the training set from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Train.csv")
df.head(n=5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,NO,4,,,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,NO,17,On-net 1000F=10MilF;10d,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,NO,62,"Data:1000F=5GB,7d",11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,NO,11,Mixt 250F=Unlimited_call24H,2.0,0


## Data Preprocessing

### Drop Columns

In [4]:
# user_id is just a unique identifier.
df.drop(columns=['user_id'], inplace=True)

In [5]:
# All data points have the same MRG value.
df.drop(columns=['MRG'], inplace=True)

### Transforming Column "TENURE" into numerical values

In [6]:
# Frequency of each tenure band before encoding.
df.loc[:, 'TENURE'].value_counts()

Unnamed: 0_level_0,count
TENURE,Unnamed: 1_level_1
K > 24 month,2043201
I 18-21 month,45278
H 15-18 month,26006
G 12-15 month,14901
J 21-24 month,12725
F 9-12 month,9328
E 6-9 month,1839
D 3-6 month,770


In [7]:
# Each tenure band is encoded by the lower bound (in months) that appears in its label.
mapping_dict = {
    'D 3-6 month': 3,
    'E 6-9 month': 6,
    'F 9-12 month': 9,
    'G 12-15 month': 12,
    'H 15-18 month': 15,
    'I 18-21 month': 18,
    'J 21-24 month': 21,
    'K > 24 month': 24,
}
# Direct dict lookup per value: an unexpected label raises KeyError.
df['TENURE'] = df['TENURE'].map(mapping_dict.__getitem__)

In [8]:
# Frequency of each encoded tenure value.
df.loc[:, 'TENURE'].value_counts()

Unnamed: 0_level_0,count
TENURE,Unnamed: 1_level_1
24,2043201
18,45278
15,26006
12,14901
21,12725
9,9328
6,1839
3,770


### New Column unlimited_pack

1: "TOP_PACK" contains the substring "unlimited" (case-insensitive)

0: Otherwise

In [9]:
# 1 where TOP_PACK mentions "unlimited" in any letter case, 0 otherwise (missing packs count as 0).
mentions_unlimited = df['TOP_PACK'].str.contains("unlimited", case=False, na=False)
df['unlimited_pack'] = mentions_unlimited.astype('int64')

In [10]:
# Correlation of the new flag with the target.
df[['unlimited_pack', 'CHURN']].corr().loc[:, 'CHURN']

Unnamed: 0,CHURN
unlimited_pack,-0.161632
CHURN,1.0


### New Column PACK_POP

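-1: TOP_PACK is missing

1: first quartile (count < 25th percentile)

2: second quartile (25th percentile ≤ count < 50th percentile)

3: third quartile (50th percentile ≤ count < 75th percentile)

4: fourth quartile (count ≥ 75th percentile)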

**Quartiles** of the value_counts of TOP_PACK column

We expect users of popular packages to churn less often than users of rarely chosen packages.

In [11]:
# Distribution of how often each distinct pack occurs.
df.loc[:, 'TOP_PACK'].value_counts().describe()

Unnamed: 0,count
count,140.0
mean,8938.957143
std,33491.194518
min,1.0
25%,4.5
50%,112.0
75%,1355.5
max,317802.0


In [12]:
# Popularity of a pack = number of rows that list it as TOP_PACK.
val_counts = df['TOP_PACK'].value_counts()
# Dedicated alias so pack_pop() does not depend on the generic `val_counts` name.
pack_counts = val_counts

# Quartile boundaries of the per-pack counts; these are the same values that
# `describe()` reports as 25% / 50% / 75%, computed in a single pass.
low, mid, high = pack_counts.quantile([0.25, 0.50, 0.75])

def pack_pop(pack):
  """Return the popularity tier of a single TOP_PACK value.

  Args:
    pack: a TOP_PACK label, or a missing value (NaN / None / pd.NA).

  Returns:
    -1 if `pack` is missing, otherwise 1-4 according to the quartile of the
    pack's row count (1 = rarest packs, 4 = most common packs).

  Raises:
    KeyError: if `pack` is not missing and does not occur in `pack_counts`.
  """
  # pd.isna covers every missing-value representation; an identity test
  # against np.nan only matches that one specific object.
  if pd.isna(pack):
    return -1

  val = pack_counts[pack]

  if val < low:
    return 1
  elif val < mid:
    return 2
  elif val < high:
    return 3
  else:
    return 4

# Score every distinct pack once, then broadcast the tiers to the rows.
# Rows with a missing TOP_PACK are not in the mapping and become -1.
pack_tiers = {pack: pack_pop(pack) for pack in pack_counts.index}
df["PACK_POP"] = df['TOP_PACK'].map(pack_tiers).fillna(-1).astype('int64')

In [13]:
# Correlation of the popularity tier with the target.
df[['PACK_POP', 'CHURN']].corr().loc[:, 'CHURN']

Unnamed: 0,CHURN
PACK_POP,-0.444462
CHURN,1.0


### NA>=10 Column

In [14]:
# Share of churned customers in the whole training set.
len(df[df['CHURN'].eq(1)]) / df.shape[0]

0.1875473527052322

In [15]:
# Churn rate among rows with at least 10 missing values.
# Stored under its own name rather than reusing `val_counts`, which holds the
# TOP_PACK counts computed for the pack-popularity feature.
sparse_churn_counts = df[df.isna().sum(axis=1) >= 10]['CHURN'].value_counts()
sparse_churn_counts[1] / (sparse_churn_counts[0] + sparse_churn_counts[1])

np.float64(0.4525901246103)

In [16]:
# True for rows that have at least 10 missing values, False otherwise.
df['NA>=10'] = df.isna().sum(axis=1).ge(10)

In [17]:
# Correlation of the sparse-row flag with the target.
df[['NA>=10', 'CHURN']].corr().loc[:, 'CHURN']

Unnamed: 0,CHURN
NA>=10,0.494754
CHURN,1.0


In [18]:
# Remove the boolean flag again.
df.drop(columns=["NA>=10"], inplace=True)

### Null Value Counts per Row

In [19]:
# Number of missing values in each row.
df["NULL_COUNTS"] = df.isnull().sum(axis="columns")

In [20]:
# Correlation of the per-row null count with the target.
df[['NULL_COUNTS', 'CHURN']].corr().loc[:, 'CHURN']

Unnamed: 0,CHURN
NULL_COUNTS,0.530775
CHURN,1.0


In [21]:
# Preview the frame with the engineered columns.
df.head(n=5)

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN,unlimited_pack,PACK_POP,NULL_COUNTS
0,FATICK,24,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,54,On net 200F=Unlimited _call24H,8.0,0,1,4,0
1,,18,,,,,,,,,,,,4,,,1,0,-1,14
2,,24,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,17,On-net 1000F=10MilF;10d,1.0,0,0,4,4
3,DAKAR,24,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,62,"Data:1000F=5GB,7d",11.0,0,0,4,2
4,DAKAR,24,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,11,Mixt 250F=Unlimited_call24H,2.0,0,1,4,4


### Numerical Feature Brute Forcing

1. Log transform of each numerical feature
2. Region-Wise feature means of all numerical features
3. Interaction(+, -, *, /) between *High Importance Features*

In [22]:
# Numerical features with the highest feature importance (XGBoost).
num_features_high = [
    'REGULARITY',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE_RECH',
    'MONTANT',
    'ON_NET',
]

# Every numeric column except the target.
num_features_all = list(df.select_dtypes(include=["number"]).columns)
num_features_all.remove("CHURN")

In [23]:
def brute_force_features(df, feature_list_all, feature_list_high, epsilon=1e-6):
  """Return a new frame: `df` plus brute-force numerical features.

  The input frame is not modified and its row index is kept on the result.
  Columns are appended in this order:

  1. ``REGION_MEAN_<f>`` for every f in `feature_list_all`: mean of f over the
     rows that share the row's REGION (NaN where REGION is missing).
  2. ``log(<f>)`` for every f in `feature_list_all`: ``log(f + epsilon)``.
     Values below ``-epsilon`` produce NaN.
  3. ``a+b``, ``a-b``, ``a*b``, ``a/b`` for every unordered pair (a, b) of
     `feature_list_high`, a preceding b in the list; the quotient is
     ``a / (b + epsilon)``.

  Args:
    df: frame holding a 'REGION' column and every listed feature column.
    feature_list_all: numeric column names for the region means and logs.
    feature_list_high: numeric column names to combine pairwise.
    epsilon: small offset that keeps log(0) and division by zero finite.

  Returns:
    A new DataFrame with the original columns followed by the new features.
  """
  # Region-wise means, already carrying their final column names.
  region_means = (
      df[list(feature_list_all) + ['REGION']]
      .groupby('REGION')
      .mean()
      .add_prefix('REGION_MEAN_')
  )
  # Joining the key column against the right-hand index keeps the caller's row
  # index; rows with a missing REGION match nothing and receive NaN.
  data = df.merge(region_means, left_on='REGION', right_index=True, how='left')

  # New columns are collected first and attached in one concat, so the frame is
  # not fragmented by dozens of single-column inserts.
  new_cols = {}

  # log features
  for feature in feature_list_all:
    new_cols['log(' + feature + ')'] = np.log(data[feature] + epsilon)

  # Cross features for every unordered pair of high-importance features
  for i, left in enumerate(feature_list_high):
    for right in feature_list_high[i + 1:]:
      a, b = data[left], data[right]
      new_cols[left + '+' + right] = a + b
      new_cols[left + '-' + right] = a - b
      new_cols[left + '*' + right] = a * b
      new_cols[left + '/' + right] = a / (b + epsilon)

  # A generated name that already exists overwrites that column in place.
  for name in [name for name in new_cols if name in data.columns]:
    data[name] = new_cols.pop(name)

  if new_cols:
    data = pd.concat([data, pd.DataFrame(new_cols, index=data.index)], axis=1)

  return data


In [24]:
# Build the engineered feature frame from the preprocessed data.
data = brute_force_features(
    df,
    feature_list_all=num_features_all,
    feature_list_high=num_features_high,
)

In [25]:
# Display the column labels of the engineered frame.
data.columns

Index(['REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE',
       ...
       'FREQUENCE_RECH*MONTANT', 'FREQUENCE_RECH/MONTANT',
       'FREQUENCE_RECH+ON_NET', 'FREQUENCE_RECH-ON_NET',
       'FREQUENCE_RECH*ON_NET', 'FREQUENCE_RECH/ON_NET', 'MONTANT+ON_NET',
       'MONTANT-ON_NET', 'MONTANT*ON_NET', 'MONTANT/ON_NET'],
      dtype='object', length=114)

In [26]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.metrics import make_scorer, average_precision_score
from xgboost import XGBClassifier

In [27]:
# The two text columns are removed before building the model inputs.
drop = ['REGION', 'TOP_PACK']
data.drop(columns=drop, inplace=True)

In [28]:
# Target vector and feature matrix (every column except the target).
y = data['CHURN']
X = data.drop(columns=['CHURN'])

In [29]:
# 2) Fit XGBoost with categorical support
from xgboost import XGBClassifier

# xgb_clf = XGBClassifier(
#     device='gpu',
#     tree_method="hist",              # required for categorical splits
#     enable_categorical=True,         # turns on native categorical handling
#     eval_metric="auc",
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=8,
#     subsample=0.8,
#     colsample_bytree=0.8
#     # optional: n_estimators=500, learning_rate=0.05, max_depth=8, subsample=0.8, colsample_bytree=0.8
# )
# xgb_clf.fit(X, y)

In [30]:
# # stratified k-fold cross validation
# from sklearn.model_selection import StratifiedKFold

# print(X.shape, y.shape)
# print("Percentage of churned customers in the whole dataset:", y.mean())

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# for train_index, val_index in skf.split(X, y):
#     X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[val_index]
#     break  # Just take the first fold for demonstration

# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [31]:
# import numpy as np
# import pandas as pd

# def nonfinite_report(df):
#     mask = ~np.isfinite(df.select_dtypes(include=['number']))
#     cols_with_nonfinite = mask.any(axis=0)
#     rows_with_nonfinite = mask.any(axis=1)
#     print("Columns with non-finite values:")
#     print(cols_with_nonfinite[cols_with_nonfinite].index.tolist())
#     print("Number of rows with non-finite:", rows_with_nonfinite.sum())
#     return mask

# mask_nonfinite = nonfinite_report(X)  # run before the pipeline/fit
# # Optional: peek at a few offending rows
# offenders = X[mask_nonfinite.any(axis=1)].head()


In [32]:
# # first row in X
# for i in df.columns:
#   print(df[i].max())

In [33]:
# xgb_train_clf = XGBClassifier(
#     device='gpu',
#     tree_method="hist",              # required for categorical splits
#     enable_categorical=True,         # turns on native categorical handling
#     eval_metric="auc",
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=8,
#     subsample=0.8,
#     colsample_bytree=0.8
#     # optional: n_estimators=500, learning_rate=0.05, max_depth=8, subsample=0.8, colsample_bytree=0.8
# )

# xgb_train_clf.fit(X_train, y_train)

# # print features and importance
# features = X.columns
# feature_types = X.dtypes
# importances = xgb_train_clf.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature': features, 'Type': feature_types, 'Importance': importances})
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [34]:
# feature_importance_df.head(20)

In [35]:
# from sklearn.metrics import classification_report

# # Predict hard labels
# y_pred = xgb_train_clf.predict(X_val)

# # Print the full report
# print(classification_report(y_val, y_pred, digits=4))

In [36]:
# from sklearn.metrics import roc_auc_score

# # Predict probabilities for the positive class (churn=1)
# y_prob = xgb_train_clf.predict_proba(X_val)[:, 1]

# # Calculate AUC score
# auc = roc_auc_score(y_val, y_prob)

# print(f"AUC: {auc:.4f}")

In [None]:
# # === imports ===
# import pandas as pd
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV
# from sklearn.metrics import make_scorer, average_precision_score
# from xgboost import XGBClassifier

# GPU-backed gradient-boosted trees with fixed hyperparameters.
clf = XGBClassifier(
    device='gpu',
    tree_method="hist",
    enable_categorical=True,
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8
)

# Single-step pipeline: the "clf" step name gives parameters a `clf__` prefix.
pipe = Pipeline([
    ("clf", clf),
])

# Stratified folds with a fixed seed so the splits are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "roc_auc": "roc_auc",
    "pr_auc": "average_precision",
    "accuracy": "accuracy",
}

# Folds run one after another: the estimator trains on the GPU, so parallel
# fold workers (n_jobs=-1) would all compete for the same device and each hold
# their own copy of X.
cv_results = cross_validate(
    pipe, X, y, cv=cv, scoring=scoring, n_jobs=1, return_estimator=False
)

# Report mean ± standard deviation of every metric across the folds.
print(
    f"ROC-AUC: {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}\n"
    f"PR-AUC : {cv_results['test_pr_auc'].mean():.4f} ± {cv_results['test_pr_auc'].std():.4f}\n"
    f"ACC    : {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}"
)

# # # Hyperparameter tuning for native categoricals
# # param_dist = {
# #     "clf__n_estimators": [400, 600, 800, 1000],
# #     "clf__learning_rate": [0.01, 0.03, 0.05, 0.1],
# #     "clf__max_depth": [4, 6, 8, 10]
# #     # "clf__min_child_weight": [1, 2, 5, 8],
# #     # "clf__subsample": [0.7, 0.8, 0.9, 1.0],
# #     # "clf__colsample_bytree": [0.7, 0.8, 0.9, 1.0],
# #     # "clf__gamma": [0, 1, 2],
# #     # "clf__reg_alpha": [0, 0.01, 0.1, 1],
# #     # "clf__reg_lambda": [0.5, 1, 2, 5],
# # }

# # rs = RandomizedSearchCV(
# #     estimator=pipe,
# #     param_distributions=param_dist,
# #     n_iter=30,
# #     scoring="roc_auc",
# #     cv=cv,
# #     n_jobs=-1,
# #     verbose=1,
# #     random_state=42,
# #     refit=True,
# # )

# # rs.fit(X, y)
# # print("Best ROC-AUC (CV):", rs.best_score_)
# # print("Best params:")
# # for k, v in rs.best_params_.items():
# #     print(f"  {k}: {v}")

# # best_model = rs.best_estimator_
