# XGBoost Base Model

In [None]:
# pip install xgboost
# pip install --upgrade xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import mlflow
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope


# warnings.filterwarnings("ignore")

# Read Data and Set Up MLflow

In [None]:
# Raise pandas' column display limit to 500 so wide frames are not truncated
pd.set_option("display.max_columns", 500)

# Load the churn dataset from ../data/churn.txt into a DataFrame
churn = pd.read_csv(filepath_or_buffer="../data/churn.txt")

In [None]:
# (rows, columns) of the freshly loaded dataset
churn.shape

# Data Analysis

By modern standards, it’s a relatively small dataset, with only 5,000 records, where each record uses 21 attributes to describe the profile of a customer of an unknown US mobile operator. The attributes are:

- `State`: the US state in which the customer resides, indicated by a two-letter abbreviation; for example, OH or NJ
- `Account Length`: the number of days that this account has been active
- `Area Code`: the three-digit area code of the corresponding customer’s phone number
- `Phone`: the remaining seven-digit phone number
- `Int’l Plan`: whether the customer has an international calling plan: yes/no
- `VMail Plan`: whether the customer has a voice mail feature: yes/no
- `VMail Message`: the average number of voice mail messages per month
- `Day Mins`: the total number of calling minutes used during the day
- `Day Calls`: the total number of calls placed during the day
- `Day Charge`: the billed cost of daytime calls
- `Eve Mins`, `Eve Calls`, `Eve Charge`: the minutes used, number of calls placed, and billed cost for calls placed during the evening
- `Night Mins`, `Night Calls`, `Night Charge`: the minutes used, number of calls placed, and billed cost for calls placed during nighttime
- `Intl Mins`, `Intl Calls`, `Intl Charge`: the minutes used, number of calls placed, and billed cost for international calls
- `CustServ Calls`: the number of calls placed to Customer Service
- `Churn?`: whether the customer left the service: true/false

The last attribute, `Churn?`, is known as the target attribute: the attribute that we want the ML model to predict.  Because the target attribute is binary, our model will be performing binary prediction, also known as binary classification.

Let's begin exploring the data:

In [None]:
# Frequency tables for each categorical feature
for column in churn.select_dtypes(include=["object"]).columns:
    display(pd.crosstab(index=churn[column], columns="% observations", normalize="columns"))

# Histograms for each numeric features
display(churn.describe())
%matplotlib inline
hist = churn.hist(bins=30, sharey=True, figsize=(10, 10))

We can see immediately that:
- `State` appears to be quite evenly distributed.
- `Phone` takes on too many unique values to be of any practical use.  It's possible that parsing out the prefix could have some value, but without more context on how these are allocated, we should avoid using it.
- Most of the numeric features are surprisingly nicely distributed, with many showing bell-like `gaussianity`.  `VMail Message` is a notable exception (and `Area Code` showing up as a feature we should convert to non-numeric).

In [None]:
# Drop the phone number column and store the area code as an object (non-numeric) column
churn = churn.drop(columns="Phone").astype({"Area Code": object})


Next let's look at the relationship between each of the features and our target variable.

In [None]:
# Categorical features: share of each level within each value of the target
object_columns = churn.select_dtypes(include=["object"]).columns
for column in object_columns:
    if column == "Churn?":
        continue  # do not cross-tabulate the target against itself
    display(pd.crosstab(index=churn[column], columns=churn["Churn?"], normalize="columns"))

# Remaining (non-object) features: one histogram per target value, printed under the feature name
numeric_columns = churn.select_dtypes(exclude=["object"]).columns
for column in numeric_columns:
    print(column)
    hist = churn[[column, "Churn?"]].hist(by="Churn?", bins=30)
    plt.show()

We see several features that essentially have 100% correlation with one another.  Including these feature pairs in some machine learning algorithms can create catastrophic problems, while in others it will only introduce minor redundancy and bias.  Let's remove one feature from each of the highly correlated pairs: `Day Charge` from the pair with `Day Mins`, `Eve Charge` from the pair with `Eve Mins`, `Night Charge` from the pair with `Night Mins`, and `Intl Charge` from the pair with `Intl Mins`:

In [None]:
# Remove the four charge columns; the matching minutes columns are kept
redundant_charges = ["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"]
churn = churn.drop(columns=redundant_charges)

In [None]:
# Preview the first rows after the column drops above
churn.head()

# Train Model

In [None]:
# Features: every column except the target
X = churn.drop(columns="Churn?")

# Target: selected with double brackets, so it stays a one-column DataFrame
y = churn[["Churn?"]]

In [None]:
# Names of every non-numeric feature column
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast all of them to the pandas category dtype in one pass
X = X.astype({name: "category" for name in cats})

In [None]:
# Check the dtypes of the feature columns after the category cast
X.dtypes

In [None]:
# Encode the target as integers: 'True.' -> 1, 'False.' -> 0.
# `assign` builds a new frame instead of writing a column into `y` (a selection
# taken from `churn`), which avoids pandas' SettingWithCopyWarning.
# The explicit cast pins the result to an integer dtype rather than relying on
# `replace` to downcast the object column; it also fails loudly here if any
# label other than 'True.' / 'False.' is present.
# Re-running the cell is safe: already-encoded 0/1 values pass through unchanged.
y = y.assign(
    **{"Churn?": y["Churn?"].replace({"True.": 1, "False.": 0}).astype("int64")}
)

In [None]:
# Check the dtype of the target column after encoding
y.dtypes

In [None]:
# Hold out a test set; the fixed random_state makes the split repeatable
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Baseline classifier: histogram tree method with categorical support enabled
# for the pandas category columns.
# The target is binary (0/1), so the evaluation metric is the binary `logloss`.
# `mlogloss` is the multiclass variant and does not match a binary objective.
model = XGBClassifier(eval_metric="logloss", tree_method="hist", enable_categorical=True)


In [None]:
# Fit the classifier on the training split
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features
y_pred = model.predict(X_test)

In [None]:
# Inspect the predictions returned by the classifier
y_pred

In [None]:
# Accuracy of the classifier's predictions against the test labels
accuracy = accuracy_score(y_test, y_pred)
accuracy

**Using DMatrix**

In [None]:
# Wrap each split in an xgb.DMatrix for the native training API,
# with categorical support enabled for the category-dtype columns
dtrain = xgb.DMatrix(
    data=X_train,
    label=y_train,
    enable_categorical=True,
)
dtest = xgb.DMatrix(
    data=X_test,
    label=y_test,
    enable_categorical=True,
)

In [None]:
# Capture the classifier's parameter dictionary (as reported by get_params) and print it
params = model.get_params()
print(params)

In [None]:
# Train a booster through the native API with the same settings as the sklearn model.
# `get_xgb_params()` returns only the booster parameters. `get_params()` also
# carries sklearn-wrapper arguments (e.g. `n_estimators`, `enable_categorical`)
# that `xgb.train` does not use.
# No `num_boost_round` is passed, so `xgb.train` runs its default number of rounds.
params = model.get_xgb_params()
booster = xgb.train(
    params=params,
    dtrain=dtrain,
)

In [None]:
# Score the test set with the booster and threshold the outputs at 0.5 into 0/1 labels
y_pred_prob = booster.predict(dtest)
y_pred = np.where(y_pred_prob >= 0.5, 1, 0)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Negated accuracy: a minimiser that lowers this value raises the accuracy
loss = -accuracy

In [None]:
# Accuracy of the booster's thresholded predictions on the test split
print(accuracy)

In [None]:
# Inspect the 0/1 labels derived from the booster's outputs
y_pred

In [None]:
# specify parameters via map
# The objective is binary:logistic, so the matching metric is the binary
# `logloss`, not the multiclass `mlogloss`.
# NOTE(review): neither `params` nor `num_round` is read by the cells visible in
# this notebook. The hyperopt search passes its own sampled parameters to
# `objective`. Confirm whether these are still needed.
params = {
    'max_depth': None,
    'eval_metric': 'logloss',
    'tree_method': "hist",
    'eta': 1,
    'objective': 'binary:logistic',
}
num_round = 2

# train model
def objective(params):
    """Hyperopt objective: train one booster with ``params`` and score it on the test split.

    Each call opens its own MLflow run and logs ``params`` and the resulting
    accuracy explicitly. Training runs for at most 1000 boosting rounds and
    stops early once the ``test`` metric has not improved for 50 rounds.

    Relies on the notebook-level ``dtrain``, ``dtest`` and ``y_test``.

    Args:
        params: Mapping of XGBoost training parameters, passed unchanged to
            ``xgb.train`` and ``mlflow.log_params``.

    Returns:
        dict with ``loss`` (the negative accuracy, so that minimising the loss
        maximises accuracy) and ``status`` set to ``STATUS_OK``.

    NOTE(review): ``dtest`` drives early stopping and is also the set the
    accuracy is computed on, so the reported score is optimistic. Consider a
    separate validation split.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dtest, "test")],
            early_stopping_rounds=50,
        )

        # xgb.train returns the model from the last round, not the best one.
        # Restrict prediction to the trees up to the best early-stopping
        # iteration so the rounds trained after the optimum are not used.
        best_rounds = booster.best_iteration + 1
        y_pred_prob = booster.predict(dtest, iteration_range=(0, best_rounds))
        y_pred = (y_pred_prob >= 0.5).astype(int)
        accuracy = accuracy_score(y_test, y_pred)

        # Parameters and the metric are logged explicitly, so every trial
        # records the same information regardless of trial order.
        mlflow.log_metric("accuracy", accuracy)

        # Define the loss as the negative value of accuracy for minimization
        loss = -accuracy

        return {'loss': loss, 'status': STATUS_OK}

In [None]:
# Hyperparameter ranges handed to hyperopt. `max_depth` is sampled in steps of 1
# between 4 and 100 and cast to int. The bounds given to `hp.loguniform` are
# exponents (e.g. learning_rate is drawn between exp(-3) and exp(0)).
# `objective` and `seed` are fixed values, not searched.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    objective='binary:logistic',
    seed=42,
)

# Run 50 TPE-guided evaluations of `objective`.
# NOTE(review): no `rstate` is passed to fmin, so the sampled trials may differ
# between runs unless hyperopt is seeded elsewhere. Confirm.
trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

# Selecting the best model

In [None]:
# Hyperparameters of the selected model.
# NOTE(review): presumably copied by hand from the best trial of an earlier
# hyperopt search. Confirm, and refresh them if the search is re-run.
best_params = {
    'learning_rate': 0.2611886716276454,
    'max_depth': 39,
    'min_child_weight': 4.490391995734931,
    'objective': 'binary:logistic',
    'reg_alpha': 0.044567672488398144,
    'reg_lambda': 0.11968534468462336,
    'seed': 42,
}

# Train and evaluate inside an explicit MLflow run so the parameters and the
# accuracy are recorded together and the run is closed when the block exits.
# Logging without an explicit run leaves an implicitly created run active.
with mlflow.start_run():
    mlflow.set_tag("model", "xgboost")
    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dtest, "test")],
        early_stopping_rounds=50,
    )

    # make prediction
    # xgb.train returns the model from the last round, not the best one, so
    # restrict prediction to the trees up to the best early-stopping iteration.
    best_rounds = booster.best_iteration + 1
    y_pred_prob = booster.predict(dtest, iteration_range=(0, best_rounds))
    y_pred = (y_pred_prob >= 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

# Define the loss as the negative value of accuracy for minimization
loss = -accuracy

   
