The two most popular classification objectives are:

    binary:logistic - binary classification (the target contains only two classes, e.g., cat or dog)

    multi:softprob - multi-class classification (more than two classes in the target, e.g., apple/orange/banana)


In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import warnings

## Column definitions

**start_days** is the number of days since the first session

**created_date** is the date of a contact's first session

A **session** is a number of pageviews within a 30-minute window,

which means that if you stop navigating for 30 minutes and then start again, it counts as a second session

In [15]:
# Name of the column the model is trained to predict.
target = "paying"

# Read the segments export (the file was generated for 'session_59d').
# Flag encoding used in the file:
#   1 -> the entity is in the segment at the end of the period
#   0 -> the entity is not in the segment at the end of the period,
#        or has never been in it
df = pd.read_csv(
    "/vagrant/ai_random_forest_py/contacts/202402070946/segments.csv",
    low_memory=False,
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,created_date,id,segm_1,segm_2,segm_3,segm_4,segm_5,segm_6,segm_7,segm_8,...,segm_210,segm_211,segm_212,segm_213,segm_214,segm_215,segm_216,segm_217,segm_218,paying
0,2019-06-18,1,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2019-06-18,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-06-18,3,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2019-06-18,4,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2019-06-18,5,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Remove the date and identifier columns so they do not end up in the
# feature list that is derived from df.columns.
# `errors="ignore"` keeps the cell idempotent: running it again after the
# columns are already gone is a no-op, whereas `del df[...]` raised KeyError.
df = df.drop(columns=["created_date", "id"], errors="ignore")

df.head()

Unnamed: 0,segm_1,segm_2,segm_3,segm_4,segm_5,segm_6,segm_7,segm_8,segm_9,segm_10,...,segm_210,segm_211,segm_212,segm_213,segm_214,segm_215,segm_216,segm_217,segm_218,paying
0,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Build the list of feature column names: every column except the target.
# Selecting by name (rather than "all columns but the last one") keeps the
# list correct even if the column order of the input file changes, and
# raises KeyError right here if the target column is missing.
features = df.columns.drop(target)

# View features
print(f'Features: {features}')

df.head()

Features: Index(['segm_1', 'segm_2', 'segm_3', 'segm_4', 'segm_5', 'segm_6', 'segm_7',
       'segm_8', 'segm_9', 'segm_10',
       ...
       'segm_209', 'segm_210', 'segm_211', 'segm_212', 'segm_213', 'segm_214',
       'segm_215', 'segm_216', 'segm_217', 'segm_218'],
      dtype='object', length=218)


Unnamed: 0,segm_1,segm_2,segm_3,segm_4,segm_5,segm_6,segm_7,segm_8,segm_9,segm_10,...,segm_210,segm_211,segm_212,segm_213,segm_214,segm_215,segm_216,segm_217,segm_218,paying
0,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split: 30% of the rows go to the test partition and the
# ratio of the 'paying' classes is kept the same in both partitions.
# The whole frame (target column included) is split here; the feature columns
# are selected from x_train / x_test in a later cell.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    df['paying'],
    test_size=0.3,
    random_state=2020,
    stratify=df['paying'],
)

In [6]:
# Overall row count, followed by the paying / non-paying breakdown of each
# partition (x_train / x_test still carry the 'paying' column at this point).
print('data: ', len(df))

print('Traning data')
print('Paying True: ', len(x_train.loc[x_train['paying'] == True]))
print('Paying False: ', len(x_train.loc[x_train['paying'] == False]))

print('')

print('Testing data')
print('Paying True: ', len(x_test.loc[x_test['paying'] == True]))
print('Paying False', len(x_test.loc[x_test['paying'] == False]))

data:  2096
Traning data
Paying True:  25
Paying False:  1442

Testing data
Paying True:  11
Paying False 618


We have **Unbalanced Data**

An unbalanced dataset is one in which the target variable has more observations in one specific class than the others.

We will deal with this imbalance in another script


In [7]:
# Keep only the feature columns in both partitions (this drops the target
# column that was still present after the split).
x_train, x_test = x_train[features], x_test[features]

In [8]:
# Convert the label Series into (n_samples, 1) column vectors.
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell is harmless; `.to_numpy()` raised AttributeError on the second run
# because y_train / y_test were already ndarrays by then.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [9]:
# Display the training feature frame (feature columns only; pandas truncates the rendering).
x_train

Unnamed: 0,segm_1,segm_2,segm_3,segm_4,segm_5,segm_6,segm_7,segm_8,segm_9,segm_10,...,segm_209,segm_210,segm_211,segm_212,segm_213,segm_214,segm_215,segm_216,segm_217,segm_218
1165,0,0,0,1,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1953,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
862,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
106,0,0,0,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1925,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1867,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2041,0,0,0,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
141,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1341,0,0,0,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
import xgboost as xgb

# Wrap the train / test partitions in XGBoost's DMatrix container.
dtrain_clf = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(x_test, label=y_test, enable_categorical=True)

# Sanity checks: the labels and dimensions stored in the DMatrix should match
# what was passed in.
train_labels = dtrain_clf.get_label()
print('y_train lenght', len(y_train))
print('label:', train_labels)
print('len label:', len(train_labels))
print('the number of columns (features)', dtrain_clf.num_col())
print('the number of rows', dtrain_clf.num_row())

# Toy example: a 4x3 frame with random 0/1 labels, wrapped in a DMatrix.
data = pd.DataFrame(np.arange(12).reshape((4, 3)), columns=list('abc'))
print(data)

label = pd.DataFrame(np.random.randint(2, size=4))
print(label)

dtrain = xgb.DMatrix(data, label=label)
dtrain

y_train lenght 1467
label: [0. 0. 0. ... 0. 0. 0.]
len label: 1467
the number of columns (features) 218
the number of rows 1467
   a   b   c
0  0   1   2
1  3   4   5
2  6   7   8
3  9  10  11
   0
0  0
1  1
2  1
3  1


<xgboost.core.DMatrix at 0x7f2a839fa710>

In [11]:
# # Define hyperparameters
# params = {"objective": "binary:logistic", "tree_method": "hist"}

# n = 100
# model = xgb.train(
#    params=params,
#    dtrain=dtrain_clf,
#    num_boost_round=n
# )

# y_pred = model.predict(dtest_clf)

# # It returns an array of floats
# # https://blog.clearbrain.com/posts/how-to-predict-yesno-outcomes-using-logistic-regression
# y_pred
# # y_test

In [12]:
# References for the scikit-learn style interface:
# https://xgboost.readthedocs.io/en/latest/python/python_intro.html#scikit-learn-interface
# https://xgboost.readthedocs.io/en/latest/python/sklearn_estimator.html

# Histogram-based tree construction, with early stopping after 5 rounds.
clf = xgb.XGBClassifier(
    early_stopping_rounds=5,
    tree_method="hist",
)

# Train the classifier; the test partition is the evaluation set that drives
# early stopping.
# NOTE(review): the same x_test / y_test are used for scoring in later cells,
# so the reported score is likely optimistic - consider a separate validation
# split for early stopping.
clf.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
)

[0]	validation_0-logloss:0.12989
[1]	validation_0-logloss:0.10199
[2]	validation_0-logloss:0.08166
[3]	validation_0-logloss:0.06766
[4]	validation_0-logloss:0.05783
[5]	validation_0-logloss:0.04971
[6]	validation_0-logloss:0.04412
[7]	validation_0-logloss:0.04032
[8]	validation_0-logloss:0.03770
[9]	validation_0-logloss:0.03615
[10]	validation_0-logloss:0.03408
[11]	validation_0-logloss:0.03329
[12]	validation_0-logloss:0.03258
[13]	validation_0-logloss:0.03174
[14]	validation_0-logloss:0.03142
[15]	validation_0-logloss:0.03106
[16]	validation_0-logloss:0.03027
[17]	validation_0-logloss:0.03032
[18]	validation_0-logloss:0.03104
[19]	validation_0-logloss:0.03084
[20]	validation_0-logloss:0.03086
[21]	validation_0-logloss:0.03061


In [13]:
# Predict on the held-out features and confirm that the number of predictions
# equals the number of test labels.
y_pred = clf.predict(x_test)
print(y_pred.shape[0])
print(y_test.size)


629
629


### R^2 (coefficient of determination) regression score function.

In [14]:
from sklearn.metrics import r2_score

# Coefficient of determination between the flattened test labels and the
# predictions.
# NOTE(review): R^2 is a regression score and y_pred comes from clf.predict;
# given the class imbalance, classification metrics may be more informative.
r2 = r2_score(y_true=y_test.ravel(), y_pred=y_pred)
print('R^2: ', r2)

R^2:  0.35230950279493956


Let's see if we can improve the coefficient of determination.