## Import tab separated data

In [8]:
import pandas as pd
from sklearn.metrics import auc, accuracy_score
import xgboost as xgb

In [2]:
data = pd.read_csv("finalDataSet.csv")

In [3]:
data.head()

Unnamed: 0,Education,Income,Kidhome,Marital_Status,NumDealsPurchases,Recency,Teenhome,Year_Birth,Succesful_cmp,Months_After_2012
0,2,0.288658,0,0,3,58,0,1957,0,9
1,2,-0.262293,1,0,2,38,1,1954,0,27
2,2,0.918137,0,2,1,26,0,1965,0,20
3,2,-1.182477,1,2,2,26,0,1984,0,26
4,4,0.295899,1,3,5,94,0,1981,0,25


Classic Model

In [4]:
X=data[['Education', 'Income', 'Kidhome', 'Marital_Status','NumDealsPurchases','Recency', 'Teenhome', 'Year_Birth', 'Months_After_2012']]
y=data['Succesful_cmp']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

In [9]:
xgb_model = xgb.XGBClassifier(colsample_bytree=0.7, grow_policy="lossguide", max_bin=255, max_depth=6, objective="reg:logistic", n_estimators=100, reg_lambda=1.875, tree_method="hist")

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 82.59%


In [14]:
from sklearn.metrics import log_loss, roc_auc_score, f1_score
predP = xgb_model.predict_proba(X_test)

print("F1-Score:", f1_score(y_test, y_pred, average='weighted'))
print("log_loss:",log_loss(y_test, predP))
print("AUC:",roc_auc_score(y_test, predP, multi_class="ovr"))

F1-Score: 0.7885061801373722
log_loss: 0.7586930014444988
AUC: 0.7955986958550166


In [15]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[353   3   0   1   3   2]
 [ 24   8   0   0   0   2]
 [  1   0   1   0   0   0]
 [ 22   0   0   5   0   1]
 [ 11   0   0   0   2   0]
 [  7   0   0   0   1   1]]
              precision    recall  f1-score   support

           0       0.84      0.98      0.91       362
           1       0.73      0.24      0.36        34
           2       1.00      0.50      0.67         2
           3       0.83      0.18      0.29        28
           4       0.33      0.15      0.21        13
           5       0.17      0.11      0.13         9

    accuracy                           0.83       448
   macro avg       0.65      0.36      0.43       448
weighted avg       0.81      0.83      0.79       448



Oversampled  Model

In [16]:
testSet = pd.read_csv("test_set.csv")
train_oversampled = pd.read_csv("train_oversampled.csv")

In [152]:
testSet.head()

Unnamed: 0,Education,Income,Kidhome,Marital_Status,NumDealsPurchases,Recency,Teenhome,Year_Birth,Months_After_2012,Succesful_cmp
0,3,-1.742351,0,2,1,88,0,1956,13,0
1,3,-1.626359,1,2,5,96,1,1960,26,0
2,2,0.869227,0,0,2,4,1,1968,22,0
3,4,0.468883,0,3,3,27,1,1977,13,1
4,4,0.030607,0,0,2,63,1,1975,12,0


In [17]:
X_train_oversampled=train_oversampled[['Education', 'Income', 'Kidhome', 'Marital_Status','NumDealsPurchases','Recency', 'Teenhome', 'Year_Birth', 'Months_After_2012']]

y_train_oversampled=train_oversampled['Succesful_cmp']

X_testSet=testSet[['Education', 'Income', 'Kidhome', 'Marital_Status','NumDealsPurchases','Recency', 'Teenhome', 'Year_Birth', 'Months_After_2012']]

y_testSet=testSet['Succesful_cmp']

In [18]:
xgb_model = xgb.XGBClassifier(colsample_bytree=0.7, grow_policy="lossguide", max_bin=255, max_depth=6, objective="reg:logistic", n_estimators=100, reg_lambda=1.875, tree_method="hist")

xgb_model.fit(X_train_oversampled, y_train_oversampled)

y_predOver = xgb_model.predict(X_testSet)
accuracy = accuracy_score(y_testSet, y_predOver)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 68.97%


In [19]:
from sklearn.metrics import log_loss, roc_auc_score, f1_score
pred = xgb_model.predict_proba(X_testSet)

print("F1-Score:", f1_score(y_testSet, y_predOver, average='weighted'))
print("log_loss:",log_loss(y_testSet, pred))
print("AUC:",roc_auc_score(y_testSet, pred, multi_class="ovr"))

F1-Score: 0.7172881087433937
log_loss: 1.0473751036692778
AUC: 0.7819349076432593


In [20]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_testSet,y_predOver))
print(classification_report(y_testSet,y_predOver))

[[284  15   2  27  27   7]
 [ 11  15   0   0   5   3]
 [  0   0   1   1   0   0]
 [ 19   1   0   6   1   1]
 [ 10   0   0   1   2   0]
 [  3   3   1   0   1   1]]
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       362
           1       0.44      0.44      0.44        34
           2       0.25      0.50      0.33         2
           3       0.17      0.21      0.19        28
           4       0.06      0.15      0.08        13
           5       0.08      0.11      0.10         9

    accuracy                           0.69       448
   macro avg       0.31      0.37      0.33       448
weighted avg       0.75      0.69      0.72       448

