In [40]:
import joblib as jb

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
from sklearn.metrics import f1_score
from sklearn.preprocessing import MaxAbsScaler

In [42]:
train_data = jb.load('train_data.pkl')

In [43]:
test_data = jb.load('test_data.pkl')

In [76]:
X_test = test_data.drop('Response', axis=1)
y_test = test_data['Response']

In [65]:
X_train, X_val, y_train, y_val = train_test_split(train_data.drop('Response', axis=1), train_data['Response'], train_size=0.9, random_state=23)

In [56]:
scaler = MaxAbsScaler()

In [66]:
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [77]:
X_test = scaler.fit_transform(X_test)

In [68]:
# Baseline

In [69]:
pred = test_data.apply(lambda x: 1 if x['AcceptedCmpIndicator'] > 0 and x['TotalCost'] > 500 else 0, axis=1)

In [70]:
f1_score(pred, test_data['Response'])

0.43749999999999994

In [71]:
# SVM

In [72]:
from sklearn.svm import SVC

In [73]:
def tune_svc(params):
    c_value, gamma = params
    mdl = SVC(C=c_value, gamma=gamma, kernel='rbf')
    mdl.fit(X_train, y_train)
    pred = mdl.predict(X_val)
    value = f1_score(pred, y_val)
    return -value

In [80]:
space = [(0.01, 100), (1, 100)]

In [89]:
res = gp_minimize(tune_svc, space, n_initial_points=20, random_state=67, n_calls=100, verbose=True)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.2046
Function value obtained: -0.0667
Current minimum: -0.0667
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2642
Function value obtained: -0.1250
Current minimum: -0.1250
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1113
Function value obtained: -0.2162
Current minimum: -0.2162
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2061
Function value obtained: -0.1290
Current minimum: -0.2162
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.2215
Function value obtained: -0.1818
Current minimum: -0.2162
Iteration No: 6 started. 

Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 1.1583
Function value obtained: -0.3478
Current minimum: -0.4762
Iteration No: 43 started. Searching for the next optimal point.
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 1.4819
Function value obtained: -0.2500
Current minimum: -0.4762
Iteration No: 44 started. Searching for the next optimal point.
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 1.2680
Function value obtained: -0.3478
Current minimum: -0.4762
Iteration No: 45 started. Searching for the next optimal point.
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 1.1955
Function value obtained: -0.3478
Current minimum: -0.4762
Iteration No: 46 started. Searching for the next optimal point.
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 1.1949
Function value obtained: -0.3333
Current minimum: -0.4762
Iteration No: 47 st



Iteration No: 66 ended. Search finished for the next optimal point.
Time taken: 1.6164
Function value obtained: -0.3243
Current minimum: -0.4762
Iteration No: 67 started. Searching for the next optimal point.
Iteration No: 67 ended. Search finished for the next optimal point.
Time taken: 1.5890
Function value obtained: -0.3333
Current minimum: -0.4762
Iteration No: 68 started. Searching for the next optimal point.
Iteration No: 68 ended. Search finished for the next optimal point.
Time taken: 2.8198
Function value obtained: -0.3243
Current minimum: -0.4762
Iteration No: 69 started. Searching for the next optimal point.
Iteration No: 69 ended. Search finished for the next optimal point.
Time taken: 1.3543
Function value obtained: -0.3333
Current minimum: -0.4762
Iteration No: 70 started. Searching for the next optimal point.
Iteration No: 70 ended. Search finished for the next optimal point.
Time taken: 1.8726
Function value obtained: -0.3243
Current minimum: -0.4762
Iteration No: 71 st



Iteration No: 72 ended. Search finished for the next optimal point.
Time taken: 1.6349
Function value obtained: -0.3243
Current minimum: -0.4762
Iteration No: 73 started. Searching for the next optimal point.
Iteration No: 73 ended. Search finished for the next optimal point.
Time taken: 1.6533
Function value obtained: -0.3333
Current minimum: -0.4762
Iteration No: 74 started. Searching for the next optimal point.
Iteration No: 74 ended. Search finished for the next optimal point.
Time taken: 1.5539
Function value obtained: -0.3243
Current minimum: -0.4762
Iteration No: 75 started. Searching for the next optimal point.
Iteration No: 75 ended. Search finished for the next optimal point.
Time taken: 1.8414
Function value obtained: -0.3478
Current minimum: -0.4762
Iteration No: 76 started. Searching for the next optimal point.
Iteration No: 76 ended. Search finished for the next optimal point.
Time taken: 2.0085
Function value obtained: -0.3478
Current minimum: -0.4762
Iteration No: 77 st



Iteration No: 90 ended. Search finished for the next optimal point.
Time taken: 2.2847
Function value obtained: -0.3243
Current minimum: -0.5306
Iteration No: 91 started. Searching for the next optimal point.




Iteration No: 91 ended. Search finished for the next optimal point.
Time taken: 1.9177
Function value obtained: -0.3243
Current minimum: -0.5306
Iteration No: 92 started. Searching for the next optimal point.




Iteration No: 92 ended. Search finished for the next optimal point.
Time taken: 2.0938
Function value obtained: -0.3243
Current minimum: -0.5306
Iteration No: 93 started. Searching for the next optimal point.




Iteration No: 93 ended. Search finished for the next optimal point.
Time taken: 2.0888
Function value obtained: -0.3243
Current minimum: -0.5306
Iteration No: 94 started. Searching for the next optimal point.




Iteration No: 94 ended. Search finished for the next optimal point.
Time taken: 2.8955
Function value obtained: -0.3243
Current minimum: -0.5306
Iteration No: 95 started. Searching for the next optimal point.
Iteration No: 95 ended. Search finished for the next optimal point.
Time taken: 2.8611
Function value obtained: -0.4615
Current minimum: -0.5306
Iteration No: 96 started. Searching for the next optimal point.
Iteration No: 96 ended. Search finished for the next optimal point.
Time taken: 2.3762
Function value obtained: -0.3830
Current minimum: -0.5306
Iteration No: 97 started. Searching for the next optimal point.
Iteration No: 97 ended. Search finished for the next optimal point.
Time taken: 2.2749
Function value obtained: -0.3478
Current minimum: -0.5306
Iteration No: 98 started. Searching for the next optimal point.
Iteration No: 98 ended. Search finished for the next optimal point.
Time taken: 2.6975
Function value obtained: -0.3478
Current minimum: -0.5306
Iteration No: 99 st

In [124]:
mdl = SVC(C=res.x[0], gamma=res.x[1], kernel='rbf', probability=True)

In [125]:
mdl.fit(X_train, y_train)

SVC(C=100.0, gamma=1.3162195289409921, probability=True)

In [130]:
pred = mdl.predict(X_test)

In [131]:
pred_proba = mdl.predict_proba(X_test)

In [132]:
pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,

In [134]:
pred_proba[:, 1]

array([0.27312297, 0.43099103, 0.04030256, 0.23896099, 0.10366019,
       0.12920237, 0.0792191 , 0.09096659, 0.11432664, 0.06695305,
       0.15232818, 0.25730574, 0.03435309, 0.06436964, 0.38013372,
       0.27911808, 0.10061213, 0.13772957, 0.09893721, 0.00654055,
       0.26853961, 0.08102855, 0.13351252, 0.12209723, 0.23018147,
       0.10098071, 0.08299465, 0.12186724, 0.91745749, 0.03063482,
       0.21892437, 0.00894445, 0.1851449 , 0.74770024, 0.97345903,
       0.11738191, 0.15109019, 0.04830877, 0.22795667, 0.06799476,
       0.23474083, 0.02880381, 0.21111112, 0.79848698, 0.24049784,
       0.08332494, 0.06307322, 0.10463719, 0.11298909, 0.02506169,
       0.01351658, 0.0777311 , 0.1052843 , 0.11238765, 0.07136304,
       0.13993494, 0.10390983, 0.38175025, 0.09691702, 0.27268093,
       0.18915877, 0.13484959, 0.31115827, 0.573588  , 0.10811385,
       0.1751626 , 0.13644363, 0.05023155, 0.37338053, 0.07134427,
       0.1036116 , 0.17244937, 0.03419625, 0.75169541, 0.18473

In [93]:
f1_score(pred, y_test)

0.3773584905660377

In [None]:
# xgboost

In [94]:
import xgboost as xgb

In [118]:
reg = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=2000,
                          silent=True, booster='gbtree', objective='binary:logistic')

reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
       verbose=True)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.62855	validation_1-logloss:0.63615
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.57504	validation_1-logloss:0.59047
[2]	validation_0-logloss:0.53079	validation_1-logloss:0.55297
[3]	validation_0-logloss:0.49273	validation_1-logloss:0.52133
[4]	validation_0-logloss:0.46025	validation_1-logloss:0.49450
[5]	validation_0-logloss:0.43148	validation_1-logloss:0.47071
[6]	validation_0-logloss:0.40704	validation_1-logloss:0.45383
[7]	validation_0-logloss:0.38486	validation_1-logloss:0.43625
[8]	validation_0-logloss:0.36491	validation_1-logloss:0.4

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=2000, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [119]:
pred = reg.predict(X_test)
pred_proba = reg.predict_proba(X_test)

In [122]:
f1_score(pred, y_test)

0.5864661654135338

In [136]:
test_data['pred_proba'] = pred_proba[:, 1]

In [145]:
test_data.sort_values(by='pred_proba', ascending=False).head(10)

Unnamed: 0,AcceptedCmpIndicator,TotalCost,NumIndicator,Education,Marital_Status,IndicatorChildren,Income,Year_Birth,Dt_Customer,Recency,Response,pred_proba
1559,0.0,1034,13,4,1,0,70179.0,2,2013,10,0,0.976526
2057,0.0,1034,13,4,1,0,70179.0,2,2013,10,0,0.976526
1914,0.0,1198,20,4,1,0,52278.0,2,2013,24,0,0.973459
15,3.0,1315,19,4,1,1,82800.0,5,2012,23,1,0.9635
1308,6.0,1189,18,2,0,1,65169.0,3,2014,23,1,0.955666
87,2.0,372,8,2,1,0,50388.0,1,2014,3,1,0.947129
278,1.5,925,13,4,1,1,69867.0,3,2013,30,1,0.941801
1955,0.0,1130,17,4,1,1,76842.0,1,2014,37,0,0.940281
1814,0.0,1338,13,2,1,0,72071.0,1,2013,4,0,0.927573
407,0.0,272,7,4,1,0,65808.0,2,2014,1,0,0.920378


In [None]:
test_data['pred_proba'].sort_value

In [None]:
test_data['pred_proba'].sort_values