In [51]:
import os, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve, accuracy_score

In [2]:
dfoff = pd.read_csv('train_offline.csv')
dftest = pd.read_csv('test_offline.csv')
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [4]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [5]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [6]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [7]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [72]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)
predictors = original_feature

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [9]:
train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,is_train
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,1,0,0,0,0,0.95,20,1,1,True
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,...,0,0,0,1,0,0.95,20,1,1,True
2,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,1,0,0,0.9,200,20,1,True
3,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,0,0,1,0,0,0.5,10,5,1,True
4,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,0,...,0,0,0,0,1,0.9,100,10,1,True


In [10]:
train.dtypes

User_id            int64
Merchant_id        int64
Coupon_id        float64
Discount_rate     object
Distance         float64
Date_received    float64
Date             float64
label              int64
weekday          float64
weekday_type       int64
weekday_1          uint8
weekday_2          uint8
weekday_3          uint8
weekday_4          uint8
weekday_5          uint8
weekday_6          uint8
weekday_7          uint8
discount_rate    float64
discount_man       int64
discount_jian      int64
discount_type      int64
is_train            bool
dtype: object

In [11]:
'''Set a seed for reproducibility'''
seed = 43
dropcolumns=['User_id','Merchant_id','Coupon_id','Discount_rate','Date_received','Date','is_train']
df_train = train.drop(columns = dropcolumns, axis = 1)
df_test = valid.drop(columns = dropcolumns, axis = 1)

In [12]:
'''Extract data sets as input and output for machine learning models.'''
X_train = df_train.drop(columns = ['label'], axis = 1) 
y_train = df_train['label']

"""Extract test set"""
X_test  = df_test.drop(columns = ['label'], axis = 1).copy()

'''See the dimensions of input and output data set.'''
print('Input Matrix Dimension:  ', X_train.shape)
print('Output Vector Dimension: ', y_train.shape)
print('Test Data Dimension:     ', X_test.shape)

Input Matrix Dimension:   (667753, 14)
Output Vector Dimension:  (667753,)
Test Data Dimension:      (79216, 14)


In [None]:
'''#1.Logistic Regression'''
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

'''#3.Random Forest Classifier'''
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = seed, n_estimators = 100)

'''#4.KNN'''
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

'''#6.Decision Tree Classifier'''
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = seed)

'''#7.Gradient Boosting Classifier'''
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state = seed)

'''#10.Extreme Gradient Boosting'''
from xgboost import XGBClassifier
xgbc = XGBClassifier(random_state = seed, tree_method = 'gpu_hist')

In [14]:
'''Create a function that returns train accuracy of different models.'''
def train_accuracy(model):
    start = time.time()
    model.fit(X_train, y_train)
    train_accuracy = model.score(X_train, y_train)
    train_accuracy = np.round(train_accuracy*100, 2)
    end = time.time()
    print(end-start)
    return train_accuracy

In [30]:
'''Models with best training accuracy:'''
train_accuracy = pd.DataFrame({'Train_accuracy(%)':[train_accuracy(lr), train_accuracy(rf), train_accuracy(knn), train_accuracy(dt), train_accuracy(gbc), train_accuracy(xgbc)]})
train_accuracy.index = ['LR', 'RF', 'KNN', 'DT', 'GBC', 'XGBC']
sorted_train_accuracy = train_accuracy.sort_values(by = 'Train_accuracy(%)', ascending = False)



4.781664609909058
70.06777453422546
567.1314775943756
2.0626137256622314
59.442872285842896
44.190572023391724


In [31]:
sorted_train_accuracy

Unnamed: 0,Train_accuracy(%)
LR,95.14
RF,95.14
DT,95.14
GBC,95.14
XGBC,95.14
KNN,95.0


In [32]:
'''Create a function that returns mean cross validation score for different models.'''
def x_val_score(model):
    from sklearn.model_selection import cross_val_score
    x_val_score = cross_val_score(model, X_train, y_train, cv = 5, scoring = 'accuracy').mean()
    x_val_score = np.round(x_val_score*100, 2)
    return x_val_score

"""Let's perform k-fold (k=5) cross validation to find the classifier with the best cross validation accuracy."""
x_val_score = pd.DataFrame({'X_val_score(%)':[x_val_score(lr), x_val_score(rf), x_val_score(knn), x_val_score(dt), x_val_score(gbc), x_val_score(xgbc)]})
x_val_score.index = ['LR', 'RF', 'KNN', 'DT', 'GBC', 'XGBC']
sorted_x_val_score = x_val_score.sort_values(by = 'X_val_score(%)', ascending = False) 
sorted_x_val_score



Unnamed: 0,X_val_score(%)
LR,95.14
GBC,95.14
XGBC,95.14
DT,95.13
RF,95.12
KNN,94.63


In [122]:
"""Define all the models' hyperparameters one by one first::"""

'''Define hyperparameters the logistic regression will be tuned with. For LR, the following hyperparameters are usually tunned.'''
lr_params = {'penalty':['l1', 'l2'],
             'C': np.logspace(0, 4, 10)}

'''For GBC, the following hyperparameters are usually tunned.'''
gbc_params = {'learning_rate': [0.01, 0.02, 0.05],
              'max_depth': [4, 6],
              'max_features': [1.0, 0.3], 
              'min_samples_split': [ 2, 3],
              'random_state':[seed]}


'''For DT, the following hyperparameters are usually tunned.'''
dt_params = {'max_features': ['auto', 'sqrt', 'log2'],
             'min_samples_split': [2, 3, 4, 8, 9, 10], 
             'min_samples_leaf':[1, 2, 5, 6, 9, 10],
             'random_state':[seed]}

'''For RF, the following hyperparameters are usually tunned.'''
rf_params = {'criterion':['gini','entropy'],
             'n_estimators':[10, 15, 20, 25, 30],
             'min_samples_leaf':[1, 2, 3],
             'min_samples_split':[3, 4, 5, 6, 7], 
             'max_features':['sqrt', 'auto', 'log2'],
             'random_state':[44]}

'''For KNN, the following hyperparameters are usually tunned.'''
knn_params = {'n_neighbors':[3, 5, 7],
              'leaf_size':[1, 3, 5],
              'weights':['uniform', 'distance'],
              'algorithm':['auto', 'ball_tree','kd_tree','brute']}


'''For XGBC, the following hyperparameters are usually tunned.'''
xgbc_params = {'n_estimators': (150, 550, 850),
              'learning_rate': (0.01, 0.6),
              'subsample': (0.3, 0.9),
              'max_depth': [3, 5, 7],
              'colsample_bytree': (0.5, 0.9),
              'min_child_weight': [1, 3],
              'random_state':[seed]}


'''Create a function to tune hyperparameters of the selected models.'''
def tune_hyperparameters(model, params):
    from sklearn.model_selection import GridSearchCV
    global best_params, best_score
    # Construct grid search object with 10 fold cross validation.
    grid = GridSearchCV(model, params, verbose = 10, cv = 3, scoring = 'accuracy', n_jobs = -1)
    # Fit using grid search.
    grid.fit(X_train, y_train)
    best_params, best_score = grid.best_params_, np.round(grid.best_score_*100, 2)
    return best_params, best_score

In [38]:
# '''Tune LR hyperparameters.'''
# tune_hyperparameters(lr, params = lr_params)
# lr_best_params, lr_best_score = best_params, best_score
# print('LR Best Score:', lr_best_score)
# print('And Best Parameters:', lr_best_params)

# """Tune GBC's hyperparameters."""
# tune_hyperparameters(gbc, params = gbc_params)
# gbc_best_score, gbc_best_params = best_score, best_params
# print('GBC Best Score:', gbc_best_score)
# print('And Best Parameters:', gbc_best_params)

# """Tune DT's hyperparameters."""
# tune_hyperparameters(dt, params = dt_params)
# dt_best_score, dt_best_params = best_score, best_params
# print('DT Best Score:', dt_best_score)
# print('And Best Parameters:', dt_best_params)

# """Tune RF's hyperparameters."""
# tune_hyperparameters(rf, params = rf_params)
# rf_best_score, rf_best_params = best_score, best_params
# print('RF Best Score:', rf_best_score)
# print('And Best Parameters:', rf_best_params)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:  5.4min remaining:   49.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.9min finished


LR Best Score: 95.14
And Best Parameters: {'C': 1.0, 'penalty': 'l1'}


In [15]:
lr_best_params = {'C': 1.0, 'penalty': 'l1'}
gbc_best_params = {'learning_rate': 0.01, 'max_depth': 4, 'max_features': 1.0, 'min_samples_split': 2, 'random_state': 43}
dt_best_params = {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 2, 'random_state': 43}
rf_best_params = {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 25, 'random_state': 44}
xgbc_best_params = {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 550, 'random_state': 43, 'subsample': 0.9}

In [124]:
"""Tune XGBC's hyperparameters."""
tune_hyperparameters(xgbc, params = xgbc_params)
xgbc_best_score, xgbc_best_params = best_score, best_params

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 57.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 70.8min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 87.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 103.7min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 113.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 124.4min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 141.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed

In [129]:
'''Instantiate the models with optimized hyperparameters.'''
rf  = RandomForestClassifier(**rf_best_params)
gbc = GradientBoostingClassifier(**gbc_best_params)
knn = KNeighborsClassifier()
lr  = LogisticRegression(**lr_best_params)
dt  = DecisionTreeClassifier(**dt_best_params)
xgbc = XGBClassifier(**dt_best_params)

'''Train all the models with optimised hyperparameters.'''
models = {'RF':rf, 'GBC':gbc, 'KNN':knn, 'LR':lr, 'DT':dt, 'XGBC':xgbc}
score = []
for x, (keys, items) in enumerate(models.items()):
    # Train the models with optimized parameters using cross validation.
    # No need to fit the data. cross_val_score does that for us.
    # But we need to fit train data for prediction in the follow session.
    items.fit(X_train, y_train)
    scores = cross_val_score(items, X_train, y_train, cv = 3, scoring = 'accuracy')*100
    score.append(scores.mean())
    print('Mean Accuracy: %0.4f (+/- %0.4f) [%s]'  % (scores.mean(), scores.std(), keys))

Mean Accuracy: 95.1320 (+/- 0.0024) [RF]
Mean Accuracy: 95.1371 (+/- 0.0000) [GBC]
Mean Accuracy: 93.7156 (+/- 0.7781) [KNN]




Mean Accuracy: 95.1371 (+/- 0.0000) [LR]
Mean Accuracy: 95.1376 (+/- 0.0004) [DT]
Mean Accuracy: 95.1371 (+/- 0.0000) [XGBC]


In [49]:
'''Make prediction using all the trained models.'''
model_prediction = pd.DataFrame({'RF':rf.predict_proba(X_test)[:,1], 'GBC':gbc.predict_proba(X_test)[:,1], 
                                 'DT':dt.predict_proba(X_test)[:,1], 'KNN':knn.predict_proba(X_test)[:,1], 
                                 'LR':lr.predict_proba(X_test)[:,1],'XGBC':xgbc.predict_proba(X_test)[:,1]})

"""Let's see how each model classifies a prticular class."""
model_prediction.head()

Unnamed: 0,RF,GBC,DT,KNN,LR,XGBC
0,0.017032,0.028816,0.016731,0.0,0.005943,0.017751
1,0.0,0.022148,0.0,0.0,0.005474,0.002886
2,0.010207,0.022148,0.010568,0.0,0.007932,0.003572
3,0.055365,0.054368,0.054784,0.0,0.059208,0.063646
4,7.7e-05,0.022148,0.0,0.0,0.005658,0.003607


In [65]:
model_dict = {'RF':rf,'GBC':gbc,'DT':dt,'KNN':knn,'LR':lr,'XGBC':xgbc}

In [None]:
for idx, (key, value) in enumerate(model_dict.items()):
    auc_score = roc_auc_score(y_true=df_test.label, y_score=value.predict_proba(X_test)[:,1])
    acc = accuracy_score(y_true=df_test.label, y_pred=value.predict_proba(X_test).argmax(axis=1))
    print("{}: Validation AUC: {:.3f}, Accuracy: {:.3f}".format(key, auc_score, acc))

### XGBC: Validation AUC: 0.785, Accuracy: 0.952 (before optimized)

In [64]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

(306313, 19)


In [None]:
for idx, (key, value) in enumerate(model_dict.items()):
    y_test_pred = value.predict_proba(testset[X_test.columns])
    test1 = testset.copy()
    test1['pred_prob'] = y_test_pred[:, 1]
    #print(test1.shape)

    output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
    #print(output.shape)

    output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
    output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
    output.reset_index(drop=True, inplace=True)

    ### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
    out = output.groupby("uid", as_index=False).mean()
    out = out[["uid", "pred_prob"]]
    out.columns = ["uid", "label"]
    out.to_csv("{}.csv".format(key), header=["uid", "label"], index=False) # submission format

print('Done')

## try others

In [131]:
'''Initialize bagging classifier.'''
from sklearn.ensemble import BaggingClassifier
bagg = BaggingClassifier(base_estimator = rf, verbose = 10, n_jobs = -1, random_state = seed)
'''We use rf as the base estimator for bagging technique.'''
print('Fitting Bagging Ensemble...')
bagg.fit(X_train, y_train)
print('Done.')


y_test_pred = bagg.predict_proba(testset[X_test.columns])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
#print(test1.shape)

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
#print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("bagging.csv", header=["uid", "label"], index=False) # submission format

Fitting Bagging Ensemble...


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:   53.1s remaining:  2.7min
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:   55.1s remaining:  1.5min
[Parallel(n_jobs=8)]: Done   4 out of   8 | elapsed:   55.7s remaining:   55.7s
[Parallel(n_jobs=8)]: Done   5 out of   8 | elapsed:   55.8s remaining:   33.4s
[Parallel(n_jobs=8)]: Done   6 out of   8 | elapsed:   56.2s remaining:   18.7s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  1.2min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Done.


[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.9s remaining:    8.9s
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:    3.0s remaining:    5.1s
[Parallel(n_jobs=8)]: Done   4 out of   8 | elapsed:    3.0s remaining:    3.0s
[Parallel(n_jobs=8)]: Done   5 out of   8 | elapsed:    3.1s remaining:    1.8s
[Parallel(n_jobs=8)]: Done   6 out of   8 | elapsed:    3.1s remaining:    1.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.2s finished


In [134]:
'''We will use gradient boosting and extreme gradient boosting classifiers for boosting ensemble method.'''
'''Initialize boosting classifier. Base models for boosting:'''
from mlxtend.classifier import EnsembleVoteClassifier
boost_models = [gbc, xgbc]
boost = EnsembleVoteClassifier(clfs = boost_models, voting='hard')

'''Fitting boosting.'''
print('Fitting Boosting Ensemble...')
boost.fit(X_train, y_train)
print('Done.')

y_test_pred = boost.predict_proba(testset[X_test.columns])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
#print(test1.shape)

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
#print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("boosting.csv", header=["uid", "label"], index=False) # submission format

Fitting Boosting Ensemble...
Done.
