## Web Economic CW

In [1]:
import warnings; warnings.filterwarnings("ignore");

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics
from pyfm import pylibfm

import pickle
import time
start_time = time.time()

pd.options.mode.chained_assignment = None  # default='warn'



In [2]:
#Read CSV data
df_train = pd.read_csv('dataset/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('dataset/test.csv', encoding="ISO-8859-1")
df_validation = pd.read_csv('dataset/validation.csv', encoding="ISO-8859-1")
# raw_data = pd.read_csv('train.csv', header=None, sep=',')

In [3]:
df_train['click'].value_counts()/ df_train.shape[0]

0    0.999246
1    0.000754
Name: click, dtype: float64

In [4]:
impression = 2664159
#impression = 0
#for i in range (df_train.shape[0]):
#    if df_train['bidprice'][i] >= df_train['payprice'][i]:
        
#        impression += 1
#return impression
impression        

2664159

In [5]:
average_ctr = (df_train['click'].value_counts()/impression)[1]
average_ctr

0.00076346794616987952

## Downsampling

In [6]:
df_train_0 = df_train[df_train['click'] == 0]
df_train_1 = df_train[df_train['click'] == 1]

In [7]:
df_train_0 = df_train_0.sample(frac=0.0075, random_state=2017)

In [8]:
df_train = pd.concat([df_train_1, df_train_0])

## Feature extraction & hashing

In [9]:
df_train.columns

Index(['click', 'weekday', 'hour', 'bidid', 'logtype', 'userid', 'useragent',
       'IP', 'region', 'city', 'adexchange', 'domain', 'url', 'urlid',
       'slotid', 'slotwidth', 'slotheight', 'slotvisibility', 'slotformat',
       'slotprice', 'creative', 'bidprice', 'payprice', 'keypage',
       'advertiser', 'usertag'],
      dtype='object')

In [10]:
header_train = [u'click', u'weekday', u'hour', u'bidid', u'logtype', u'userid', u'useragent',
       u'IP', u'region', u'city', u'adexchange', u'domain', u'url', u'urlid',
       u'slotid', u'slotwidth', u'slotheight', u'slotvisibility', u'slotformat',
       u'slotprice', u'creative', u'bidprice', u'payprice', u'keypage',
       u'advertiser', u'usertag']
X_header = [u'click', u'weekday', u'hour', u'bidid', u'logtype', u'userid', u'useragent',
       u'IP', u'region', u'city', u'adexchange', u'domain', u'url', u'urlid',
       u'slotid', u'slotwidth', u'slotheight', u'slotvisibility', u'slotformat',
       u'slotprice', u'creative', u'bidprice', u'payprice', u'keypage',
       u'advertiser', u'usertag']


In [11]:
df_train.columns = header_train
X = np.array(df_train[X_header])

In [12]:
X[0]

array([1, 6, 16, '61868b8d67b4a94f917cb98cb579578f', 1,
       'u_Vhk7PrMPOZM6GQ2', 'windows_ie', '1.86.243.*', 333, 334, '2',
       'trqRTuMvjTN7X9KbuKz', '9d960a6b69c7a2ca69625dec449e338a', 'null',
       '593718700', 728, 90, '0', '0', 192,
       '4ad7e35171a3d8de73bb862791575f2e', 238, 216,
       'd29e59bf0f7f8243858b8183f14d4412', 3358,
       '10083,13776,10059,10057,10110'], dtype=object)

In [13]:
D = 50000

def process_row(line, header):
    rst = []
    non_use_keys = set(['logtype', 'click', 'bidid', 'bidprice', 'payprice'])
    for key,value in zip(header, line):
        value = str(value)
        key = str(key)
        if value == '' or key in non_use_keys:
            continue
#         elif key == 'timestamp':
#             rst += ['time_day_'+value[6:8]]
        elif key == 'useragent':
            value = value.split('_')
            rst += ['useragent_os_'+value[0]]
            rst += ['useragent_browser_'+value[1]]
        elif key == 'usertag':
            for sub_value in value.split(','):
                rst += ['usertags_'+str(sub_value)]
        else:
            rst += [key+'_'+value]
    return [abs(hash(v)) % D for v in rst]

In [14]:
X = [process_row(x, X_header) for x in X]

In [15]:
np.std([len(x) for x in X])

4.2107345943185512

In [16]:
Y = np.array(df_train[['bidid', 'click', 'payprice']])

In [17]:
with open('train.xyz.csv','w') as f:
    for y, x in zip(Y, X):
        line = ','.join(map(str, y)) + ',' + ','.join(map(str, x)) + '\n'
        f.write(line)

In [18]:
# for validation data

X_val = np.array(df_validation[X_header])
df_validation.columns = header_train
X_val = np.array(df_validation[X_header])
Y_val = np.array(df_validation[['bidid', 'click', 'payprice']])
X_val = [process_row(x, X_header) for x in X_val]

In [19]:
with open('validation.xyz.csv','w') as f:
    for y, x in zip(Y_val, X_val):
        line = ','.join(map(str, y)) + ',' + ','.join(map(str, x)) + '\n'
        f.write(line)

In [20]:
# for test data
header_test = [u'weekday', u'hour', u'bidid', u'logtype',
       u'userid', u'useragent', u'IP', u'region', u'city', u'adexchange',
       u'domain', u'url', u'urlid', u'slotid', u'slotwidth', u'slotheight',
       u'slotvisibility', u'slotformat', u'slotprice', u'creative',
        u'keypage', u'advertiser', u'usertag']
X_header = [u'weekday', u'hour', u'bidid', u'logtype',
       u'userid', u'useragent', u'IP', u'region', u'city', u'adexchange',
       u'domain', u'url', u'urlid', u'slotid', u'slotwidth', u'slotheight',
       u'slotvisibility', u'slotformat', u'slotprice', u'creative',
     u'keypage', u'advertiser', u'usertag']

df_test.columns = header_test
X_test = np.array(df_test[X_header])
Y_test = np.array(df_test[['bidid']])
X_test = [process_row(x, X_header) for x in X_test]

In [21]:
with open('test.xyz.csv','w') as f:
    for y, x in zip(Y_test, X_test):
        line = ','.join(map(str, y)) + ',' + ','.join(map(str, x)) + '\n'
        f.write(line)

In [22]:
X = []
Y = []
with open('train.xyz.csv','r') as f:
    for row in f.readlines():
        r = row.strip().split(',')
        Y.append(r[0:3])
        X.append(list(map(int,r[3:])))
               
X_val = []
Y_val = []
with open('validation.xyz.csv','r') as f:
    for row in f.readlines():
        r = row.strip().split(',')
        Y_val.append(r[0:3])
        X_val.append(list(map(int,r[3:])))
        
X_test = []
Y_test = []
with open('test.xyz.csv','r') as f:
    for row in f.readlines():
        r = row.strip().split(',')
        Y_test.append(r[0:1])
        X_test.append(list(map(int,r[1:])))

In [23]:
X[0]

[25555,
 5679,
 13501,
 10061,
 8471,
 29474,
 32274,
 33549,
 17563,
 45779,
 47731,
 48679,
 41427,
 7447,
 26703,
 16063,
 311,
 25182,
 29730,
 39871,
 16760,
 27390,
 42721,
 32104,
 32374,
 34129]

In [24]:
X = np.array([np.array(x, dtype=np.int32) for x in X])
X_val = np.array([np.array(x, dtype=np.int32) for x in X_val])
X_test = np.array([np.array(x, dtype=np.int32) for x in X_test])
Y = np.array(Y)
shuffle = np.random.permutation(len(Y))
X = X[shuffle]
Y = Y[shuffle]

from sklearn.feature_extraction import DictVectorizer
train = [{str(xx):1 for xx in x} for x in X]
val = [{str(xx):1 for xx in x} for x in X_val]
test = [{str(xx):1 for xx in x} for x in X_test]
v = DictVectorizer()
X_FM = v.fit_transform(train)
X_val_FM = v.transform(val)
X_test_FM = v.fit_transform(test)

Y_val = np.array(Y_val)
target = np.array(Y[:,1], dtype=np.int32)
target_val = np.array(Y_val[:,1], dtype=np.int32)

# Model

In [25]:
# Caculate the calibrated CTR
def Cali_CTR(p, w):

    cali_ctr = p/(p+(1-p)/w)

    return cali_ctr   

In [26]:
# Caculate the root mean square rooot
def rmse(predictions, targets):

    differences = predictions - targets                       #the DIFFERENCEs.

    differences_squared = differences ** 2                    #the SQUAREs of ^

    mean_of_differences_squared = differences_squared.mean()  #the MEAN of ^

    rmse_val = np.sqrt(mean_of_differences_squared)           #ROOT of ^

    return rmse_val     #get the ^

##  Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

start_time = time.time()
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=1000,
                                  max_features = 0.5,
                                  min_samples_leaf = 50,
                                  oob_score = True,
                                  max_depth = 8,
                                  n_jobs = -1,
                                  random_state = 2017).fit(X_FM, target)
#pickle.dump(rf_model,open('random_forest_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 1.14 minutes ---


In [28]:
start_time = time.time()
# predict with model
#rf_model = pickle.load(open('random_forest_model.sav', 'rb'))  
Cali_rf = Cali_CTR(rf_model.predict_proba(X_val_FM), 0.0075)[:,1]

print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.6 minutes ---


In [29]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_rf, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_rf)))

auc on test:  0.875567159158
RMSE on test dataset = 0.03


In [30]:
df_train.iloc[20]

click                                            1
weekday                                          3
hour                                             9
bidid             0942e488346a1a30cfe08f80425bc278
logtype                                          1
userid                           u_Vh5hC3MlP9udlCl
useragent                               windows_ie
IP                                    110.81.254.*
region                                         124
city                                           129
adexchange                                       2
domain                                        null
url               1094ee84b13d866cb43c94f9b986e1f5
urlid                                         null
slotid                                   463662836
slotwidth                                      728
slotheight                                      90
slotvisibility                                   2
slotformat                                       0
slotprice                      

In [31]:
aaaaaaa = pd.DataFrame({'test':rf_model.feature_importances_})

In [32]:
aaaaaaa.head()

Unnamed: 0,test
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [33]:
aaaaaaa.sort_values(by=['test'],inplace=True,ascending=False)

In [34]:
aaaaaaa

Unnamed: 0,test
35061,0.229885
6738,0.136558
50,0.088494
12225,0.061008
9794,0.059942
35375,0.057616
11187,0.043539
24017,0.040312
5478,0.037641
5940,0.026839


In [160]:
#zip(map(lambda x: x, rf_model.feature_importances_)

### Extra Tree 

In [38]:
from sklearn.ensemble import ExtraTreesClassifier

start_time = time.time()
extra_model = ExtraTreesClassifier(bootstrap=False, 
                                   min_samples_leaf=2,
                                   min_samples_split=6,
                                   n_estimators=100,
                                   n_jobs=-1,
                                   random_state=2017).fit(X_FM, target)
pickle.dump(extra_model,open('extra_tree_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.04 minutes ---


In [39]:
start_time = time.time()
# predict with model
extra_model = pickle.load(open('extra_tree_model.sav', 'rb'))

Cali_extra = Cali_CTR(extra_model.predict_proba(X_val_FM), 0.0075)[:,1]
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.09 minutes ---


In [40]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_extra, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_extra)))

auc on test:  0.823916073755
RMSE on test dataset = 0.03


### Decision Tree 

In [41]:
from sklearn.tree import DecisionTreeClassifier

start_time = time.time()
decision_model = DecisionTreeClassifier(criterion='gini', 
                                        splitter='best', #'random' 
                                        max_depth=8, 
                                        min_samples_split=4, 
                                        min_samples_leaf=1, 
                                        random_state=2017, 
                                        class_weight='balanced').fit(X_FM, target)
pickle.dump(decision_model,open('decision_tree_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.01 minutes ---


In [42]:
start_time = time.time()
# predict with model
decision_model = pickle.load(open('decision_tree_model.sav', 'rb'))
Cali_dec = Cali_CTR(decision_model.predict_proba(X_val_FM), 0.0075)[:,1]
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.0 minutes ---


In [43]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_dec, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_dec)))

auc on test:  0.844066542203
RMSE on test dataset = 0.04


## Logistic Regression 

In [197]:
from sklearn.linear_model import LogisticRegression

start_time = time.time()
lr_model = LogisticRegression(C=0.137,#9,  #0.665045541881
                              penalty='l2',
                              class_weight='balanced',
                              fit_intercept=True,
                              intercept_scaling=1,
                              max_iter=20,
                              multi_class='ovr',
                              n_jobs = -1,
                              random_state=2017,
                              solver='liblinear',
                              verbose=True).fit(X_FM,target)
pickle.dump(lr_model,open('logistic_regression_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

[LibLinear]--- time use: 0.01 minutes ---


In [198]:
start_time = time.time()
lr_model = pickle.load(open('logistic_regression_model.sav', 'rb'))
Cali_lr = Cali_CTR(lr_model.predict(X_val_FM), 0.0075)
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.0 minutes ---


In [199]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_lr, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_lr)))

auc on test:  0.774905595472
RMSE on test dataset = 0.35


## Elastic Net 

In [206]:
from sklearn.linear_model import ElasticNet

start_time = time.time()
elastic_model = ElasticNet(alpha=0.0005,
                        l1_ratio = 0.5, #0.7
                        random_state = 2017).fit(X_FM, target)
pickle.dump(elastic_model,open('elatic_net_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 4.77 minutes ---


In [207]:
start_time = time.time()
elastic_model = pickle.load(open('elatic_net_model.sav', 'rb'))
Cali_elas = Cali_CTR(np.expm1(elastic_model.predict(X_val_FM)), 0.0075)
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.0 minutes ---


In [208]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_elas, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_elas)))

auc on test:  0.801791308357
RMSE on test dataset = 0.05


## SGDClassifier

In [151]:
from sklearn.linear_model import SGDClassifier

start_time = time.time()
sgd_model = SGDClassifier(loss='log',
                          penalty="elasticnet",
                          #penalty='l2',
                          alpha=0.0003,
                          l1_ratio=0.009,
                          fit_intercept=True,
                          n_iter=5,
                          shuffle=True,
                          verbose=0,
                          epsilon=0.1,
                          n_jobs = -1,
                          random_state=2017,
                          learning_rate='optimal', 
                          eta0=0.0, 
                          power_t=0.5, 
                          class_weight=None, 
                          warm_start=False, 
                          average=False).fit(X_FM, target)
pickle.dump(sgd_model,open('SGDClassifier_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.0 minutes ---


In [152]:
start_time = time.time()
sgd_model = pickle.load(open('SGDClassifier_model.sav', 'rb'))
Cali_sgd = Cali_CTR(sgd_model.predict_proba(X_val_FM), 0.0075)[:,1]
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.0 minutes ---


In [153]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_sgd, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_sgd)))

auc on test:  0.829974851755
RMSE on test dataset = 0.03


## XGBClassifier

In [56]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_FM, label=target)
dval = xgb.DMatrix(X_val_FM, label=target_val)
dtest = xgb.DMatrix(X_test_FM)

In [57]:
start_time = time.time()
#xgbclassifier
xgb_model = xgb.XGBClassifier(max_depth=6,
                         learning_rate=0.095,
                         n_estimators=1000,
                         silent=True,
                         objective='binary:logistic',
                         nthread=-1,
                         gamma=0, 
                         min_child_weight=1,
                         max_delta_step=0,
                         subsample=1,
                         colsample_bytree=1,
                         colsample_bylevel=1,
                         reg_alpha=0,
                         reg_lambda=1,
                         scale_pos_weight=1,
                         base_score=0.5,
                         seed=2017).fit(X_FM, target)
pickle.dump(xgb_model,open('XGBClassifier_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.69 minutes ---


In [60]:
start_time = time.time()
xgb_model = pickle.load(open('XGBClassifier_model.sav', 'rb'))
Cali_xgb = Cali_CTR(xgb_model.predict_proba(X_val_FM), 0.0075)[:,1]
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.15 minutes ---


In [61]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_xgb, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_xgb)))

auc on test:  0.87568908606
RMSE on test dataset = 0.03


## XGBoost 

In [62]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_FM, label=target)
dval = xgb.DMatrix(X_val_FM, label=target_val)
dtest = xgb.DMatrix(X_test_FM)

In [63]:
start_time = time.time()

learning_rate = 0.1 #0.879086213156
max_depth = 12
min_child_weight = 6
num_round =50

xgb_params = {
    #General Parameters
    'booster': 'gbtree',
    'silent': 0,    
    #Tree Booster Parameters
    'eta': learning_rate,    
    'max_depth': max_depth, #usually 3-10, big value can provide from overfitting
    'min_child_weight': min_child_weight,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 1, # improve speed
    'scale_pos_weight': 1, #default = 1, positive value can converge faster
    'early_stopping_rounds': 50,   
    #learn objective parameters
    'objective': 'reg:logistic',
    'evalmatrix': 'rmse',
    'seed': 2017}

watchlist = [(dtrain, 'train'), (dval, 'eval')]

bst_model = xgb.train(xgb_params, dtrain, num_round, watchlist)
pickle.dump(bst_model,open('XGBoost_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

[0]	train-rmse:0.460769	eval-rmse:0.456051
[1]	train-rmse:0.426627	eval-rmse:0.416922
[2]	train-rmse:0.396816	eval-rmse:0.382042
[3]	train-rmse:0.370844	eval-rmse:0.350959
[4]	train-rmse:0.348303	eval-rmse:0.323199
[5]	train-rmse:0.328774	eval-rmse:0.298434
[6]	train-rmse:0.311938	eval-rmse:0.27639
[7]	train-rmse:0.29747	eval-rmse:0.256811
[8]	train-rmse:0.285041	eval-rmse:0.239305
[9]	train-rmse:0.274458	eval-rmse:0.223594
[10]	train-rmse:0.265497	eval-rmse:0.209648
[11]	train-rmse:0.257818	eval-rmse:0.197398
[12]	train-rmse:0.251303	eval-rmse:0.186475
[13]	train-rmse:0.245819	eval-rmse:0.176787
[14]	train-rmse:0.241111	eval-rmse:0.168123
[15]	train-rmse:0.237169	eval-rmse:0.160569
[16]	train-rmse:0.233817	eval-rmse:0.153872
[17]	train-rmse:0.230905	eval-rmse:0.147859
[18]	train-rmse:0.228527	eval-rmse:0.142651
[19]	train-rmse:0.226359	eval-rmse:0.137899
[20]	train-rmse:0.22451	eval-rmse:0.133782
[21]	train-rmse:0.223004	eval-rmse:0.130187
[22]	train-rmse:0.22169	eval-rmse:0.126964
[2

In [64]:
start_time = time.time()
bst_model = pickle.load(open('XGBoost_model.sav', 'rb'))
Cali_bst = Cali_CTR(bst_model.predict(dval), 0.0075)
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.01 minutes ---


In [65]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_bst, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_bst)))

auc on test:  0.893795988128
RMSE on test dataset = 0.03


## AdaBoost 

In [177]:
from sklearn.ensemble import AdaBoostClassifier
start_time = time.time()

ada_model = AdaBoostClassifier(base_estimator=None, 
                               n_estimators=100, 
                               learning_rate=0.5, 
                               random_state=2017).fit(X_FM,target)
pickle.dump(ada_model,open('AdaBoost_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.07 minutes ---


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x152d01a20>>
Traceback (most recent call last):
  File "/anaconda/lib/python3.6/site-packages/xgboost-0.6-py3.6.egg/xgboost/core.py", line 337, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'


In [178]:
start_time = time.time()
ada_model = pickle.load(open('AdaBoost_model.sav', 'rb'))
Cali_ada = Cali_CTR(ada_model.predict_proba(X_val_FM), 0.0075)[:,1]
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.11 minutes ---


In [179]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_ada, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_ada)))

auc on test:  0.840433782339
RMSE on test dataset = 0.03


## Neural Network

In [219]:
from sklearn.neural_network import MLPClassifier
start_time = time.time()

nn_model = MLPClassifier(hidden_layer_sizes=(140,70),
                   activation = 'logistic',
                   solver = 'lbfgs',
                   max_iter = 150,
                   learning_rate_init = 0.75,
                   alpha = 0.0001, #l2 penalty
                   random_state=2017).fit(X_FM, target)

pickle.dump(nn_model,open('Neural_Network_model.sav', 'wb'))
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 2.14 minutes ---


In [220]:
start_time = time.time()
nn_model = pickle.load(open('Neural_Network_model.sav', 'rb'))

Cali_nn = Cali_CTR(nn_model.predict_proba(X_val_FM), 0.0075)[:,1]
print("--- time use: %s minutes ---" % round(((time.time() - start_time)/60),2))

--- time use: 0.03 minutes ---


In [221]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_nn, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_nn)))

auc on test:  0.843039813835
RMSE on test dataset = 0.05


## Bidding Strategy


Apply the pCTR to the validation set and evaluate with pay price for each impression

In [143]:
def evaluation (pBidprice, payprice, clicks, budget=6250):
    win_num = 0.
    total_cost = 0.
    click_num = 0.
    for i in range(0, len(pBidprice)):
        if pBidprice[i] > payprice[i]:
            if total_cost + payprice[i] / 1000. > budget:
                break
            win_num = win_num + 1
            total_cost = total_cost + payprice[i] / 1000.
            click_num += clicks[i]
    ctr = click_num / win_num
    cpc = total_cost / click_num
    print('win_num {}, total_cost {}, click_num {}, ctr {}, cpc {}'.format
          (win_num, total_cost, click_num, ctr, cpc))
    return win_num, total_cost, click_num, ctr, cpc       

In [144]:
def pbidprice(ctr, base_bid, flag):
    pbid = []
    for i in range(len(ctr)):
        if flag == 'linear':
            p = base_bid * ((ctr[i]/average_ctr))            
            pbid.append(p)
        elif flag == 'squared':
            p = base_bid * ((ctr[i]/average_ctr)**2)            
            pbid.append(p)
        elif flag == 'constant':
            pbid.append(base_bid)
        elif flag == 'random':
            p = np.random.randint(100,200)
            pbid.append(p)
        elif flag == 'gated':
            tradeoff = (ctr[i]/average_ctr)
            if tradeoff > 1:
                p = 124 # get from constant strategy
            else: p = 1
            pbid.append(p) 
            
    if flag == 'linear':
        print(" **LINEAR** ")
    elif flag == 'squared':
        print("**SQUARED**")
    elif flag == 'constant':
        print("**CONSTANT**")
    elif flag == "random":
        print("**RANDOM**")
    elif flag == "gated":
        print("**GATED**")
    return pbid

In [71]:
Payprice_val = np.array(Y_val[:,2], dtype=np.int32)
clicks = np.array(Y_val[:,1], dtype=np.int32)
#pCTR = Cali_xgb

In [72]:
Payprice_train = np.array(Y[:,2], dtype=np.int32)
clicks_train = np.array(Y[:,1], dtype=np.int32)
#pCTR = Cali_xgb

### Constant Bidding Strategy

In [113]:
def constant_bidding(base_bid, Payprice):
    base = 300
    win_number = []
    totalcost = []
    clicknum = []
    ctr_p = []
    cpc_p = []
    bid_p = []
    Cali_constant = [[]]*len(Payprice)
    for i in range (50):

        constant_bid = pbidprice(Cali_constant, base, 'constant')
        win_num, total_cost, click_num, ctr, cpc = evaluation(constant_bid, Payprice_train, clicks_train)

        #win_num, total_cost, click_num, ctr, cpc = evaluation(constant_bid, Payprice_val, clicks)
        base+=1

        win_number.append(win_num)
        totalcost.append(total_cost)
        clicknum.append(click_num)
        ctr_p.append(ctr)
        cpc_p.append(cpc)
        bid_p.append(constant_bid)
        print(i+1,'/50')
    return best_bid

In [254]:
#on training
base = 70
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
bid_p = []
#Cali_constant = [[]]*len(Payprice_train)
Cali_constant = [[]]*len(Payprice_val)


for i in range (50):

    constant_bid = pbidprice(Cali_constant, base, 'constant')
    #win_num, total_cost, click_num, ctr, cpc = evaluation(constant_bid, Payprice_train, clicks_train)

    win_num, total_cost, click_num, ctr, cpc = evaluation(constant_bid, Payprice_val, clicks)
    base+=1
    
    win_number.append(win_num)
    totalcost.append(total_cost)
    clicknum.append(click_num)
    ctr_p.append(ctr)
    cpc_p.append(cpc)
    bid_p.append(constant_bid)
    print(i+1,'/50')

**CONSTANT**
win_num 149629.0, total_cost 5238.832000002614, click_num 58.0, ctr 0.00038762539347319034, cpc 90.32468965521748
1 /50
**CONSTANT**
win_num 161979.0, total_cost 6249.966000002683, click_num 70.0, ctr 0.0004321547854968854, cpc 89.2852285714669
2 /50
**CONSTANT**
win_num 161122.0, total_cost 6249.976000002561, click_num 68.0, ctr 0.0004220404414046499, cpc 91.91141176474355
3 /50
**CONSTANT**
win_num 160445.0, total_cost 6249.935000002575, click_num 68.0, ctr 0.0004238212471563464, cpc 91.91080882356728
4 /50
**CONSTANT**
win_num 156607.0, total_cost 6249.99400000276, click_num 69.0, ctr 0.00044059333235423704, cpc 90.5796231884458
5 /50
**CONSTANT**
win_num 155792.0, total_cost 6249.99000000264, click_num 66.0, ctr 0.0004236417787819657, cpc 94.69681818185818
6 /50
**CONSTANT**
win_num 155061.0, total_cost 6249.984000002532, click_num 68.0, ctr 0.00043853709185417354, cpc 91.91152941180195
7 /50
**CONSTANT**
win_num 152959.0, total_cost 6249.948000002469, click_num 67.0, 

KeyboardInterrupt: 

In [246]:
print('clicks',max(clicknum),'base',clicknum.index(max(clicknum))+100)
best_clicks = max(clicknum)
best_bid = clicknum.index(max(clicknum))+100

clicks 89.0 base 120


### Random Bidding Strategy

In [245]:
rand_time = 50 

base = 1 # in random strategy, base is not used
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
Cali_random = [[]]*len(Payprice_val)
#Cali_random = [[]]*len(Payprice_train)


for i in range (rand_time):

    random_bid = pbidprice(Cali_random, base, 'random')
    #win_num, total_cost, click_num, ctr, cpc = evaluation(random_bid, Payprice_train, clicks_train)
    win_num, total_cost, click_num, ctr, cpc = evaluation(random_bid, Payprice_val, clicks)
    
    win_number.append(win_num)
    totalcost.append(total_cost)
    clicknum.append(click_num)
    ctr_p.append(ctr)
    cpc_p.append(cpc)
    bid_p.append(random_bid)
    print(i+1,'/', rand_time)


**RANDOM**
win_num 105678.0, total_cost 6249.980000001479, click_num 88.0, ctr 0.0008327182573477924, cpc 71.0225000000168
1 / 50
**RANDOM**
win_num 105669.0, total_cost 6249.965000001457, click_num 83.0, ctr 0.0007854716141914847, cpc 75.30078313254768
2 / 50
**RANDOM**
win_num 105599.0, total_cost 6249.992000001482, click_num 85.0, ctr 0.0008049318648850842, cpc 73.52931764707625
3 / 50
**RANDOM**
win_num 105800.0, total_cost 6249.987000001472, click_num 83.0, ctr 0.0007844990548204159, cpc 75.30104819278881
4 / 50
**RANDOM**
win_num 105574.0, total_cost 6249.970000001465, click_num 83.0, ctr 0.0007861784151400914, cpc 75.30084337351163
5 / 50
**RANDOM**
win_num 105812.0, total_cost 6249.986000001495, click_num 86.0, ctr 0.0008127622575889313, cpc 72.67425581397087
6 / 50
**RANDOM**
win_num 105640.0, total_cost 6249.992000001489, click_num 84.0, ctr 0.0007951533510034078, cpc 74.40466666668439
7 / 50
**RANDOM**
win_num 105854.0, total_cost 6249.965000001487, click_num 86.0, ctr 0.000

In [247]:
rand_a = sum(win_number)/rand_time
rand_b = sum(totalcost)/rand_time
rand_c = sum(clicknum)/rand_time
rand_d = sum(ctr_p)/rand_time
rand_e = sum(cpc_p)/rand_time
print('win_num {}, total_cost {}, click_num {}, ctr {}, cpc {}'.format(rand_a, rand_b, np.round(rand_c), rand_d, rand_e))

win_num 105669.82, total_cost 6249.967160001479, click_num 84.0, ctr 0.0007943616372548433, cpc 74.51479996684266


### Gated Bidding Strategy 

In [147]:
#base = 124.
base = 100                                                                                                                                                                                                                                                            
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
bid_p = []
#Cali_constant = [[]]*len(Payprice_val)


for i in range (10):

    bidprice = pbidprice(Cali_bst, base, 'gated')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bidprice, Payprice_val, clicks)
    base+=1
    
    win_number.append(win_num)
    totalcost.append(total_cost)
    clicknum.append(click_num)
    ctr_p.append(ctr)
    cpc_p.append(cpc)
    bid_p.append(bidprice)
    print(i+1,'/10')

**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
1 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
2 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
3 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
4 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
5 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
6 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.0036557748301520544, cpc 16.08911504424623
7 /10
**GATED**
win_num 30910.0, total_cost 1818.069999999824, click_num 113.0, ctr 0.00365577483015205

### Squared Bidding Strategy 

In [104]:
base = 130. #139
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []

rf_model = pickle.load(open('random_forest_model.sav', 'rb'))
Cali_rf = Cali_CTR(rf_model.predict(X_val_FM), 0.0075)

for i in range(10):
    bid_p = pbidprice(Cali_rf, base, 'squared')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1  

this is a **SQUARED** strategy to get predict bid price
win_num 1973.0, total_cost 149.21900000000002, click_num 100.0, ctr 0.05068423720223011, cpc 1.4921900000000001
130.0
this is a **SQUARED** strategy to get predict bid price
win_num 1973.0, total_cost 149.21900000000002, click_num 100.0, ctr 0.05068423720223011, cpc 1.4921900000000001
131.0
this is a **SQUARED** strategy to get predict bid price
win_num 1973.0, total_cost 149.21900000000002, click_num 100.0, ctr 0.05068423720223011, cpc 1.4921900000000001
132.0
this is a **SQUARED** strategy to get predict bid price
win_num 1973.0, total_cost 149.21900000000002, click_num 100.0, ctr 0.05068423720223011, cpc 1.4921900000000001
133.0
this is a **SQUARED** strategy to get predict bid price
win_num 1973.0, total_cost 149.21900000000002, click_num 100.0, ctr 0.05068423720223011, cpc 1.4921900000000001
134.0
this is a **SQUARED** strategy to get predict bid price
win_num 1973.0, total_cost 149.21900000000002, click_num 100.0, ctr 0.0506

#stacking
win_num 89099.0, total_cost 6499.991000000484, click_num 186.0, ctr 0.0020875655170091694, cpc 34.94618817204561
224.0
#rf
win_num 114513.0, total_cost 6499.993000001925, click_num 185.0, ctr 0.0016155371005911992, cpc 35.1350972973077
170.0

### Use single model in Linear Bidding Strategy

### rf

In [116]:
base = 200. #139
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []

for i in range(50):
    bid_p = pbidprice(Cali_rf, base, 'squared')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1  

this is a **SQUARED** strategy to get predict bid price
win_num 111324.0, total_cost 5615.257000002218, click_num 178.0, ctr 0.0015989364377852035, cpc 31.546387640461898
200.0
this is a **SQUARED** strategy to get predict bid price
win_num 111716.0, total_cost 5639.916000002247, click_num 178.0, ctr 0.001593325933617387, cpc 31.68492134832723
201.0
this is a **SQUARED** strategy to get predict bid price
win_num 112118.0, total_cost 5662.718000002259, click_num 178.0, ctr 0.0015876130505360425, cpc 31.813022471922803
202.0
this is a **SQUARED** strategy to get predict bid price
win_num 112588.0, total_cost 5687.9100000022745, click_num 178.0, ctr 0.001580985540199666, cpc 31.95455056181053
203.0
this is a **SQUARED** strategy to get predict bid price
win_num 113110.0, total_cost 5716.12900000229, click_num 178.0, ctr 0.0015736893289717974, cpc 32.11308426967579
204.0
this is a **SQUARED** strategy to get predict bid price
win_num 113584.0, total_cost 5741.1570000023075, click_num 178.0

In [227]:
base =400. #139
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []

for i in range(50):
    bid_p = pbidprice(Cali_nn, base, 'linear')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1  

 **LINEAR** 
win_num 97976.0, total_cost 6215.379000001488, click_num 175.0, ctr 0.0017861517106230098, cpc 35.51645142857993
400.0
 **LINEAR** 
win_num 98068.0, total_cost 6222.983000001484, click_num 175.0, ctr 0.001784476077823551, cpc 35.559902857151336
401.0
 **LINEAR** 
win_num 98167.0, total_cost 6231.493000001492, click_num 175.0, ctr 0.0017826764595026841, cpc 35.60853142857995
402.0
 **LINEAR** 
win_num 98275.0, total_cost 6239.987000001494, click_num 175.0, ctr 0.0017807173747138134, cpc 35.65706857143711
403.0
 **LINEAR** 
win_num 98375.0, total_cost 6248.123000001502, click_num 175.0, ctr 0.0017789072426937739, cpc 35.703560000008586
404.0
 **LINEAR** 
win_num 98375.0, total_cost 6249.892000001503, click_num 175.0, ctr 0.0017789072426937739, cpc 35.71366857143716
405.0
 **LINEAR** 
win_num 98356.0, total_cost 6249.987000001495, click_num 174.0, ctr 0.0017690837366302005, cpc 35.919465517249975
406.0
 **LINEAR** 
win_num 98328.0, total_cost 6249.9910000015, click_num 174.0,

KeyboardInterrupt: 

### xgb 

In [191]:
base = 150. #172
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
for i in range(30):
    bid_p = pbidprice(Cali_xgb, base, 'linear')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1

 **LINEAR** 
win_num 101710.0, total_cost 5814.7860000013015, click_num 184.0, ctr 0.0018090649886933438, cpc 31.60209782609403
150.0
 **LINEAR** 
win_num 102129.0, total_cost 5844.401000001307, click_num 185.0, ctr 0.0018114345582547562, cpc 31.59135675676382
151.0
 **LINEAR** 
win_num 102548.0, total_cost 5873.363000001311, click_num 185.0, ctr 0.0018040332332176152, cpc 31.747908108115194
152.0
 **LINEAR** 
win_num 103002.0, total_cost 5904.54500000132, click_num 185.0, ctr 0.0017960816294829228, cpc 31.916459459466594
153.0
 **LINEAR** 
win_num 103426.0, total_cost 5935.272000001313, click_num 185.0, ctr 0.0017887185040512056, cpc 32.08255135135845
154.0
 **LINEAR** 
win_num 103833.0, total_cost 5965.851000001318, click_num 185.0, ctr 0.0017817071643889707, cpc 32.24784324325037
155.0
 **LINEAR** 
win_num 104290.0, total_cost 5999.905000001322, click_num 185.0, ctr 0.0017738997027519417, cpc 32.431918918926065
156.0
 **LINEAR** 
win_num 104713.0, total_cost 6033.4610000013245, clic

In [78]:
base = 180. #136
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
for i in range(30):
    bid_p = pbidprice(Cali_bst, base, 'squared')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1   

**SQUARED**
win_num 100968.0, total_cost 5874.841000001382, click_num 182.0, ctr 0.00180255130338325, cpc 32.279346153853744
180.0
**SQUARED**
win_num 101278.0, total_cost 5897.121000001387, click_num 183.0, ctr 0.0018069077193467486, cpc 32.224704918040366
181.0
**SQUARED**
win_num 101577.0, total_cost 5915.252000001388, click_num 183.0, ctr 0.0018015889423786881, cpc 32.32378142077261
182.0
**SQUARED**
win_num 101930.0, total_cost 5935.806000001393, click_num 183.0, ctr 0.0017953497498283135, cpc 32.43609836066335
183.0
**SQUARED**
win_num 102231.0, total_cost 5955.6650000014, click_num 183.0, ctr 0.0017900636793144937, cpc 32.544617486346446
184.0
**SQUARED**
win_num 102506.0, total_cost 5974.681000001404, click_num 183.0, ctr 0.0017852613505550895, cpc 32.64853005465248
185.0
**SQUARED**
win_num 102804.0, total_cost 5995.524000001398, click_num 183.0, ctr 0.001780086377961947, cpc 32.76242622951584
186.0
**SQUARED**
win_num 103108.0, total_cost 6017.0300000014, click_num 183.0, ctr

### nn 

In [135]:
base = 170. #185
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
for i in range(16):
    bid_p = pbidprice(Cali_nn, base, 'linear')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
    base += 1

win_num 106058.0, total_cost 6127.671000001743, click_num 178.0, ctr 0.0016783269531765637, cpc 34.42511797753788
170.0
win_num 106395.0, total_cost 6153.728000001744, click_num 178.0, ctr 0.0016730109497626769, cpc 34.57150561798732
171.0
win_num 106718.0, total_cost 6180.110000001758, click_num 178.0, ctr 0.001667947300361701, cpc 34.71971910113347
172.0
win_num 107074.0, total_cost 6208.473000001765, click_num 179.0, ctr 0.0016717410389076714, cpc 34.684206703920474
173.0
win_num 107395.0, total_cost 6231.931000001777, click_num 179.0, ctr 0.0016667442618371433, cpc 34.81525698325015
174.0
win_num 107756.0, total_cost 6258.709000001785, click_num 179.0, ctr 0.001661160399420914, cpc 34.96485474861333
175.0
win_num 108099.0, total_cost 6286.171000001791, click_num 179.0, ctr 0.0016558895086911073, cpc 35.11827374302677
176.0
win_num 108419.0, total_cost 6309.834000001808, click_num 179.0, ctr 0.0016510021306228612, cpc 35.25046927375312
177.0
win_num 108728.0, total_cost 6333.7120000

# Feature Selection

In [615]:
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier

clf1 = Pipeline([
  ('feature_selection', SelectFromModel(LogisticRegression(C=0.01,#9,  #0.665045541881
                                                           penalty='l2',
                                                           class_weight='balanced',
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           max_iter=200,
                                                           multi_class='ovr',
                                                           n_jobs = -1,
                                                           random_state=2017,
                                                           solver='liblinear',
                                                           verbose=True))),
  ('classification', xgb.xgbclassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.5, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=2017, silent=True, subsample=1))
])
clf1.fit(X_FM,target)

[LibLinear]

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=200,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=2017,
          solver='liblinear', tol=0.0001, v...istic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=2017, silent=True, subsample=1))])

In [616]:
y_pred_xgb_ada = clf1.predict_proba(X_val_FM)
Cali_xgb_ada = Cali_CTR(y_pred_xgb_ada, 0.0075)[:,1]

In [617]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_xgb_ada, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_xgb_ada)))

auc on test:  0.869427300913
RMSE on test dataset = 0.03


# Stacking

 for categorical features: LR, FM
 use FM to train, get encoded features
 use linear model like LR to train new features

 for continuous features: 
 use GBDT(XGB) to turn continuous features to categorical features


#linear
---lr: 0.767409702075, 0.4
---elas: 0.801852253342, 0.1
sgd: 0.823197564068, 0.03

#boost
booster: 0.864006668834, 0.03
xgboost: 0.898314588633, 0.03
adaboost: 0.839688962382, 0.03

#tree
extra tree: 0.83478449023, 0.03
random forest tree: 0.876310841613, 0.03

nn: 0.844836254246, 0.04

fm: 0.821892502294, 0.03

In [228]:
Wold = 0.5
w_xgb = 1.0 - Wold
w_xgbc = 0.75 * Wold
w_ada = 1 * Wold

w_fm = 0.51 * Wold
w_nn = 0.37 * Wold

w_elas = 0.12 * Wold
w_sgd = 1 * Wold

w_ext = 1 * Wold
w_rft = 0.25 * Wold


x_train_stack = np.concatenate(
                (np.log1p(w_xgb*Cali_bst.reshape(-1,1)), 
                 np.log1p(w_rft*Cali_rf.reshape(-1,1)), 
                 np.log1p(w_xgbc*Cali_xgb.reshape(-1,1)),
                 #np.log1p(w_elas*Cali_elas.reshape(-1,1))
                ), axis=1)

In [235]:
from sklearn.linear_model import LinearRegression
watchlist = [(dtrain, 'train'), (dval, 'eval')]
stack_model = xgb.train(xgb_params, dtrain, num_round, watchlist)

#stack_model = LinearRegression()



#stack_model.fit(x_train_stack, target_val)

[0]	train-rmse:0.460769	eval-rmse:0.456051
[1]	train-rmse:0.426627	eval-rmse:0.416922
[2]	train-rmse:0.396816	eval-rmse:0.382042
[3]	train-rmse:0.370844	eval-rmse:0.350959
[4]	train-rmse:0.348303	eval-rmse:0.323199
[5]	train-rmse:0.328774	eval-rmse:0.298434
[6]	train-rmse:0.311938	eval-rmse:0.27639
[7]	train-rmse:0.29747	eval-rmse:0.256811
[8]	train-rmse:0.285041	eval-rmse:0.239305
[9]	train-rmse:0.274458	eval-rmse:0.223594
[10]	train-rmse:0.265497	eval-rmse:0.209648
[11]	train-rmse:0.257818	eval-rmse:0.197398
[12]	train-rmse:0.251303	eval-rmse:0.186475
[13]	train-rmse:0.245819	eval-rmse:0.176787
[14]	train-rmse:0.241111	eval-rmse:0.168123
[15]	train-rmse:0.237169	eval-rmse:0.160569
[16]	train-rmse:0.233817	eval-rmse:0.153872
[17]	train-rmse:0.230905	eval-rmse:0.147859
[18]	train-rmse:0.228527	eval-rmse:0.142651
[19]	train-rmse:0.226359	eval-rmse:0.137899
[20]	train-rmse:0.22451	eval-rmse:0.133782
[21]	train-rmse:0.223004	eval-rmse:0.130187
[22]	train-rmse:0.22169	eval-rmse:0.126964
[2

AttributeError: 'Booster' object has no attribute 'fit'

In [236]:
#predict_xgbssss = stack_model.predict(X_val)
predict_xgbssss = stack_model.predict(dval)
#Cali_lin = Cali_CTR(linreg_model.predict(X_val_FM), 0.0075)

Cali_stack = Cali_CTR(predict_xgbssss, 0.0075)

In [237]:
predict_xgbssss

array([ 0.01258515,  0.35675454,  0.13332993, ...,  0.07875905,
        0.04661544,  0.01455905], dtype=float32)

In [238]:
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_stack, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_stack)))

auc on test:  0.893795988128
RMSE on test dataset = 0.03


In [162]:
(df_validation["click"]).sum()

226

In [243]:
base = 180 #136
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
for i in range(30):
    bid_p = pbidprice(Cali_stack, base, 'squared')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1

**SQUARED**
win_num 100968.0, total_cost 5874.841000001382, click_num 182.0, ctr 0.00180255130338325, cpc 32.279346153853744
180
**SQUARED**
win_num 101278.0, total_cost 5897.121000001387, click_num 183.0, ctr 0.0018069077193467486, cpc 32.224704918040366
181
**SQUARED**
win_num 101577.0, total_cost 5915.252000001388, click_num 183.0, ctr 0.0018015889423786881, cpc 32.32378142077261
182
**SQUARED**
win_num 101930.0, total_cost 5935.806000001393, click_num 183.0, ctr 0.0017953497498283135, cpc 32.43609836066335
183
**SQUARED**
win_num 102231.0, total_cost 5955.6650000014, click_num 183.0, ctr 0.0017900636793144937, cpc 32.544617486346446
184
**SQUARED**
win_num 102506.0, total_cost 5974.681000001404, click_num 183.0, ctr 0.0017852613505550895, cpc 32.64853005465248
185
**SQUARED**
win_num 102804.0, total_cost 5995.524000001398, click_num 183.0, ctr 0.001780086377961947, cpc 32.76242622951584
186
**SQUARED**
win_num 103108.0, total_cost 6017.0300000014, click_num 183.0, ctr 0.00177483803

In [412]:
#save the submission
#with open('group11_submission.csv','w') as f:
#    f.write('id, payprice\n')
#    for y, x in zip(Bid_id, bidprice_test):
#        f.write('%s,%s\n' % (y, x))

In [90]:
from pyfm import pylibfm

fm = pylibfm.FM(num_factors=20, 
                num_iter=200, 
                verbose=True, 
                task="classification", 
                initial_learning_rate=0.0001, 
                learning_rate_schedule="optimal")
fm.fit(X_FM,target)
pred_fm_test = fm.predict(X_val_FM)
Cali_fm = Cali_CTR(pred_fm_test, 0.0075)
fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_fm, pos_label=1)
auc = metrics.auc(fpr, tpr)

print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_fm)))

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.36794
-- Epoch 2
Training log loss: 0.29948
-- Epoch 3
Training log loss: 0.29091
-- Epoch 4
Training log loss: 0.28470
-- Epoch 5
Training log loss: 0.27944
-- Epoch 6
Training log loss: 0.27479
-- Epoch 7
Training log loss: 0.27063
-- Epoch 8
Training log loss: 0.26683
-- Epoch 9
Training log loss: 0.26331
-- Epoch 10
Training log loss: 0.26005
-- Epoch 11
Training log loss: 0.25701
-- Epoch 12
Training log loss: 0.25414
-- Epoch 13
Training log loss: 0.25147
-- Epoch 14
Training log loss: 0.24893
-- Epoch 15
Training log loss: 0.24650
-- Epoch 16
Training log loss: 0.24423
-- Epoch 17
Training log loss: 0.24203
-- Epoch 18
Training log loss: 0.23995
-- Epoch 19
Training log loss: 0.23794
-- Epoch 20
Training log loss: 0.23603
-- Epoch 21
Training log loss: 0.23417
-- Epoch 22
Training log loss: 0.23235
-- Epoch 23
Training log loss: 0.23071
-- Epoch 24
Training log loss: 0.229

In [91]:
Cali_fm

array([ 0.00010251,  0.00849571,  0.00089773, ...,  0.00058971,
        0.00079036,  0.00041024])

In [107]:
predict_xgbc = xgb_model.predict_proba(X_val_FM)[:,1]

In [108]:
predict_xgb=(bst_model.predict(dval))

In [99]:
y_pred_combine_fm_xgb = 0.1 * pred_fm_test + 0.9* predict_xgb
Cali_combine_fm_xgb = Cali_CTR(y_pred_combine_fm_xgb, 0.0075)

fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_combine_fm_xgb, pos_label=1)
auc = metrics.auc(fpr, tpr)
print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_combine_fm_xgb)))

auc on test:  0.892427100683
RMSE on test dataset = 0.03


In [109]:
y_pred_combine_fm_xgb = 0.1 * predict_xgbc + 0.9* predict_xgb
Cali_combine_fm_xgb = Cali_CTR(y_pred_combine_fm_xgb, 0.0075)

fpr, tpr, thresholds = metrics.roc_curve(target_val, Cali_combine_fm_xgb, pos_label=1)
auc = metrics.auc(fpr, tpr)
print('auc on test: ', auc)
print ("RMSE on test dataset = %.2f" % (rmse(target_val, Cali_combine_fm_xgb)))

auc on test:  0.894163415997
RMSE on test dataset = 0.03


In [110]:
base = 120 #136
bidp = []
win_number = []
totalcost = []
clicknum = []
ctr_p = []
cpc_p = []
for i in range(30):
    bid_p = pbidprice(Cali_combine_fm_xgb, base, 'linear')
    win_num, total_cost, click_num, ctr, cpc = evaluation(bid_p,Payprice_val, clicks)
    print(base)
#     win_number.append(win_num)
#     totalcost.append(total_cost)
#     clicknum.append(click_num)
#     ctr_p.append(ctr)
#     cpc_p.append(cpc)
#     bidp.append(bid_p)
    base += 1

 **LINEAR** 
win_num 119560.0, total_cost 5641.977000001879, click_num 184.0, ctr 0.0015389762462361995, cpc 30.66291847827108
120
 **LINEAR** 
win_num 120273.0, total_cost 5690.921000001902, click_num 185.0, ctr 0.0015381673359773182, cpc 30.761735135145415
121
 **LINEAR** 
win_num 120949.0, total_cost 5735.320000001932, click_num 185.0, ctr 0.001529570314760767, cpc 31.001729729740177
122
 **LINEAR** 
win_num 121671.0, total_cost 5784.547000001968, click_num 186.0, ctr 0.0015287126759868826, cpc 31.09971505377402
123
 **LINEAR** 
win_num 122334.0, total_cost 5829.0050000019855, click_num 186.0, ctr 0.001520427681593016, cpc 31.33873655915046
124
 **LINEAR** 
win_num 123084.0, total_cost 5880.015000002003, click_num 186.0, ctr 0.001511163108121283, cpc 31.61298387097851
125
 **LINEAR** 
win_num 123710.0, total_cost 5924.143000002001, click_num 186.0, ctr 0.001503516288093121, cpc 31.850231182806457
126
 **LINEAR** 
win_num 124404.0, total_cost 5972.6320000020205, click_num 186.0, ctr 