In [1]:
import random
import numpy as np
import pandas as pd
import joblib

In [2]:
advertiser_id = 'all'
path = "/Users/denniscimorosi/Desktop/Tesi/IPinYou/make-ipinyou-data-master/" + advertiser_id + "/"
train_name = 'train.final.csv'
test_name = 'test.final.csv'
model = 'model.joblib'
advertisers_list = ["1458", "2259", "2261", "2821", "2997", "3358", "3386", "3427", "3476"]

In [3]:
train_df = pd.read_csv(path + train_name, delimiter=',')
test_df = pd.read_csv(path + test_name, delimiter=',')

In [4]:
train_df.columns

Index(['click', 'weekday', 'hour', 'timestamp', 'slotprice', 'bidprice',
       'payprice', 'advertiser_0', 'advertiser_1', 'advertiser_2',
       'advertiser_3', 'advertiser_4', 'advertiser_5', 'advertiser_6',
       'advertiser_7', 'advertiser_8', 'os_0', 'os_1', 'os_2', 'os_3', 'os_4',
       'os_5', 'browser_0', 'browser_1', 'browser_2', 'browser_3', 'browser_4',
       'browser_5', 'browser_6', 'browser_7', 'browser_8', 'domain_0',
       'domain_1', 'domain_2', 'domain_3', 'domain_4', 'domain_5', 'domain_6',
       'domain_7', 'domain_8', 'domain_9', 'domain_10', 'domain_11',
       'domain_12', 'domain_13', 'domain_14', 'domain_15', 'domain_16',
       'domain_17', 'domain_18', 'domain_19', 'city_0', 'city_1', 'city_2',
       'city_3', 'city_4', 'city_5', 'city_6', 'city_7', 'city_8', 'city_9',
       'city_10', 'city_11', 'city_12', 'city_13', 'city_14', 'city_15',
       'city_16', 'region_0', 'region_1', 'region_2', 'region_3', 'region_4',
       'region_5', 'region_6', 'reg

In [5]:
#rename advertiser columns
train_df.rename(columns = {'advertiser_0':'advertiser_1458', 'advertiser_1':'advertiser_2259',
                     'advertiser_2':'advertiser_2261', 'advertiser_3':'advertiser_2821',
                     'advertiser_4':'advertiser_2997', 'advertiser_5':'advertiser_3358',
                     'advertiser_6':'advertiser_3386', 'advertiser_7':'advertiser_3427',
                     'advertiser_8':'advertiser_3476'}, inplace = True)

test_df.rename(columns = {'advertiser_0':'advertiser_1458', 'advertiser_1':'advertiser_2259',
                     'advertiser_2':'advertiser_2261', 'advertiser_3':'advertiser_2821',
                     'advertiser_4':'advertiser_2997', 'advertiser_5':'advertiser_3358',
                     'advertiser_6':'advertiser_3386', 'advertiser_7':'advertiser_3427',
                     'advertiser_8':'advertiser_3476'}, inplace = True)

In [6]:
# alcuni payprice sono a 0

In [7]:
def fen_to_euro(df):
    for col in df.columns:
        if 'price' in col:
            euro = df[col] * 0.0014 # convert 
            df[col] = round(euro, 2)
    return df

#train_df = fen_to_euro(train_df)
#test_df = fen_to_euro(test_df)

In [8]:
# load model
CTR_model = joblib.load(path + model)

In [10]:
class BudgetExtinguished(Exception):
    pass

class EvaluationFramework:
    def __init__(self, train_df, model, advertiser):
        # 0: initialise the bidding engine
        self.budget = 0
        self.clicks = 0 # KPI
        self.cost = 0 # amount spent so far
        self.wins = 0
        self.bids = [] 
        self.model = model
        self.budget_done = False
        self.advertiser = advertiser
        self.train_df = train_df[train_df['advertiser_' + self.advertiser] == 1]
        # max eCPC = campaign cost/clicks obtained
        train_cost = self.train_df['payprice'].sum()
        train_clicks = self.train_df['click'].sum()
        self.max_eCPC = round(train_cost /train_clicks, 2)

        self.bids_count = 0 # keep track of the amount of bids elaborated 
    
    def _show_stats(self, stats):
        for key in stats:
            stat = stats[key]
            is_float = type(stat) is float or type(stat) is np.float64
            if is_float: stat = round(stat, 4)
            print(key, ":", stat)

    def _predict_CTR(self, sample):
        # model prediction
        probs = self.model.predict_proba(sample)
        pCTR = probs[0,1] # pCTR mean: 0.0008
        return pCTR

    def _bidding_function(self, sample, max_eCPC):
        sample = pd.DataFrame(sample, index=[0]) # from dict back to pandas record (see evaluate method)
        pCTR = self._predict_CTR(sample)
        return pCTR * max_eCPC

    def _update_stats_on_win(self, price, click_feedback):
        self.wins += 1
        self.cost += price
        self.clicks += click_feedback

    def _record_evaluation(self, bid_request, pay_price, floor_price, click_feedback, verbose):
        # 1: bid request 
        bid = self._bidding_function(bid_request, self.max_eCPC) 
        self.bids.append(bid)

        # 2: bid price for this bid request
        if  self.cost > self.budget: 
            self.budget_done = True
            raise BudgetExtinguished

        else:
          # 3: auction winning checking
          win = (bid > floor_price) and (bid > pay_price)
          # 4: win notice, the charged price & user feedback if win
          if(win): self._update_stats_on_win(pay_price, click_feedback) 

        # keep track of the amount of bids elaborated during eval
        self.bids_count += 1
        if verbose:
            if self.bids_count % 100000 == 0:
                print(self.bids_count)
      
    def evaluate(self, test_df, verbose=False, budget_fraction=1):
        #select only bid of the advertiser in input
        test_df = test_df[test_df['advertiser_' + self.advertiser] == 1]
        
        # budget initialization: amount spendt in real scenario * budget_fraction
        self.budget = (test_df['payprice'].sum()) * budget_fraction
        self.budget_done = False
        n_records = test_df.shape[0]
        
        
        # sort the bid requests in cronological order
        test_df.sort_values('timestamp', inplace=True)
        

        # extract bid request data
        domain_cols = [x for x in test_df.columns if 'domain' in x] # select all the domain columns (more than one if one-one was used)
        os_cols = [x for x in test_df.columns if 'os' in x] # same as domain
        browser_cols = [x for x in test_df.columns if 'browser' in x] # same as browser
        city_cols = [x for x in test_df.columns if 'city' in x] # same as city
        region_cols = [x for x in test_df.columns if 'region' in x] # same as region
        other_cols = ['weekday', 'hour']
        advertisers_cols = ["advertiser_1458", "advertiser_2259", "advertiser_2261", "advertiser_2821",
                            "advertiser_2997", "advertiser_3358", "advertiser_3386", "advertiser_3427",
                            "advertiser_3476"]
        # the order of the columns must be: click weekday hour useragent region city domain
        bid_request = test_df[other_cols + advertisers_cols + os_cols + browser_cols + domain_cols + city_cols + region_cols]
        bid_request = bid_request.to_dict('records') # to work with more columns during vectorization we need each record in dict format

        # extract auction winning price and feedback (i.e. click)
        pay_price = test_df['payprice'] # i.e. market price, auction winning price
        floor_price = test_df['slotprice']
        click_feedback = test_df['click']

        # NOTE: record evaluation modifies the stats duiring execution
        vfun = np.vectorize(self._record_evaluation)
        try: 
            vfun(bid_request, pay_price, floor_price, click_feedback, verbose)
        except BudgetExtinguished:
            print('budget extinguished')

        # output
        stats = {
            'advertiser': self.advertiser,
            'budget': budget_fraction,
            'click': self.clicks,
            'wins': self.wins,
            'win ratio ': self.wins/n_records,
            'CTR': 0 if self.wins < 1 else self.clicks/self.wins,
            'bid mean': sum(self.bids)/n_records,
            'budget extinguished': self.budget_done,
            'CPC': 0 if self.clicks < 1 else self.cost/self.clicks
        }
        if verbose:
            self._show_stats(stats)

        return stats

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
for advertiser in advertisers_list:
    for budget_fraction in [1/32, 1/8, 1/2]:
        evaluation= EvaluationFramework(train_df, CTR_model, advertiser= advertiser)
        evaluation.evaluate(test_df, verbose=True , budget_fraction = budget_fraction)
        

budget extinguished
advertiser : 1458
budget : 0.0312
click : 26
wins : 37790
win ratio  : 0.0615
CTR : 0.0007
bid mean : 8.3146
budget extinguished : True
CPC : 54347.1923
100000
200000
budget extinguished
advertiser : 1458
budget : 0.125
click : 116
wins : 141560
win ratio  : 0.2303
CTR : 0.0008
bid mean : 31.7306
budget extinguished : True
CPC : 48725.181
100000
200000
300000
400000
500000
600000
advertiser : 1458
budget : 0.5
click : 243
wins : 298753
win ratio  : 0.4861
CTR : 0.0008
bid mean : 66.4595
budget extinguished : False
CPC : 51648.823
budget extinguished
advertiser : 2259
budget : 0.0312
click : 4
wins : 33393
win ratio  : 0.08
CTR : 0.0001
bid mean : 13.3611
budget extinguished : True
CPC : 339827.5
100000
200000
budget extinguished
advertiser : 2259
budget : 0.125
click : 26
wins : 119065
win ratio  : 0.2854
CTR : 0.0002
bid mean : 52.1073
budget extinguished : True
CPC : 209129.0769
100000
200000
300000
400000
advertiser : 2259
budget : 0.5
click : 49
wins : 206564
wi