In [1]:
import os
import random

import joblib
import numpy as np
import pandas as pd

In [2]:
# iPinYou advertiser whose data and model are evaluated in this notebook.
advertiser_id = '1458'
# Root of the make-ipinyou-data output. Set the IPINYOU_DATA_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's local checkout.
data_root = os.environ.get(
    "IPINYOU_DATA_DIR",
    "/Users/denniscimorosi/Desktop/Tesi/IPinYou/make-ipinyou-data-master",
)
# The trailing separator is kept on purpose: later cells build file paths
# with plain string concatenation (`path + file_name`).
path = os.path.join(data_root, advertiser_id, "")
train_name = advertiser_id + '.train.final.csv'
test_name = advertiser_id + '.test.final.csv'
model = advertiser_id + '.model.joblib'

In [3]:
# Read the train and test CSV files of the selected advertiser.
train_df, test_df = (
    pd.read_csv(path + csv_name, sep=',')
    for csv_name in (train_name, test_name)
)

In [4]:
# NOTE: some payprice values are 0

In [5]:
def fen_to_euro(df, rate=0.0014, decimals=2):
    """Convert every price column of ``df`` to euro, in place.

    A column is treated as a price column when its label is a string that
    contains ``'price'``. Labels that are not strings are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose price columns are overwritten with the converted values.
    rate : float, default 0.0014
        Multiplier applied to each price value.
    decimals : int, default 2
        Number of decimal places kept after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (mutated, not copied).
        Calling the function twice on the same frame converts twice.
    """
    # Collect the labels first; the isinstance guard avoids a TypeError on
    # non-string labels such as integer column names.
    price_cols = [col for col in df.columns if isinstance(col, str) and 'price' in col]
    for col in price_cols:
        df[col] = (df[col] * rate).round(decimals)
    return df

#train_df = fen_to_euro(train_df)
#test_df = fen_to_euro(test_df)

In [6]:
# Load the pre-trained CTR model stored in the advertiser's data directory.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading, so only load model files that come from a trusted source.
CTR_model = joblib.load(path + model)

In [7]:
class BudgetExtinguished(Exception):
    """Signals that the campaign spend has exceeded the available budget."""

class EvaluationFramework:
    """Replay logged auctions against a CTR-based linear bidding strategy.

    For every bid request the bid is ``pCTR * max_eCPC``, where ``pCTR`` is
    the positive-class probability returned by ``model.predict_proba`` and
    ``max_eCPC`` is the average cost per click observed in the training log.
    An auction is won when the bid is strictly greater than both the floor
    price and the logged pay price; the winner is charged the pay price.
    """

    def __init__(self, train_df, model):
        """Initialise the bidding engine.

        Parameters
        ----------
        train_df : pandas.DataFrame
            Historical log; the ``payprice`` and ``click`` columns are read.
        model : object
            Fitted classifier exposing ``predict_proba``.

        Raises
        ------
        ValueError
            If ``train_df`` contains no clicks, because the max eCPC
            (cost / clicks) would not be finite.
        """
        self.model = model
        self.budget = 0

        # max eCPC = campaign cost / clicks obtained, i.e. how much each click
        # cost on average in the historical data.
        train_cost = train_df['payprice'].sum()
        train_clicks = train_df['click'].sum()
        if train_clicks <= 0:
            raise ValueError("train_df must contain at least one click to derive max_eCPC")
        self.max_eCPC = round(train_cost / train_clicks, 2)

        self._reset_counters()

    def _reset_counters(self):
        """Reset all per-run statistics so each evaluation starts from zero."""
        self.clicks = 0        # KPI
        self.cost = 0          # amount spent so far
        self.wins = 0
        self.bids = []         # every bid actually placed
        self.bids_count = 0    # number of bid requests processed
        self.budget_done = False

    def _show_stats(self, stats):
        """Print each statistic on its own line, rounding floats to 4 decimals."""
        for key, stat in stats.items():
            if isinstance(stat, (float, np.floating)):
                stat = round(stat, 4)
            print(key, ":", stat)

    def _predict_CTR(self, sample):
        """Return the predicted click probability for a one-row sample."""
        probs = self.model.predict_proba(sample)
        # Column 1 of the first row is taken as the click probability.
        return probs[0, 1]

    def _bidding_function(self, sample, max_eCPC):
        """Compute the bid for one bid request given as a dict of features."""
        # From dict back to a one-row DataFrame, the form passed to the model.
        sample = pd.DataFrame(sample, index=[0])
        pCTR = self._predict_CTR(sample)
        return pCTR * max_eCPC

    def _update_stats_on_win(self, price, click_feedback):
        """Account for a won auction: charge the price and record the click."""
        self.wins += 1
        self.cost += price
        self.clicks += click_feedback

    def _record_evaluation(self, bid_request, pay_price, floor_price, click_feedback, verbose):
        """Process a single bid request and update the running statistics.

        Raises
        ------
        BudgetExtinguished
            If the amount already spent exceeds the budget. The check runs
            before bidding, so no bid is computed or recorded for a request
            that can no longer be served.
        """
        # 1: stop as soon as the budget has been overrun
        if self.cost > self.budget:
            self.budget_done = True
            raise BudgetExtinguished

        # 2: bid price for this bid request
        bid = self._bidding_function(bid_request, self.max_eCPC)
        self.bids.append(bid)

        # 3: auction winning check
        win = (bid > floor_price) and (bid > pay_price)
        # 4: on a win, apply the charged price and the user feedback
        if win:
            self._update_stats_on_win(pay_price, click_feedback)

        # keep track of the number of bid requests processed during evaluation
        self.bids_count += 1
        if verbose and self.bids_count % 100000 == 0:
            print(self.bids_count)

    def evaluate(self, test_df, verbose=False, budget_fraction=1):
        """Replay ``test_df`` in chronological order and return the run stats.

        Parameters
        ----------
        test_df : pandas.DataFrame
            Logged auctions. Read columns: ``timestamp``, ``payprice``,
            ``slotprice``, ``click``, ``weekday``, ``hour`` and every column
            whose name contains ``os``, ``browser``, ``domain``, ``city`` or
            ``region``. The frame is not modified.
        verbose : bool, default False
            Print a progress counter every 100000 requests and the final stats.
        budget_fraction : float, default 1
            Budget as a fraction of the total ``payprice`` of ``test_df``.

        Returns
        -------
        dict
            ``click``, ``wins``, ``win ratio `` (wins over all records of
            ``test_df``), ``CTR``, ``bid mean`` (mean of the bids actually
            placed), ``budget extinguished`` and ``CPC``.
        """
        # Every run starts from a clean slate, so repeated calls do not
        # accumulate the statistics of earlier runs.
        self._reset_counters()

        # budget initialization: amount spent in the real scenario * budget_fraction
        self.budget = test_df['payprice'].sum() * budget_fraction
        n_records = test_df.shape[0]

        # Sort the bid requests chronologically on a copy, leaving the
        # caller's frame in its original order.
        ordered = test_df.sort_values('timestamp')

        # Select feature columns by name fragment; several columns match a
        # fragment when one-hot encoding was used.
        columns = list(ordered.columns)
        domain_cols = [c for c in columns if 'domain' in c]
        os_cols = [c for c in columns if 'os' in c]
        browser_cols = [c for c in columns if 'browser' in c]
        city_cols = [c for c in columns if 'city' in c]
        region_cols = [c for c in columns if 'region' in c]
        other_cols = ['weekday', 'hour']
        # NOTE(review): an earlier comment stated the required order as
        # "click weekday hour useragent region city domain", which differs from
        # the order built here - confirm against the model's training columns.
        feature_cols = other_cols + os_cols + browser_cols + domain_cols + city_cols + region_cols
        bid_requests = ordered[feature_cols].to_dict('records')

        # Pair each request with its market (pay) price, floor price and click.
        auctions = zip(
            bid_requests,
            ordered['payprice'],   # i.e. market price, auction winning price
            ordered['slotprice'],  # floor price
            ordered['click'],
        )

        # A plain loop is used instead of np.vectorize: without `otypes`,
        # np.vectorize makes an extra probe call on the first element, which
        # would process the first auction twice.
        try:
            for bid_request, pay_price, floor_price, click_feedback in auctions:
                self._record_evaluation(bid_request, pay_price, floor_price, click_feedback, verbose)
        except BudgetExtinguished:
            print('budget extinguished')

        # output
        n_bids = len(self.bids)
        stats = {
            'click': self.clicks,
            'wins': self.wins,
            'win ratio ': self.wins / n_records if n_records else 0,
            'CTR': 0 if self.wins < 1 else self.clicks / self.wins,
            'bid mean': sum(self.bids) / n_bids if n_bids else 0,
            'budget extinguished': self.budget_done,
            'CPC': 0 if self.clicks < 1 else self.cost / self.clicks
        }
        if verbose:
            self._show_stats(stats)

        return stats

In [8]:
# Replay the test log with half of the budget that was actually spent on it.
evaluation = EvaluationFramework(train_df, CTR_model)
evaluation.evaluate(test_df, budget_fraction=0.5, verbose=True)

100000
200000
300000
400000
500000
600000
click : 257
wins : 281749
win ratio  : 0.4584
CTR : 0.0009
bid mean : 67.0075
budget extinguished : False
CPC : 50312.8249


{'click': 257,
 'wins': 281749,
 'win ratio ': 0.45839827670921746,
 'CTR': 0.000912159404292473,
 'bid mean': 67.00752644764425,
 'budget extinguished': False,
 'CPC': 50312.82490272373}