# <center>Advertising Exposure Estimation</center>




## Contents
* [Part 0: Loading](#Part-0-Loading)
* [Part 1: Preprocessing](#Part-1-Preprocessing)
* [Part 2: Feature Engineering](#Part-2-Feature-Engineering)
* [Part 3: Modeling](#Part-3-Modeling)
* [Part 4: Model Evaluation](#Part-4-Model-Evaluation)

## Part 0-Loading

In [2]:
# load packages
%matplotlib inline
import os
import pandas as pd
import numpy as np
import random
import gc
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide settings: quiet warnings, 3-decimal float display, and
# fixed seeds for both NumPy's and the stdlib's random generators.
SEED = 2020

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.3f}'.format)
for seed_fn in (np.random.seed, random.seed):
    seed_fn(SEED)


### load dataset

In [3]:
# Load the raw exposure log (tab-separated, no header row), order it by
# request time, fix the column dtypes and cache the result as a pickle.
exposure_int_cols = ['id', 'request_timestamp', 'position', 'uid', 'aid', 'imp_ad_size']
exposure_float_cols = ['bid', 'pctr', 'quality_ecpm', 'totalEcpm']

# The file's column order is the integer columns followed by the float columns.
df = pd.read_csv('totalExposureLog.out', sep='\t',
                 names=exposure_int_cols + exposure_float_cols)
df = df.sort_values(by='request_timestamp')

# Convert data types.
df[exposure_int_cols] = df[exposure_int_cols].astype(int)
df[exposure_float_cols] = df[exposure_float_cols].astype(float)

# Serialize for the preprocessing step.
df.to_pickle('totalExposureLog.pkl')

# Release the frame before the next dataset is loaded.
del df
gc.collect()

OSError: [Errno 28] No space left on device

In [None]:
# load ad_static_feature dataset
def _to_int_or_missing(value):
    """Return ``int(value)``, or -1 when the value is not a valid integer.

    Only the errors ``int()`` raises for bad input are caught, so that
    unrelated failures (and Ctrl-C) are no longer silently turned into -1.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return -1


df = pd.read_csv('ad_static_feature.out', sep='\t', names=['aid','create_timestamp','advertiser','good_id','good_type','ad_type_id','ad_size']).sort_values(by='create_timestamp')
# Missing cells become -1, the same marker used for unparsable values below.
df = df.fillna(-1)
for col in ['aid','create_timestamp','advertiser','good_id','good_type','ad_type_id']:
    # Cells that cannot be read as a single integer are mapped to -1.
    df[col] = [_to_int_or_missing(item) for item in df[col].values]
    df[col] = df[col].astype(int)
# ad_size may hold several comma-separated sizes; normalise each one to an
# integer string and store them space-separated.
df['ad_size'] = df['ad_size'].apply(lambda x: ' '.join(str(int(float(y))) for y in str(x).split(',')))
df.to_pickle('ad_static_feature.pkl')
del df
gc.collect()

In [None]:
# load user_data
df = pd.read_csv('user_data', sep='\t',
                 names=['uid','age','gender','area','status','education','concuptionAbility','os','work','connectionType','behavior'])
# Missing cells become -1 so the integer cast below cannot fail on NaN.
df = df.fillna(-1)
# Cast the numeric profile columns in place. The column names must match the
# `names=` list above exactly: the previous code assigned to a differently
# spelled 'consuptionAbility', which either raised KeyError or created a
# stray duplicate column depending on the pandas version.
# NOTE(review): if downstream code expects the 'consuptionAbility' spelling,
# rename it consistently in both places.
user_int_cols = ['uid','age','gender','education','concuptionAbility','os','connectionType']
df[user_int_cols] = df[user_int_cols].astype(int)
for col in ['area','status','work','behavior']:
    # Multi-valued fields: comma-separated -> space-separated.
    # str() keeps this working for non-string cells (e.g. the -1 fill value
    # or a column pandas parsed as numeric), which have no .split().
    df[col] = df[col].apply(lambda x: ' '.join(str(x).split(',')))
df.to_pickle('user_data.pkl')
del df
gc.collect()

In [None]:
# Load the test samples (tab-separated, no header row), fill gaps with -1,
# fix the dtypes and cache the result as a pickle.
test_int_cols = ['id', 'aid', 'create_timestamp', 'ad_size', 'ad_type_id',
                 'good_type', 'good_id', 'advertiser']
test_other_cols = ['delivery_periods', 'crowd_direction', 'bid']

df = pd.read_csv('test_sample.dat', sep='\t', names=test_int_cols + test_other_cols)
df = df.fillna(-1)

df[test_int_cols] = df[test_int_cols].astype(int)
df['bid'] = df['bid'].astype(float)

df.to_pickle('test_sample.pkl')

# Release the frame before the next dataset is loaded.
del df
gc.collect()

In [None]:
# load ad_operation dataset
#
# Expands the ad operation log into one record per (aid, request_day):
# [aid, request_day, crowd_direction, delivery_periods]. Each record starts as
# a copy of the previous one for the same ad, so settings carry forward from
# day to day until an operation changes them.
#
# Fields read from each tab-separated line: line[0] = aid, line[1] = timestamp
# as 'YYYYmmddHHMMSS' (or '0'), line[3] = operation field code, line[4] = value.

# Forward-fill window, as day numbers (seconds since the epoch // 86400).
# NOTE(review): presumably the first day and one-past-the-last day of the
# modelling window (17974 is the validation day used in Part 1) -- confirm.
FILL_START_DAY = 17930
FILL_END_DAY = 17975

aids = []
with open('data/testA/ad_operation.dat','r') as f:
    for line in f:
        line=line.strip().split('\t')
        try:
            # February 30th is not a real date and strptime would reject it;
            # treat it as March 1st.
            if line[1]=='20190230000000':
                line[1]='20190301000000'
            if line[1]!='0':
                # time.mktime interprets the parsed time as *local* time, so
                # the day number depends on the machine's timezone.
                request_day=time.mktime(time.strptime(line[1], '%Y%m%d%H%M%S'))//(3600*24)
            else:
                request_day=0
        except (ValueError, OverflowError):
            # Unparsable timestamp: report it and carry on (best effort).
            # request_day keeps the value from the previous line in this case.
            print(line[1])

        if not aids:
            # Very first line: open the first ad with a day-0 record.
            aids.append([int(line[0]),0,"NaN","NaN"])
        elif aids[-1][0]!=int(line[0]):
            # A new ad starts: first carry the previous ad's last record
            # forward through the end of the window, then open a day-0 record
            # for the new ad.
            # NOTE(review): the last ad in the file never gets this
            # end-of-window fill because no later ad triggers it -- confirm
            # whether that is intended.
            for i in range(max(FILL_START_DAY,aids[-1][1]+1),FILL_END_DAY):
                aids.append(aids[-1].copy())
                aids[-1][1]=i
            aids.append([int(line[0]),0,"NaN","NaN"])
        elif request_day!=aids[-1][1]:
            # Same ad, different day: fill the gap days inside the window with
            # copies of the last record, then add a record for request_day.
            for i in range(max(FILL_START_DAY,aids[-1][1]+1),int(request_day)):
                aids.append(aids[-1].copy())
                aids[-1][1]=i
            aids.append(aids[-1].copy())
            aids[-1][1]=int(request_day)
        # Apply the operation to the current (latest) record.
        if line[3]=='3':
            aids[-1][2]=line[4]
        if line[3]=='4':
            aids[-1][3]=line[4]
ad_df = pd.DataFrame(aids)
ad_df.columns = ['aid','request_day','crowd_direction','delivery_periods']

## Part 1-Preprocessing

In [None]:
# Preprocess the exposure log and split it into training and validation data.
train_df = pd.read_pickle('totalExposureLog.pkl')
train_df['request_day'] = train_df['request_timestamp'] // (3600*24)

# Break every request timestamp into local-time weekday / hour / minute.
wday, hour, minute = [], [], []
for ts in tqdm(train_df['request_timestamp'].values, total=len(train_df)):
    local = time.localtime(ts)
    wday.append(local.tm_wday)
    hour.append(local.tm_hour)
    minute.append(local.tm_min)
train_df['wday'] = wday
train_df['hour'] = hour
train_df['minute'] = minute
# Half-hour slot within the day: hour * 2, plus 1 for the second half hour.
train_df['period_id'] = train_df['hour'] * 2 + train_df['minute'] // 30

# Intraday columns are only needed in the saved logs, not in the frames kept
# in memory.
intraday_cols = ['period_id', 'minute', 'hour']

# Validation set: the requests of day 17974, without the intraday columns.
dev_df = train_df[train_df['request_day'] == 17974].drop(columns=intraday_cols)

# Attach the number of impressions each ad received on each day, then save
# the log twice: days before 17973 for validation, the full log for test.
imp_counts = train_df.groupby(['aid','request_day']).size().reset_index(name='imp')
log = train_df.merge(imp_counts, on=['aid','request_day'], how='left')
log[log['request_day']<17973].to_pickle('user_log_dev.pkl')
log.to_pickle('user_log_test.pkl')
del log
del imp_counts
gc.collect()

train_df.drop(columns=intraday_cols, inplace=True)

In [None]:
# preprocessing train dataset


## Part 2-Feature Engineering

## Part 3-Modeling

In [22]:
# Scratch check: does the sample value fit in a signed 32-bit integer?
a = 1534236469
int32_min, int32_max = -(1 << 31), (1 << 31) - 1
if int32_min <= a <= int32_max:
    print('hhh')

hhh


## Part 4-Model Evaluation

In [19]:
5 // 2  # floor division of two ints already yields an int

2

In [26]:
# Translate each Roman symbol of `s` into its integer value
# (symbols missing from the table map to None).
s = 'IV'
roman_dict = dict(zip('IVXLCDM', (1, 5, 10, 50, 100, 500, 1000)))
roman = [roman_dict.get(symbol) for symbol in s]
roman

[1, 5]


In [36]:
# Sum the symbol values in `roman` (a list of ints, one per Roman symbol):
# a value that is smaller than its right-hand neighbour is subtracted
# (e.g. the I in IV), every other value is added.
ret = 0
for current, following in zip(roman, roman[1:]):
    if current < following:
        ret -= current
    else:
        ret += current
# The last symbol has no right-hand neighbour and is always added. Indexing
# with -1 (instead of the loop variable after the loop) also handles a
# single-symbol numeral, where the loop body never runs.
if roman:
    ret += roman[-1]
ret

4

In [38]:
# Quotient and remainder of 4 divided by 4.
n, m = 4 // 4, 4 % 4
print(n, m)

1 0


In [47]:
# Roman symbols, including the subtractive pairs, with their integer values.
roman_dict = {
    'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
    'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900,
}
a = 1994

# Quotient and remainder of `a` for every symbol value, in table order.
ret, mod = [], []
for value in roman_dict.values():
    quotient, remainder = divmod(a, value)
    ret.append(quotient)
    mod.append(remainder)
print(ret)
print(mod)

1994
[0, 4, 4, 44, 94, 494, 994, 2, 5, 34, 14, 394, 194]


In [64]:
# Index of the first occurrence of `b` inside `a` (-1 when absent).
a, b = 'flow', 'fl'
a.find(b)

0