In [2]:
!pip install catboost
from catboost import CatBoostRegressor
import pandas as pd
import numpy as np

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [102]:
users = pd.read_csv('https://drive.google.com/uc?id=1ManubAgRwSpGgVK9ac3e3spRfcrTROR7')
users.set_index('user_id')
users.head(3)

Unnamed: 0,user_id,sex,age,city_id
0,0,2,19,0
1,1,1,0,1
2,2,2,24,2


In [103]:
ad_views = pd.read_csv('https://drive.google.com/uc?id=1qkJccaSEwmwlJg6ubFe-evSwvq1FV5P9')
ad_views.head(3)

Unnamed: 0,hour,cpm,publisher,user_id
0,10,30.0,1,15661
1,8,41.26,1,8444
2,7,360.0,1,15821


In [104]:
def cpm_by_id(_id):
  return ad_views.loc[ad_views['user_id'] == _id, 'cpm'].mean()

def ads_by_id(_id):
  return len(ad_views[ad_views['user_id'] == _id])

users['mean_cpm'] = users['user_id'].apply(cpm_by_id)
users['ads_showed'] = users['user_id'].apply(ads_by_id)
users = users.fillna(users.mean())
users.head()

Unnamed: 0,user_id,sex,age,city_id,mean_cpm,ads_showed
0,0,2,19,0,2053.83,2
1,1,1,0,1,124.068049,82
2,2,2,24,2,274.701224,0
3,3,1,20,3,370.88,8
4,4,2,29,4,44.627955,132


In [105]:
def cpm_by_ids(ids):
  _ids = list(map(int, ids.split(',')))
  return users.loc[users['user_id'].isin(_ids), 'mean_cpm'].mean()

def ads_by_ids(ids):
  _ids = list(map(int, ids.split(',')))
  return users.loc[users['user_id'].isin(_ids), 'ads_showed'].mean()

X = pd.read_csv('https://drive.google.com/uc?id=151x4xSe1VMjUkKBtpsY8RYCxXnTQF8UE')
X['duration'] = X['hour_end'] - X['hour_start']
X['mean_user_cpm'] = X['user_ids'].apply(cpm_by_ids)
X['mean_user_ads'] = X['user_ids'].apply(ads_by_ids)
X = X.set_index('ad_id')
X = X.drop(['hour_start', 'hour_end', 'publishers', 'user_ids'], axis=1)
X_train, X_rest = X[:700], X[700:]
X.head()

Unnamed: 0_level_0,cpm,audience_size,duration,mean_user_cpm,mean_user_ads
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,220.0,1906,95,293.479033,40.567156
1,312.0,1380,6,179.636671,128.99058
2,70.0,888,20,302.581066,43.977477
3,240.0,440,82,269.411238,38.665909
4,262.0,1476,238,354.247404,35.52439


In [106]:
y = pd.read_csv('https://drive.google.com/uc?id=1HCLLCz6RkzqJutmYf_h98aLb1--Bdb2T')
y = y.set_index('ad_id')
y.head()

Unnamed: 0_level_0,at_least_one
ad_id,Unnamed: 1_level_1
0,0.043
1,0.013
2,0.0878
3,0.2295
4,0.3963


In [118]:
model = CatBoostRegressor()
# Fit model
model.fit(X_train, y)
# Get predictions
preds = model.predict(X_rest)

Learning rate set to 0.038699
0:	learn: 0.1444579	total: 1.12ms	remaining: 1.12s
1:	learn: 0.1420976	total: 2.36ms	remaining: 1.18s
2:	learn: 0.1400663	total: 3.53ms	remaining: 1.17s
3:	learn: 0.1381898	total: 4.68ms	remaining: 1.17s
4:	learn: 0.1362782	total: 5.89ms	remaining: 1.17s
5:	learn: 0.1342770	total: 7.04ms	remaining: 1.17s
6:	learn: 0.1325793	total: 8.21ms	remaining: 1.17s
7:	learn: 0.1307607	total: 9.41ms	remaining: 1.17s
8:	learn: 0.1290802	total: 10.6ms	remaining: 1.17s
9:	learn: 0.1273179	total: 11.8ms	remaining: 1.17s
10:	learn: 0.1257648	total: 13ms	remaining: 1.17s
11:	learn: 0.1243203	total: 14.2ms	remaining: 1.17s
12:	learn: 0.1232018	total: 15.4ms	remaining: 1.17s
13:	learn: 0.1219454	total: 16.5ms	remaining: 1.16s
14:	learn: 0.1206999	total: 17.7ms	remaining: 1.16s
15:	learn: 0.1195925	total: 18.8ms	remaining: 1.16s
16:	learn: 0.1182803	total: 20ms	remaining: 1.16s
17:	learn: 0.1171260	total: 21.2ms	remaining: 1.16s
18:	learn: 0.1160464	total: 22.4ms	remaining: 1.

In [120]:
solution = {'ad_id': [i for i in range(700, 1008)], 'at_least_one': preds}
df_solution = pd.DataFrame(solution)
df_solution.tail()

Unnamed: 0,ad_id,at_least_one
303,1003,0.107381
304,1004,0.033781
305,1005,0.073661
306,1006,0.219258
307,1007,0.043138


In [122]:
df_solution.to_csv ('solution.csv', index = False, header=True)