Для данной задачи вам предстоит построить рекомендательную систему баннеров на основе логов просмотров и лайков.

Логи представлены четырьмя колонками:

user_id (идентификатор пользователя),<br />
item_id (идентификатор баннера),<br />
like (флаг понравился ли пользователю баннер),<br />
timestamp (unix время в секундах совершения действия).

Кроме того, для пользователей и баннеров имеются признаки размерностью 32.

Вам необходимо предсказать 20 баннеров для каждого пользователя. Качество решения будет оцениваться как доля "лайкнутых" пользователями баннеров из предложенного вами списка (top-20 accuracy).

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import f_classif, mutual_info_classif

In [4]:
import warnings
warnings.filterwarnings("ignore")

### Useful functions

In [5]:
def get_boxplot(column):
    """Draw and show a box plot of one numerical column.

    The plot is built from the module-level ``data`` frame: the values of
    ``column`` go on the y axis and are grouped along the x axis by the
    column named ``'1'``. Group means are marked (``showmeans=True``).

    column -- name of the column to plot; it is concatenated into the
              title, so it has to be a string.
    """
    _, axes = plt.subplots(figsize=(14, 4))
    sns.boxplot(data=data, x='1', y=column, showmeans=True, ax=axes)
    # Tilt the x tick labels so they do not overlap.
    plt.xticks(rotation=45)
    title = 'Boxplot for ' + column
    axes.set_title(title)
    plt.show()
    
def vis_importance(xcol, ycol):
    """Show a bar chart of how a categorical feature relates to the target.

    Cross-tabulates two columns of the module-level ``data`` frame and
    plots the resulting counts as bars.

    xcol -- feature to explore (one bar group per distinct value).
    ycol -- target variable (one bar per distinct value in each group).
    """
    counts = pd.crosstab(data[xcol], data[ycol])
    counts.plot(kind='bar')
    plt.title(f'Default frequency for {xcol}')
    plt.xlabel(xcol)
    plt.ylabel('Frequency of default')
    plt.show()
    
def outliers_detection(data, col):
    """Print an IQR-based outlier report for ``data[col]``.

    Prints the column's min and max, then the outlier boundaries
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. If at least one non-missing
    value lies outside those boundaries, also prints the share of such
    rows among all rows of ``data``.

    data -- pandas DataFrame holding the column.
    col  -- name of the column to inspect.

    Returns None; the report goes to stdout.
    """
    values = data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    left = q1 - 1.5 * iqr
    right = q3 + 1.5 * iqr
    print('min: {0}, max: {1}'.format(values.min(), values.max()))
    print('outlier boundaries: [{0}, {1}]'.format(left, right))

    # between() is False for NaN, so missing values have to be excluded
    # explicitly. The same mask drives both the check and the ratio, so a
    # column whose only "out of range" rows are NaN prints no ratio line.
    is_outlier = ~values.between(left, right) & values.notna()
    n_outliers = int(is_outlier.sum())
    if n_outliers > 0:
        print('ratio of outliers: {}'.format(n_outliers / len(data)))

def drop_outliers(data, col):
    """Return ``data`` without the rows whose ``col`` value is an IQR outlier.

    A row is kept when its value lies inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds included) or is missing.
    The input frame is not modified.

    data -- pandas DataFrame to filter.
    col  -- name of the column the outlier rule is applied to.
    """
    series = data[col]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    lower = q1 - 1.5 * spread
    upper = q3 + 1.5 * spread
    # Missing values are never in range, so they are kept explicitly.
    keep = series.between(lower, upper) | series.isna()
    return data[keep]

### Load data

In [63]:
# Load the four input tables from the relative `input/` directory.
train = pd.read_csv('input/train_2.csv')  # training log
test = pd.read_csv('input/test_2.csv')  # users to make predictions for
user_features = pd.read_csv('input/user-features_2.csv')  # per-user features
item_features = pd.read_csv('input/item-features_2.csv')  # per-banner features

Описание файлов

test.csv — тестовый файл, содержащий идентификаторы пользователей, для которых необходимо сделать предсказания

train.csv — обучающий датасет

item-features.csv — признаки для баннеров

user-features — признаки для пользователей

sample-submission.csv — пример решения (сабмита).

In [7]:
train.shape, test.shape, user_features.shape, item_features.shape

((8674, 4), (497, 2), (497, 33), (444, 33))

In [87]:
test.info(), train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   user_id    497 non-null    int64
 1   timestamp  497 non-null    int64
dtypes: int64(2)
memory usage: 7.9 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8674 entries, 0 to 8673
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   user_id    8674 non-null   int64
 1   item_id    8674 non-null   int64
 2   like       8674 non-null   int64
 3   timestamp  8674 non-null   int64
dtypes: int64(4)
memory usage: 271.2 KB


(None, None)

### Data Analysis

In [64]:
# Attach the user features, then the banner features, to every log row.
# Both joins are left joins, so every row of `train` is kept; the feature
# columns share names, so pandas suffixes them `_x` (user) and `_y` (banner).
train_full = (
    train
    .merge(user_features, how='left', on='user_id')
    .merge(item_features, how='left', on='item_id')
)

In [65]:
train_full.sample(5)

Unnamed: 0,user_id,item_id,like,timestamp,0_x,1_x,2_x,3_x,4_x,5_x,...,22_y,23_y,24_y,25_y,26_y,27_y,28_y,29_y,30_y,31_y
2337,203,123,0,1491009229,0.001592,-0.003604,-0.003368,0.004778,-0.001141,0.00157,...,-0.008651,-0.00144,0.002312,-0.002225,-0.004108,-0.004108,0.000871,-0.002408,-0.002408,0.000613
2819,481,5,1,1491023907,0.001738,-0.003933,-0.003675,0.005213,-0.001244,0.001713,...,-0.01652,-0.002749,0.004415,-0.00425,-0.007845,-0.007845,0.001663,-0.004599,-0.004599,0.001171
5808,205,143,0,1491119032,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,...,-0.005551,-0.000924,0.001484,-0.001428,-0.002636,-0.002636,0.000559,-0.001545,-0.001545,0.000393
170,328,2,0,1490941634,0.004298,-0.009729,-0.009091,0.012897,-0.003078,0.004237,...,-0.011299,-0.00188,0.00302,-0.002907,-0.005365,-0.005365,0.001137,-0.003145,-0.003145,0.000801
7716,34,12,0,1491180902,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.00021


### Feature Engineering

In [67]:
# Convert the integer `timestamp` column (Unix time in seconds, hence
# unit='s') to pandas datetimes, stored in a new `created` column.

train_full['created'] = pd.to_datetime(train_full['timestamp'],unit='s')

In [88]:
# Create new date features from the `created` datetime column.

train_full['created_month'] = train_full['created'].dt.month
train_full['created_day'] = train_full['created'].dt.day
train_full['created_dayofweek'] = train_full['created'].dt.dayofweek
# NOTE: the datetime is rounded to the NEAREST hour before the hour is taken,
# so times past the half hour count toward the following hour (23:40 gives
# hour 0). The day features above come from the unrounded value.
train_full['created_hour'] = train_full['created'].dt.round('H').dt.hour

def f(x):
    """Map an hour of the day to a part-of-day label.

    Each label covers a four-hour window whose upper bound is inclusive:
    up to 4 is 'Late Night', (4, 8] 'Early Morning', (8, 12] 'Morning',
    (12, 16] 'Noon', (16, 20] 'Eve' and (20, 24] 'Night'.

    Returns None when no window matches, i.e. for values above 24 and for
    NaN (every comparison with NaN is false).
    """
    # (inclusive upper bound, label) in ascending order: the first bound
    # that is not below x decides the label.
    windows = (
        (4, 'Late Night'),
        (8, 'Early Morning'),
        (12, 'Morning'),
        (16, 'Noon'),
        (20, 'Eve'),
        (24, 'Night'),
    )
    for upper, label in windows:
        if x <= upper:
            return label
    return None
    
    
# Label every row with its part of day, computed by f() from `created_hour`.
train_full['part_of_day'] = train_full['created_hour'].apply(f)

# Drop the raw time columns; only the features derived from them are kept.
train_full = train_full.drop(['created', 'timestamp'], axis=1)

In [89]:
train_full.sample(5)

Unnamed: 0,user_id,item_id,like,0_x,1_x,2_x,3_x,4_x,5_x,6_x,...,27_y,28_y,29_y,30_y,31_y,created_month,created_day,created_dayofweek,created_hour,part_of_day
5080,13,96,0,0.002802,-0.006341,-0.005925,0.008406,-0.002007,0.002762,0.003268,...,-0.003151,0.000668,-0.001847,-0.001847,0.00047,4,2,6,1,Late Night
2481,183,100,0,0.000919,-0.002081,-0.001944,0.002759,-0.000658,0.000906,0.001073,...,-0.003451,0.000732,-0.002023,-0.002023,0.000515,4,1,5,2,Late Night
3943,4,248,0,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,...,-0.001993,0.000422,-0.001168,-0.001168,0.000297,4,1,5,15,Noon
6713,235,403,0,0.000777,-0.001759,-0.001643,0.002332,-0.000557,0.000766,0.000906,...,-0.001409,0.000299,-0.000826,-0.000826,0.00021,4,2,6,16,Noon
3432,14,287,0,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,0.000573,...,-0.032385,-0.018243,-0.04696,-0.04696,0.025411,4,1,5,10,Morning


In [90]:
train_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8674 entries, 0 to 8673
Data columns (total 72 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            8674 non-null   int64  
 1   item_id            8674 non-null   int64  
 2   like               8674 non-null   int64  
 3   0_x                8674 non-null   float64
 4   1_x                8674 non-null   float64
 5   2_x                8674 non-null   float64
 6   3_x                8674 non-null   float64
 7   4_x                8674 non-null   float64
 8   5_x                8674 non-null   float64
 9   6_x                8674 non-null   float64
 10  7_x                8674 non-null   float64
 11  8_x                8674 non-null   float64
 12  9_x                8674 non-null   float64
 13  10_x               8674 non-null   float64
 14  11_x               8674 non-null   float64
 15  12_x               8674 non-null   float64
 16  13_x               8674 

In [None]:
# columns by types

# Numerical columns: the two ids followed by all 32 user features (`_x`)
# and all 32 banner features (`_y`). The names are generated rather than
# typed out so that no index can be skipped (a hand-written list had
# dropped '20_x' and '20_y').
num_col = (
    ['user_id', 'item_id']
    + [f'{i}_x' for i in range(32)]
    + [f'{i}_y' for i in range(32)]
)
# Categorical columns: the engineered date/time features.
cat_col = ['part_of_day', 'created_hour', 'created_dayofweek', 'created_day', 'created_month']
# Binary columns: the target flag.
bin_col = ['like']

In [None]:
# The solution is scored with the Top-K Accuracy metric, where k = 20. Code:

def calc_score(test_choices, pred_choices, tk):
    """Return the share of ground-truth items found in their predictions.

    The two sequences are walked in lockstep; a pair counts as a hit when
    the ground-truth value is contained in the matching prediction.

    test_choices -- sized sequence of ground-truth items, one per case.
    pred_choices -- iterable of predicted collections, aligned with
                    ``test_choices``.
    tk           -- the K of the metric. It is accepted for interface
                    compatibility but not used: every prediction is
                    checked in full, so callers must pass lists that
                    already hold at most K items.

    Returns the hit count divided by ``len(test_choices)``, or 0.0 when
    ``test_choices`` is empty (instead of raising ZeroDivisionError).
    """
    total = len(test_choices)
    if total == 0:
        return 0.0
    hits = sum(int(gt in p) for gt, p in zip(test_choices, pred_choices))
    return hits / total