# Util functions

In [2]:
import datetime as dt

import pandas as pd
import numpy as np
# Show NumPy floating-point values with 2 digits of precision when printed.
np.set_printoptions(precision=2)

### Read data

In [1]:
def create_dfs(sample_data=None, ret_test=True, clean=False, split_date=None, random_state=None):
    """
    Load the viewing data, optionally subsample it, and split it into train and test.

    Parameters
    ----------
    sample_data : int, optional
        Number of rows to draw at random (without replacement) from the loaded
        frame before splitting. ``None`` keeps every row.
    ret_test : bool
        When True the frame is split around ``split_date`` and the test part is
        aggregated per account. When False the whole frame is returned and the
        test part is ``None``.
    clean : bool
        When True read the pre-joined ``data/data_cleaned_joined.csv``.
        Otherwise read ``data/train_cleaned.csv`` and
        ``data/metadata_cleaned.csv`` and left-join them on ``asset_id``.
    split_date : date-like, optional
        Anything ``pd.Timestamp`` accepts; only its calendar date is used.
        Defaults to 2021-03-10.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so that a subsample can be reproduced.

    Returns
    -------
    tuple
        ``(df, df_test)``. ``df`` holds the rows whose ``tunein`` date is
        strictly before the split date. ``df_test`` is a Series indexed by
        ``account_id`` whose values are lists of the ``content_id`` seen
        strictly after the split date, most frequent first.
    """
    if clean:
        df = pd.read_csv('data/data_cleaned_joined.csv', parse_dates=['tunein', 'end_vod_date'])
    else:
        df_train = pd.read_csv('data/train_cleaned.csv', parse_dates=['tunein', 'tuneout'])
        df_meta = pd.read_csv('data/metadata_cleaned.csv', parse_dates=['end_vod_date'])
        df = pd.merge(df_train, df_meta, how='left', on='asset_id')

    if sample_data is not None:
        df = df.sample(n=sample_data, random_state=random_state)

    if not ret_test:
        return df, None

    if split_date is None:
        filter_date = dt.date(2021, 3, 10)
    else:
        filter_date = pd.Timestamp(split_date).date()

    # Compute the calendar dates once; both masks below reuse them.
    tunein_dates = df.tunein.dt.date

    # NOTE(review): both comparisons are strict, so rows dated exactly on the
    # split date end up in neither set. Kept as in the original behaviour --
    # confirm whether that gap day is intentional.
    test_rows = df[tunein_dates > filter_date]
    train_rows = df[tunein_dates < filter_date]

    # value_counts() sorts by frequency, so each list is ordered most-viewed first.
    df_test = test_rows.groupby(['account_id'])['content_id'].agg(
        lambda ids: ids.value_counts().index.values.tolist()
    )

    return train_rows, df_test

### Write results

In [6]:
def write_submit(submit, file=None, check_diff=True, n_recs=20):
    """
    Validate a Series of recommendation lists and write it out as a header-less CSV.

    Parameters
    ----------
    submit : pandas.Series
        One list of recommended ids per index label.
    file : optional
        Target handed to ``Series.to_csv``. ``None`` writes to ``submit.csv``.
    check_diff : bool
        When True also require that the ids inside each list are all distinct.
    n_recs : int
        Exact number of recommendations every row must contain (default 20).

    Raises
    ------
    AssertionError
        If any row does not hold exactly ``n_recs`` items, or (with
        ``check_diff``) holds repeated items.
    """
    # Raised explicitly rather than via ``assert`` so the checks still run
    # under ``python -O``; the exception type stays AssertionError.
    if (submit.map(len) != n_recs).any():
        raise AssertionError(f"Row with non {n_recs} recomendations")
    if check_diff and (submit.map(lambda recs: len(set(recs))) != n_recs).any():
        raise AssertionError("Row with repeated recomendations")

    # Cast every id to a plain int so the CSV shows e.g. 12 rather than 12.0.
    as_ints = submit.map(lambda recs: [int(rec) for rec in recs])

    target = 'submit.csv' if file is None else file
    as_ints.to_csv(target, header=False)

### Mean Average Precision

In [3]:
def avg_precision(y_true, y_pred):
    """
    Compute the average precision of each row shared by ``y_true`` and ``y_pred``.

    Both arguments are pandas Series whose values are sequences of items:
    ``y_true`` holds the relevant items, ``y_pred`` the ranked predictions.
    The two are inner-joined on their index, so labels present in only one of
    them are left out of the result. Neither input Series is modified.

    For one row, every predicted item found in the relevant set is a hit. The
    score is the mean over hits of ``(hits so far) / (1-based rank of the
    hit)``; it is divided by the number of hits, not by the number of relevant
    items. A row without hits scores 0.

    Items must be hashable, because the relevant items are put into a set.

    Returns
    -------
    pandas.Series
        Float scores indexed by the shared index labels (empty when the two
        inputs share no label).
    """

    def _row_ap(relevant, ranked):
        relevant_set = set(relevant)  # O(1) membership instead of scanning a list
        hit_ranks = [rank for rank, item in enumerate(ranked, start=1) if item in relevant_set]
        if not hit_ranks:
            return 0.0
        precisions = [n_hits / rank for n_hits, rank in enumerate(hit_ranks, start=1)]
        return sum(precisions) / len(hit_ranks)

    # rename() returns new Series, so the caller's objects keep their own names.
    df_preds = pd.merge(
        y_true.rename("y_true"),
        y_pred.rename("y_pred"),
        how='inner',
        left_index=True,
        right_index=True,
    )

    scores = [_row_ap(rel, rank) for rel, rank in zip(df_preds["y_true"], df_preds["y_pred"])]
    return pd.Series(scores, index=df_preds.index, dtype=float)

In [4]:
def mean_avg_precision(y_true, y_pred, check_diff=True, n_recs=20):
    """
    Validate ``y_pred`` and return the mean of the per-row average precision.

    Parameters
    ----------
    y_true : pandas.Series
        Relevant items per index label.
    y_pred : pandas.Series
        Ranked recommendation list per index label.
    check_diff : bool
        When True also require that the items inside each prediction list are
        all distinct.
    n_recs : int
        Exact number of recommendations every ``y_pred`` row must contain
        (default 20).

    Returns
    -------
    float
        Arithmetic mean of the scores returned by ``avg_precision``.

    Raises
    ------
    AssertionError
        If any ``y_pred`` row does not hold exactly ``n_recs`` items, or (with
        ``check_diff``) holds repeated items.
    ValueError
        If ``avg_precision`` yields no scores, so there is nothing to average.
    """
    # Raised explicitly rather than via ``assert`` so the checks still run
    # under ``python -O``; the exception type stays AssertionError.
    if (y_pred.map(len) != n_recs).any():
        raise AssertionError(f"Row with non {n_recs} recomendations")
    if check_diff and (y_pred.map(lambda recs: len(set(recs))) != n_recs).any():
        raise AssertionError("Row with repeated recomendations")

    ap_list = avg_precision(y_true, y_pred)
    if len(ap_list) == 0:
        raise ValueError("No rows to score: y_true and y_pred produced no average precision values")
    return sum(ap_list) / len(ap_list)