# Accuracy Sweep (imp-als-implicit)

## Configuration

In [None]:
from pathlib import Path

In [None]:
# Root of the working tree; the cells below look for per-data-set
# subdirectories (and their sweep outputs) underneath it.
work_dir = Path('../work')

In [None]:
# Name of the sweep to analyze; it selects the `sweep-<sweep_name>` directory
# inside each data set's working directory.
sweep_name = 'imp-als-implicit'

In [None]:
# Suffix inserted into the test-file glob (`tune-*-test<data_sfx>.parquet`).
data_sfx = '-imp'

In [None]:
# Run attribute columns being swept: they are copied into the run metadata,
# used as grouping keys for the aggregate nDCG, and the first (and optional
# second) entry become the plot's x axis and hue.
attrs = ['factors']

## Environment Setup

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension and trigger a reload of changed modules.
# NOTE(review): a bare `%autoreload` reloads once, right now; if continuous
# reloading was intended this should probably be `%autoreload 2` — confirm.
%load_ext autoreload
%autoreload

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
from lenskit.metrics import topn

## Collect Run Info

In [None]:
# Gather the run tables written by this sweep for every data set directory.
# The glob is materialized and sorted so that the row order of `runs` is
# reproducible (glob yields matches in arbitrary order) and so that an empty
# match is reported clearly instead of surfacing as pandas' opaque
# "No objects to concatenate" error.
run_files = sorted(work_dir.glob(f'*/sweep-{sweep_name}/runs.parquet'))
if not run_files:
    raise FileNotFoundError(
        f'no runs.parquet found for sweep {sweep_name!r} under {work_dir}'
    )
runs = pd.concat((pd.read_parquet(rf) for rf in run_files), ignore_index=True)
# Make DataSet categorical; its categories are used later to enumerate the
# data sets to load.
runs['DataSet'] = runs.DataSet.astype('category')
runs.head()

In [None]:
# Per-run metadata (partition plus the swept attributes), keyed by the
# (DataSet, RunId) pair so it can be joined onto per-user results.
meta = runs[['DataSet', 'RunId', 'Partition', *attrs]].set_index(['DataSet', 'RunId'])
meta.head()

In [None]:
# The distinct data set names, taken from the categorical DataSet column.
datasets = runs['DataSet'].cat.categories
datasets

In [None]:
def ds_dir(ds):
    """Return the path ``work_dir / ds``, the working directory of data set *ds*."""
    return work_dir.joinpath(ds)

In [None]:
def sweep_dir(ds):
    """Return the ``sweep-<sweep_name>`` directory inside data set *ds*'s directory."""
    dataset_root = ds_dir(ds)
    return dataset_root / f'sweep-{sweep_name}'

## Load Test Data

Before we can compute normalized accuracy (nDCG), we need to load the test data and compute the ideal DCG for each user in each data set.

In [None]:
def load_and_summarize_test_file(ds, file):
    """Load one test parquet file and compute ideal DCGs from it.

    The frame is tagged with a ``DataSet`` column set to *ds*, trimmed to the
    ``DataSet``, ``user`` and ``item`` columns (plus ``rating`` when the file
    has one), and handed to ``topn.compute_ideal_dcgs``, whose result is
    returned unchanged.
    """
    test = pd.read_parquet(file).assign(DataSet=ds)
    has_rating = 'rating' in test.columns
    keep = ['DataSet', 'user', 'item'] + (['rating'] if has_rating else [])
    return topn.compute_ideal_dcgs(test[keep])

In [None]:
# Summarize every matching test file of every data set into one table of
# ideal DCGs, then key it by (DataSet, user) for the join further down.
test_pattern = f'tune-*-test{data_sfx}.parquet'
ideal_dcg = pd.concat(
    (
        load_and_summarize_test_file(ds, test_file)
        for ds in datasets
        for test_file in ds_dir(ds).glob(test_pattern)
    ),
    ignore_index=True,
).set_index(['DataSet', 'user'])
ideal_dcg.head()

## Load and Process Recommendation Lists

In [None]:
def load_recs(ds):
    """Read ``recommendations.parquet`` from *ds*'s sweep directory.

    The returned frame carries an extra ``DataSet`` column set to *ds*.
    """
    rec_file = sweep_dir(ds) / 'recommendations.parquet'
    return pd.read_parquet(rec_file).assign(DataSet=ds)

In [None]:
# Stack the recommendation lists of all data sets into a single frame.
recs = pd.concat(list(map(load_recs, datasets)), ignore_index=True)
recs.head()

Now we compute the per-user DCG:

In [None]:
# One DCG value per (DataSet, RunId, user), computed by applying topn.dcg to
# that group's `rating` values.
user_dcg = (
    recs.groupby(['DataSet', 'RunId', 'user'])['rating']
    .agg(topn.dcg)
    .reset_index(name='DCG')
)
user_dcg.head()

In [None]:
# Attach the run metadata (partition and swept attributes) to every per-user
# DCG row by looking up its (DataSet, RunId) pair in `meta`'s index.
run_key = ['DataSet', 'RunId']
user_dcg = user_dcg.join(meta, on=run_key)
user_dcg.head()

In [None]:
# Look up each row's ideal DCG by (DataSet, user) and normalize the achieved
# DCG by it to obtain nDCG.
user_ndcg = user_dcg.join(ideal_dcg, on=['DataSet', 'user']).assign(
    nDCG=lambda df: df['DCG'] / df['ideal_dcg']
)
user_ndcg.head()

In [None]:
# Mean nDCG for every combination of data set and swept attribute values,
# flattened back into ordinary columns for plotting.
agg_ndcg = user_ndcg.groupby(['DataSet', *attrs])['nDCG'].mean().reset_index()
agg_ndcg.head()

In [None]:
# Plot mean nDCG as a line per data set: the first swept attribute is the
# x axis and, when there is a second one, it is mapped to hue.
props = {'x': attrs[0]}
if attrs[1:]:
    props['hue'] = attrs[1]
sns.relplot(
    data=agg_ndcg,
    kind="line",
    col="DataSet",
    y="nDCG",
    **props,
)